# NLP for Sentiment analysis

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


### Importing dataset

In [2]:
import csv

# The reviews file is tab-separated. csv.QUOTE_NONE (the named constant behind
# the former magic number 3) tells the parser to treat quote characters as
# ordinary text, so quotes inside a review do not affect field splitting.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)

In [3]:
# Number of rows (reviews) in the dataset.
dataset.shape[0]

1000

In [9]:
# Peek at the text of the first review (row 0, column 0).
dataset.iat[0, 0]

'Wow... Loved this place.'

### Cleaning the texts
##### Lowercase all text, strip punctuation (using a regular expression), remove stopwords such as determiners/articles, and apply stemming

In [25]:
import re
import nltk
# nltk.download('stopwords')  # Uncomment on first run; skipped here because the download takes time
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [32]:
# NOTE: list.remove() mutates the list in place and returns None, so this
# prints None rather than the stopword list without 'not'.
english_stopwords = stopwords.words('english')
print(english_stopwords.remove('not'))

None


In [50]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per review.

# Loop-invariant helpers are created once instead of once per review.
# `ps` and `custom_stopwords` keep their names because the cell that cleans
# new reviews further down reuses them.
ps = PorterStemmer()

# Some words included in the stopwords may be required by our corpus given the
# context ('not' flips the meaning of a review), hence remove them from the
# stopword list so they stay in the text.
custom_stopwords = stopwords.words('english')
custom_stopwords.remove('not')
stopword_set = set(custom_stopwords)  # built once; O(1) membership tests

non_letters = re.compile('[^a-zA-Z]')

corpus = []
# The review text is the first column. Selecting it with .iloc[:, 0] is purely
# positional, avoiding the deprecated integer lookup on a row Series (`row[0]`).
for raw_review in dataset.iloc[:, 0]:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split into words.
    words = non_letters.sub(' ', raw_review).lower().split()
    review = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(review))


In [36]:
# Show the first ten cleaned reviews.
print(corpus[:10])

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch']


### Create Bag of Words Model

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the vocabulary at the N most frequent words of the corpus.
# To choose N, first fit without the cap, check how many distinct words the
# whole corpus yields (see the next cell), then set the cap and rerun this cell.
cv = CountVectorizer(max_features=1500)

# fit_transform returns a sparse matrix; toarray() converts it to a dense
# array, which is what the GaussianNB model trained below works on.
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()

# The label is the last column of the dataset.
y = dataset.iloc[:, -1].values

In [48]:
# Number of columns in X, i.e. the size of the vocabulary kept by CountVectorizer
# (every review row has one entry per vocabulary word, not just its own words).
# Each entry is how many times that word occurs in the review (0 if absent).
# Without a cap this number was 1563; assuming 1500 features are enough, pass
# max_features=1500 to the CountVectorizer constructor and rerun the cell above.
X.shape[1]

1500

### Split data into train and test set

In [52]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split reproducible across runs.
splits = train_test_split(X, y, test_size=0.2, random_state=3)
X_train, X_test, y_train, y_test = splits

### Training Naive Bayes model

In [54]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
# fit() stays the last expression so the notebook still displays the estimator.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

GaussianNB()

### Predict Test results

In [55]:
# Predicted labels for the held-out test reviews.
preds = model.predict(X=X_test)

### Confusion Matrix

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Compare the predictions with the true test labels.
cm = confusion_matrix(y_test, preds)
accuracy = accuracy_score(y_test, preds)

print(cm)
print('Accuracy - ', accuracy)

[[55 32]
 [24 89]]
Accuracy -  0.72


### Predicting for new reviews

In [67]:
# Clean unseen reviews with the same steps used for the training corpus.
# Reuses `ps` (the stemmer) and `custom_stopwords` from the corpus-building cell.

reviews = ['I understand your filthy kitchen', 'Dirt everywhere', 'Cannot stand longer', 'I like it', 'I love how you managed to make fool of us.']

# Build the lookup set once instead of once per word.
stopword_set = set(custom_stopwords)

new_reviews = []
for raw_review in reviews:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split into words.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    stemmed = [ps.stem(word) for word in words if word not in stopword_set]
    new_reviews.append(' '.join(stemmed))

print(new_reviews)

['understand filthi kitchen', 'dirt everywher', 'cannot stand longer', 'like', 'love manag make fool us']


In [68]:
# Vectorise with the already-fitted CountVectorizer (transform, not
# fit_transform) so the new reviews use the training vocabulary, then predict.
new_X = cv.transform(new_reviews).toarray()
predicted_labels = model.predict(new_X)
print(predicted_labels)

[0 0 0 1 1]
