In [0]:
#Natural Language Processing
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
#Importing Dataset
dataset = pd.read_csv('gdrive/My Drive/Data/Restaurant_Reviews.tsv',delimiter = '\t', quoting = 3)

#Cleaning the sets
**Steps**

1.   Keep only alphabets
2.   Convert to lowercase
3.   Convert string to list
4.   Remove stopwords
5.   Stemming (e.g. Convert loved, loving to love)
6.   Convert list back to string

In [0]:
#Cleaning the sets
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 

corpus = [] #Corpus is a collection of texts of same type
for i in range(0, 1000):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) #Kept only alphabets
  review = review.lower() #to lower case
  review = review.split() #string to list
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] #Removing stopwords like the, a, this
  review = ' '.join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Creating the Bag of Words Model
# Tokenization is process of taking all different words of the review and creating column for each words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [0]:
#We can use dimensionality reduction to reduce the number of dimensions

#Split dataset into train-test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

#FeatureScaling
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Fitting Naive Bayes to the dataset, Accuracy = 71%
'''
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
'''

#Fitting Decision Trees to the dataset, Accuracy = 71%
'''
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0) 
'''

#Fitting Random Forest Classifier to the dataset, Accuracy = 72%

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0) 


#Fitting Logistic Regression to the Training set, Accuracy = 75%
'''
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
'''
#Fitting k Nearest Neighbors to training set, Accuracy = 65%
'''
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
'''

#Fitting SVM to the dataset, Accuracy = 69%
'''
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
'''

#Fitting kernel SVM to the dataset, Accuracy = 73%
'''
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
'''

classifier.fit(X_train, y_train)

#Predicting test set results
y_pred = classifier.predict(X_test)

In [0]:
#Making the Confusion Matrix 
'''     Predicted
          0   1
Actual 0  TN  FP
Actual 1  FN  TP
'''
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
tn = cm[0][0]
fp = cm[0][1]
fn = cm[1][0]
tp = cm[1][1]
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2*precision*recall/(precision+recall)
print('Accuracy is',accuracy)
print('Precision is',precision)
print('Recall is',recall)
print('F1 Score is',f1)

[[87 10]
 [46 57]]
Accuracy is 0.72
Precision is 0.8507462686567164
Recall is 0.5533980582524272
F1 Score is 0.6705882352941177


In [0]:
#Combining three models to get more accuracy
Model used: RandomForest, DecisionTree and NaiveBayes

'''
def build_ensemble_classifier(X, y):
    classifier1 = RandomForestClassifier()
    classifier1.fit(X, y) 
    
    classifier2 = DecisionTreeClassifier()
    classifier2.fit(X, y)
 
    classifier3 = GaussianNB()
    classifier3.fit(X, y)
    
    return [classifier1, classifier2, classifier3]
 
def ensemble_classifier_predict(X, classifiers):
    results = [classifier.predict(X) for classifier in classifiers]
    processed_results = []
    
    for i in range(len(results[0])):
        votes = []
        
        for j in range(len(results)):
            votes.append(results[j][i])
            
        processed_results.append(decide_voting(votes))
                
    return np.array(processed_results)
 
def decide_voting(votes):
    zeros = 0
    ones = 0
    
    for vote in votes:
        if vote == 0:
            zeros += 1
        else:
            ones += 1
    
    return 1 if ones > zeros else 0
 
def get_model_accuracy(cm):
    return (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
 
classifiers = build_ensemble_classifier(X_train, y_train)
cm = confusion_matrix(y_test, ensemble_classifier_predict(X_test, classifiers))
 
print(f'accuracy: {get_model_accuracy(cm)}')
print(cm)
'''
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def models(X, y):
    clf1 = RandomForestClassifier()
    clf1.fit(X, y)
    
    clf2 = GaussianNB()
    clf2.fit(X, y)
    
    clf3 = DecisionTreeClassifier()
    clf3.fit(X, y)
    
    clf4 = LogisticRegression()
    clf4.fit(X, y)
    
    clf5 = KNeighborsClassifier()
    clf5.fit(X, y)
    
    clf6 = SVC()
    clf6.fit(X, y)

    return[clf1, clf2, clf3, clf4, clf5, clf6]

def prediction(X, clf):
    result = [classifier.predict(X) for classifier in clf]
   # print(result)
    y_pred = []
    for i in range(len(result[0])):
        zeros = 0
        ones = 0
        for j in range(len(result)):
            if result[j][i] == 0:
                zeros += 1
            else:
                ones += 1
        if zeros != 6 & zeros != 0:        
            print(zeros)        
        if ones >= zeros:
            y_pred.append(1)
        else:
            y_pred.append(0)

clf = models(X_train, y_train)
prediction(X_test, clf)
'''

"\nHello, I just naively combined multiple models into one based on voting and it produces more than 0.8 accuracy.\n\nModel used: RandomForest, DecisionTree and NaiveBayes\n\ndef build_ensemble_classifier(X, y):\n    classifier1 = RandomForestClassifier()\n    classifier1.fit(X, y) \n    \n    classifier2 = DecisionTreeClassifier()\n    classifier2.fit(X, y)\n \n    classifier3 = GaussianNB()\n    classifier3.fit(X, y)\n    \n    return [classifier1, classifier2, classifier3]\n \ndef ensemble_classifier_predict(X, classifiers):\n    results = [classifier.predict(X) for classifier in classifiers]\n    processed_results = []\n    \n    for i in range(len(results[0])):\n        votes = []\n        \n        for j in range(len(results)):\n            votes.append(results[j][i])\n            \n        processed_results.append(decide_voting(votes))\n                \n    return np.array(processed_results)\n \ndef decide_voting(votes):\n    zeros = 0\n    ones = 0\n    \n    for vote in votes