In [1]:
# Selecting the best classifier for bag of words model
#
# This cell loads the reviews, normalises each one into a string of stemmed,
# stopword-free tokens, and turns the resulting corpus into a dense
# bag-of-words matrix X with the label vector y.

import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Text Preprocessing
# Importing the dataset
# quoting=3 is csv.QUOTE_NONE: quote characters in the reviews are kept as
# ordinary text instead of being interpreted as field delimiters.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)

# Cleaning the texts
nltk.download('stopwords')

# Loop-invariant setup, built once instead of once per review / per word.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is deliberately kept as a token (presumably because negation matters
# for the review labels), so it is dropped from the stopword list.
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # O(1) membership test per word

corpus = []
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lowercase, and tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stopwords first, then stem the surviving words.
    review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

# Bag-of-words features, capped at 1500 vocabulary entries; converted to a
# dense array because the classifiers below are fitted on dense input.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
# The label is read from the last column of the dataset.
y = dataset.iloc[:, -1].values

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Model Selection
#
# Split the bag-of-words data once, then fit every candidate classifier on the
# same training portion so their test scores are comparable.

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2020
)


def _fit_knn(p):
    """Fit a 5-neighbour Minkowski KNN of order ``p`` on the training split."""
    knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=p)
    return knn.fit(X_train, y_train)


def _fit_svm(kernel):
    """Fit an SVC with the given ``kernel`` (seed 0) on the training split."""
    svm = SVC(kernel=kernel, random_state=0)
    return svm.fit(X_train, y_train)


# Logistic Regression
log_classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# KNN under three Minkowski orders:
# p=2 -> Euclidean, p=1 -> Manhattan, p=inf -> Chebyshev
euc_knn_classifier = _fit_knn(2)
man_knn_classifier = _fit_knn(1)
che_knn_classifier = _fit_knn(float('inf'))

# SVM with linear, RBF, polynomial and sigmoid kernels
lin_svm_classifier = _fit_svm('linear')
rbf_svm_classifier = _fit_svm('rbf')
poly_svm_classifier = _fit_svm('poly')
sigmoid_svm_classifier = _fit_svm('sigmoid')

# Naive Bayes
nb_classifier = GaussianNB().fit(X_train, y_train)

# Decision Tree
dt_classifier = DecisionTreeClassifier(
    criterion='entropy', random_state=0
).fit(X_train, y_train)

# Random Forest
rf_classifier = RandomForestClassifier(
    n_estimators=10, criterion='entropy', random_state=0
).fit(X_train, y_train)

# Trailing expression: the cell displays the fitted random forest.
rf_classifier

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [3]:
# Evaluating Model Performances on the test set
#
# Each fitted classifier is scored on the held-out X_test / y_test, and the
# results are printed as an aligned table.

# Logistic regression
log_accuracy = log_classifier.score(X_test, y_test)

# K-nearest neighbours (Euclidean, Manhattan, Chebyshev)
euc_knn_accuracy = euc_knn_classifier.score(X_test, y_test)
man_knn_accuracy = man_knn_classifier.score(X_test, y_test)
che_knn_accuracy = che_knn_classifier.score(X_test, y_test)

# Support vector machines (linear, RBF, polynomial, sigmoid kernels)
lin_svm_accuracy = lin_svm_classifier.score(X_test, y_test)
rbf_svm_accuracy = rbf_svm_classifier.score(X_test, y_test)
poly_svm_accuracy = poly_svm_classifier.score(X_test, y_test)
sigmoid_svm_accuracy = sigmoid_svm_classifier.score(X_test, y_test)

# Naive Bayes and tree-based models
nb_accuracy = nb_classifier.score(X_test, y_test)
dt_accuracy = dt_classifier.score(X_test, y_test)
rf_accuracy = rf_classifier.score(X_test, y_test)

# (label, score) rows; labels are pre-padded so the colons line up.
accuracy_report = (
    ('Logistic          : ', log_accuracy),
    ('Euclidean KNN     : ', euc_knn_accuracy),
    ('Manhattan KNN     : ', man_knn_accuracy),
    ('Chebyshev KNN     : ', che_knn_accuracy),
    ('Linear SVM        : ', lin_svm_accuracy),
    ('RBF SVM           : ', rbf_svm_accuracy),
    ('Polynomial SVM    : ', poly_svm_accuracy),
    ('Sigmoid SVM       : ', sigmoid_svm_accuracy),
    ('Naive Bayes       : ', nb_accuracy),
    ('Decision Tree     : ', dt_accuracy),
    ('Random Forest     : ', rf_accuracy),
)

print('Accuracy of the models :')
for label, accuracy in accuracy_report:
    print(label, accuracy)

Accuracy of the models :
Logistic          :  0.785
Euclidean KNN     :  0.66
Manhattan KNN     :  0.695
Chebyshev KNN     :  0.56
Linear SVM        :  0.78
RBF SVM           :  0.765
Polynomial SVM    :  0.665
Sigmoid SVM       :  0.755
Naive Bayes       :  0.695
Decision Tree     :  0.725
Random Forest     :  0.71
