In [None]:
# Mount Google Drive at /content/drive so the dataset under
# /content/drive/MyDrive can be read later in the notebook.
# This only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

In [None]:
# Load the tab-separated reviews file from the mounted Drive.
# quoting=3 is passed through unchanged so quote characters inside a review
# are treated as ordinary text rather than field delimiters.
df = pd.read_csv(
    '/content/drive/MyDrive/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
def pre_process(text):
    """Normalise one review string before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. replace every tag-like span matched by ``tag_pattern`` with a
         placeholder;
      3. collapse each run of digits and/or non-word characters into a
         single space.

    Nothing is stripped afterwards, so a leading or trailing run of such
    characters leaves exactly one space at that end of the result.

    Args:
        text: the raw review (must be a ``str``; ``.lower()`` is called on it).

    Returns:
        The cleaned string.
    """
    tag_pattern = '&lt;/?.*?&gt;'
    junk_pattern = '(\\d|\\W)+'

    lowered = text.lower()
    # The placeholder inserted here still passes through the next
    # substitution, like any other part of the string.
    without_tags = re.sub(tag_pattern, ' &lt;&gt; ', lowered)
    return re.sub(junk_pattern, ' ', without_tags)

# Clean every review in place; the function is passed directly, no lambda
# wrapper is needed.
df['Review'] = df['Review'].apply(pre_process)
df.head()

Unnamed: 0,Review,Liked
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [None]:
# Vectorization: turn each cleaned review into a bag-of-words count vector.
from sklearn.feature_extraction.text import CountVectorizer

# English stop words are dropped. The vocabulary is learned from, and the
# count matrix built for, the whole Review column in a single call.
# NOTE(review): this happens before the train/test split, so the vocabulary
# also contains words that occur only in rows later used for testing.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(df['Review'])

# Target labels come from the Liked column.
y = df['Liked']

In [None]:
#Build Classifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 80/20 train/test split. The seed is fixed so that the split, and therefore
# every accuracy figure and confusion matrix computed from it, is identical
# on each Restart & Run All. Without random_state the test rows change on
# every execution and the results cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=42
)

In [None]:
#Find the best value of C in logistic regression
# Candidates are compared with 5-fold cross-validation on the training split
# only. Choosing C by its accuracy on X_test would leak the test set into
# model selection and make any later test-set score optimistically biased.
from sklearn.model_selection import cross_val_score

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    fold_scores = cross_val_score(
        LogisticRegression(C=c), X_train, y_train, cv=5, scoring='accuracy'
    )
    # Report the mean accuracy across the validation folds for this C.
    print('Mean CV accuracy for C=%s: %s' % (c, fold_scores.mean()))

Accuracy for C=0.01: 0.735
Accuracy for C=0.05: 0.745
Accuracy for C=0.25: 0.76
Accuracy for C=0.5: 0.765
Accuracy for C=1: 0.765


In [None]:
#Here I choose C=1 to build the final model.
# Fit on the training split only. Fitting on the full (X, y) would let the
# model train on the very rows of X_test it is scored on in the next line,
# so the printed accuracy would be inflated rather than a held-out estimate.
final_model = LogisticRegression(C=1)
final_model.fit(X_train, y_train)
print('Final Model Accuracy: %s' %accuracy_score(y_test, final_model.predict(X_test)))

Final Model Accuracy: 0.925


In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the final model on the held-out test rows
# (true labels vs. the model's predictions).
confusion_matrix(y_true=y_test, y_pred=final_model.predict(X_test))

array([[94,  7],
       [ 8, 91]])

In [None]:
# Sum all entries of the confusion matrix. The value is computed from the
# matrix itself instead of hand-copied numbers, so it stays correct when the
# notebook is re-run and the matrix changes.
int(confusion_matrix(y_test, final_model.predict(X_test)).sum())

200

In [None]:
# Accuracy recomputed by hand from the confusion matrix: add the main
# diagonal (top-left and bottom-right, i.e. the correct predictions) and
# divide by the sum of all entries. Computed from the matrix rather than
# hard-coded numbers so it always matches the current run.
conf_mat = confusion_matrix(y_test, final_model.predict(X_test))
float(conf_mat.trace() / conf_mat.sum())

0.925