<a href="https://colab.research.google.com/github/Vezhani/BelajarBigData/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [30]:
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [31]:
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [32]:
from sklearn.model_selection import train_test_split
X = df['Review']
y = df['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [33]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def text_process(document):
    document = re.sub('[^a-zA-Z]', ' ', document)
    document = document.lower()
    document = document.split()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    document = [word for word in document if not word in set(all_stopwords)]
    ps = PorterStemmer()
    document = [ps.stem(word) for word in document]
    return document

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
rf_param_grid = {
    'bag_of_words__ngram_range': [(1, 1)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    'estimator__criterion': ['gini'],
    'estimator__n_estimators': [100, 300]
}

nb_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    'estimator__alpha': [0.01, 1.0]
}

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

rf_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', RandomForestClassifier())
])

nb_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', MultinomialNB())
])

In [36]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(rf_pipe, rf_param_grid, verbose=2, cv=2)
rf_grid.fit(X_train, y_train)

nb_grid = GridSearchCV(nb_pipe, nb_param_grid, verbose=2, cv=2)
nb_grid.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.4s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.4s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=300; total time=   0.8s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=300; total time=   0.8s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__criterion=gini, estimator__n_estimators=100; total time=   0.4s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngr



[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.3s




[CV] END bag_of_words__max_df=0.85, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.01, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=0.01; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s
[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 1), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=0.01; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s




[CV] END bag_of_words__max_df=1.0, bag_of_words__min_df=0.05, bag_of_words__ngram_range=(1, 2), estimator__alpha=1.0; total time=   0.2s


In [37]:
rf_grid.best_params_

{'bag_of_words__max_df': 0.85,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__criterion': 'gini',
 'estimator__n_estimators': 100}

In [38]:
nb_grid.best_params_

{'bag_of_words__max_df': 0.85,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__alpha': 1.0}

In [39]:
rf_y_pred = rf_grid.predict(X_test)
nb_y_pred = nb_grid.predict(X_test)

In [40]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test, rf_y_pred).ravel()
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, rf_y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, rf_y_pred))
print('Precision:', metrics.precision_score(y_test, rf_y_pred))
print('Recall:', metrics.recall_score(y_test, rf_y_pred))
print('F1-Score:', metrics.f1_score(y_test, rf_y_pred))
print(metrics.classification_report(y_test, rf_y_pred))

Confusion matrix:
 [[92 22]
 [48 88]]
Accuracy: 0.72
Precision: 0.8
Recall: 0.6470588235294118
F1-Score: 0.7154471544715447
              precision    recall  f1-score   support

           0       0.66      0.81      0.72       114
           1       0.80      0.65      0.72       136

    accuracy                           0.72       250
   macro avg       0.73      0.73      0.72       250
weighted avg       0.73      0.72      0.72       250



In [41]:
from sklearn import metrics
tn, fp, fn, tp = metrics.confusion_matrix(y_test, nb_y_pred).ravel()
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, nb_y_pred))
print('Accuracy:', metrics.accuracy_score(y_test, nb_y_pred))
print('Precision:', metrics.precision_score(y_test, nb_y_pred))
print('Recall:', metrics.recall_score(y_test, nb_y_pred))
print('F1-Score:', metrics.f1_score(y_test, nb_y_pred))
print(metrics.classification_report(y_test, nb_y_pred))

Confusion matrix:
 [[89 25]
 [47 89]]
Accuracy: 0.712
Precision: 0.7807017543859649
Recall: 0.6544117647058824
F1-Score: 0.7120000000000001
              precision    recall  f1-score   support

           0       0.65      0.78      0.71       114
           1       0.78      0.65      0.71       136

    accuracy                           0.71       250
   macro avg       0.72      0.72      0.71       250
weighted avg       0.72      0.71      0.71       250



In [42]:
rf_grid.best_params_

{'bag_of_words__max_df': 0.85,
 'bag_of_words__min_df': 0.01,
 'bag_of_words__ngram_range': (1, 1),
 'estimator__criterion': 'gini',
 'estimator__n_estimators': 100}

In [43]:
rf_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process, max_df=1.0, min_df=0.01, ngram_range=(1,1))),
    ('tf_idf', TfidfTransformer()),
    ('estimator', RandomForestClassifier(n_estimators=300, criterion='gini'))
])

rf_pipe.fit(X_train, y_train)

In [45]:
feature_importance = pd.DataFrame(rf_pipe.steps[2][1].feature_importances_,
                                  rf_pipe.steps[0][1].get_feature_names_out(),
                                  columns=['importance'])
feature_importance.sort_values('importance', ascending=False).head(20)

Unnamed: 0,importance
great,0.076694
not,0.060838
good,0.047942
delici,0.031407
love,0.030635
amaz,0.025927
place,0.023569
nice,0.023222
friendli,0.019765
food,0.01802
