# Natural Language Processing

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session so library notices do not
# clutter the cell outputs.
# NOTE(review): this hides all warning categories, deprecation notices included;
# consider narrowing the filter to the specific warnings that are known noise.
warnings.filterwarnings('ignore')

## Importing the dataset

In [2]:
# Read the tab-separated reviews file into a DataFrame and preview the first rows.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# (rows, columns) of the loaded frame
dataset.shape

(1000, 2)

## Cleaning the texts

In [4]:
import re # Regular expressions
import nltk # Natural language ToolKit

In [5]:
# Fetch the stop-word corpus only when it is not already installed locally, so a
# Restart & Run All does not need network access on every run.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [6]:
from nltk.stem.porter import PorterStemmer
# One stemmer instance, reused for every word of every review in the cleaning loop.
ps = PorterStemmer()

In [7]:
# Build the stop-word lookup once. The previous version rebuilt the set from the
# NLTK word list for every single word of every review.
english_stopwords = set(stopwords.words('english'))

# corpus[k] is the cleaned text of the k-th review: letters only, lower-cased,
# stop words removed, remaining words stemmed and re-joined with single spaces.
corpus = []
for raw_review in dataset['Review']:  # iterate over all rows instead of a hard-coded 1000
    # Replace every non-letter character with a space, then tokenize on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # NOTE(review): the stop-word filter also removes "not", so negation is dropped
    # from the cleaned text; confirm this is acceptable for sentiment labels.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [8]:
# Sanity check: the cleaned reviews are collected in a plain Python list.
type(corpus)

list

In [9]:
# Sanity check: one cleaned string was appended per processed review.
len(corpus)

1000

In [10]:
# Cleaned form of the first review: lower-cased, punctuation stripped, stemmed.
corpus[0]

'wow love place'

In [11]:
# Cleaned form of the second review. The raw text is "Crust is not good.";
# stop-word removal drops "not", so the negation is lost in the cleaned string.
# NOTE(review): worth confirming this is intended for a sentiment task.
corpus[1]

'crust good'

## Creating the Bag-of-Words model

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: keep at most the 1500 most frequent terms of the corpus.
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split, so test reviews influence which terms are kept.
cv = CountVectorizer(max_features=1500)
cv.fit(corpus)
X = cv.transform(corpus).toarray()  # dense document-term count matrix

# Target vector: the second column of the dataset.
y = dataset.iloc[:, 1].to_numpy()

In [17]:
# (number of reviews, number of vocabulary terms kept by the vectorizer)
X.shape

(1000, 1500)

## Splitting the dataset into training and test sets

In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [24]:
# Report the dimensions of both feature matrices produced by the split.
for split_name, split_matrix in (('X_train', X_train), ('X_test', X_test)):
    print(f'{split_name} shape: {split_matrix.shape}')

X_train shape: (800, 1500)
X_test shape: (200, 1500)


## Classifiers and predictions

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Requires the third-party xgboost package (install with: %pip install xgboost)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

**Decision Tree**

In [26]:
# Decision tree baseline.
# The tree permutes candidate features while searching for splits, so equally good
# splits can be chosen differently on each run. Fixing random_state (same seed as
# the train/test split) makes the fitted tree and its metrics reproducible.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
y_dt = dt.predict(X_test)  # predicted labels for the held-out reviews

In [28]:
# Per-class precision/recall/F1 of the decision tree on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_dt))

              precision    recall  f1-score   support

           0       0.63      0.74      0.68        97
           1       0.71      0.58      0.64       103

    accuracy                           0.66       200
   macro avg       0.67      0.66      0.66       200
weighted avg       0.67      0.66      0.66       200



**K Nearest Neighbors**

In [29]:
# k-nearest-neighbours baseline: each test review is labelled by a vote among its
# 5 closest training reviews in bag-of-words space.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_knn = knn.predict(X_test)

In [31]:
# Per-class precision/recall/F1 of the k-NN model on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_knn))

              precision    recall  f1-score   support

           0       0.56      0.72      0.63        97
           1       0.64      0.46      0.53       103

    accuracy                           0.58       200
   macro avg       0.60      0.59      0.58       200
weighted avg       0.60      0.58      0.58       200



**Naive Bayes**

In [32]:
# Gaussian Naive Bayes baseline fitted on the dense term-count matrix.
nb = GaussianNB().fit(X_train, y_train)
y_nb = nb.predict(X_test)

In [33]:
# Per-class precision/recall/F1 of the Naive Bayes model on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_nb))

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



**XGBoost**

In [34]:
# Gradient-boosted trees with the library's default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)
y_xgb = xgb.predict(X=X_test)  # predicted labels for the held-out reviews



In [35]:
# Per-class precision/recall/F1 of the XGBoost model on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_xgb))

              precision    recall  f1-score   support

           0       0.67      0.87      0.75        97
           1       0.82      0.59      0.69       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.72      0.72       200

