#MOUNTING THE GOOGLE DRIVE TO COLAB

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset is later read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTING THE DATASET FROM DRIVE

In [None]:
# Importing the libraries
import numpy as np
import pandas as pd


In [None]:
# Importing the dataset
# The file is tab-separated; quoting = 3 is csv.QUOTE_NONE, so quote characters inside reviews are kept as plain text.
dataset = pd.read_csv('/content/drive/MyDrive/ibmskillbuild project/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [None]:
# Preview the first 10 rows of the raw data.
dataset.head(10)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [None]:
# Show 10 randomly chosen rows; no random_state is passed, so the selection differs on every run.
dataset.sample(10)

#TEXT CLEANING

In [None]:
# Download NLTK's stop-word corpus, which the stopwords.words('english') calls below read from.
import nltk
nltk.download('stopwords')

In [None]:
import string
from nltk.corpus import stopwords

In [None]:
# Display the English stop-word list that text_process filters against.
stopwords.words('english')

In [None]:
# Display the individual punctuation characters that text_process removes.
[punc for punc in string.punctuation]

In [None]:
def text_process(msg, stop_words=None):
  """Strip punctuation and stop words from a piece of text.

  Parameters
  ----------
  msg : str
      Raw review text.
  stop_words : collection of str, optional
      Lower-case words to drop. When omitted, NLTK's English stop-word list
      is used.

  Returns
  -------
  str
      The surviving words, in their original case, joined by single spaces.
  """
  if stop_words is None:
    # Load the list once per call and keep it as a set; reloading it for
    # every word (as before) made cleaning needlessly slow.
    stop_words = set(stopwords.words('english'))
  # Punctuation characters are deleted, not replaced by a space, so
  # "didn't" becomes "didnt".
  nopunc = ''.join(char for char in msg if char not in string.punctuation)
  return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)

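Creating a column that holds the cleaned review text (punctuation and stop words removed). Despite the column name, each entry is a single cleaned string, not a list of tokens.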

In [None]:
# Clean every review with text_process; each entry of the new column is one cleaned string.
dataset['tokenized_Review'] = dataset['Review'].apply(text_process)

In [None]:
# Preview the data again, now including the cleaned-text column.
dataset.head(10)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words counts: drop terms found in more than 90% of the reviews or in fewer than 10 reviews.
# NOTE(review): the vocabulary is fitted on the full dataset, before the train/test split below,
# so the test reviews influence which terms become features — consider fitting on the training split only.
vectorizer = CountVectorizer(max_df=0.9,min_df=10)
X = vectorizer.fit_transform(dataset['tokenized_Review']).toarray()

In [None]:
# Display the dense document-term count matrix.
X


In [None]:
# Hold out 20% of the cleaned reviews for testing; random_state pins the shuffle so the split is repeatable.
# Note that X_train / X_test are still text at this point; they are vectorized in a later cell.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(dataset['tokenized_Review'],dataset['Liked'],random_state=107,test_size=0.2)

In [None]:
# Peek at the first few cleaned training reviews.
X_train.head()

In [None]:
# Encode both splits with the vocabulary the vectorizer has already learned (transform only, no refit).
train_vectorized = vectorizer.transform(X_train)
test_vectorized = vectorizer.transform(X_test)

In [None]:
# Convert the vectorizer output to dense arrays; every classifier below is trained and evaluated on these.
X_train_array = train_vectorized.toarray()
X_test_array = test_vectorized.toarray()

#GAUSSIAN NB

In [None]:
# Fit a Gaussian Naive Bayes classifier (default settings) on the training counts.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train_array,y_train)

In [None]:
# Predict labels for both splits so training and testing scores can be compared.
y_train_preds_nb = nb.predict(X_train_array)
y_test_preds_nb = nb.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print training and testing scores for one classifier.

    Reports accuracy, precision, recall, F1 and ROC AUC (each as a percentage
    rounded to two decimals), followed by both confusion matrices.

    Parameters
    ----------
    actual_training, actual_testing
        True labels of the training / testing split.
    predicted_training, predicted_testing
        Labels predicted for the training / testing split.

    NOTE(review): the callers in this notebook pass hard class predictions, so
    the ROC AUC value is computed from labels rather than scores/probabilities.
    """
    scorers = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    # Evaluate everything first (training values, then testing values) so that
    # a metric that raises does so before any output has been printed.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_matrix = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_matrix = confusion_matrix(actual_testing, predicted_testing)

    rule = '\n=====================================================\n'
    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100, 2), "%")
        print('Testing {} is : '.format(label), round(test_value*100, 2), "%")
        print(rule)

    print('CONFUSION MATRIX')
    for caption, matrix in (('Training Confusion_matrix is : ', train_matrix),
                            ('Testing Confusion_matrix is : ', test_matrix)):
        print(caption)
        print(matrix)




In [None]:
# Print the training and testing metrics for the Gaussian NB model.
print_metrics(y_train,y_train_preds_nb , y_test,y_test_preds_nb)

ACCURACY SCORE
Training accuracy_score is :  76.38 %
Testing accuracy_score is :  72.5 %


PRECISION SCORE
Training precision_score is :  84.04 %
Testing precision_score is :  81.33 %


RECALL SCORE
Training recall_score is :  64.82 %
Testing recall_score is :  59.8 %


F1 SCORE
Training f1_score is :  73.19 %
Testing f1_score is :  68.93 %


ROC_AUC_SCORE
Training roc_auc_score is :  76.32 %
Testing roc_auc_score is :  72.76 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[353  49]
 [140 258]]
Testing Confusion_matrix is : 
[[84 14]
 [41 61]]


###PREDICT NEW REVIEW WITH THE MODEL

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label of a single raw review.

    The review is cleaned with text_process, the same function that produced
    the 'tokenized_Review' column the vectorizer was fitted on, so its words
    line up with the learned vocabulary. The earlier version used a separate
    regex + Porter-stemming pipeline; stems that differ from the original word
    (e.g. 'delicious' -> 'delici') are not in that vocabulary and were
    silently ignored at prediction time.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : fitted classifier, optional
        Any object with a ``predict`` method trained on the vectorizer's
        features. Defaults to the Gaussian NB model ``nb``.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label for the review.
    """
    if model is None:
        model = nb
    cleaned_review = text_process(sample_review)
    features = vectorizer.transform([cleaned_review]).toarray()
    return model.predict(features)


In [None]:
def predict_review(sample_review):
    """Print whether predict_sentiment judges the review positive or negative."""
    # A truthy prediction (label 1) is reported as positive, anything else as negative.
    verdict = 'this is a positive review' if predict_sentiment(sample_review) else 'this is a negative review'
    print(verdict)


# Try the Gaussian NB predictor on two hand-written reviews.
sample_review1  = 'Food is Decent bro'
sample_review2 = 'Hmm I like Lovely'
predict_review(sample_review1)
predict_review(sample_review2)


this is a negative review
this is a positive review


#MULTINOMIAL NB

In [None]:
# Multinomial NB

# Fitting Multinomial Naive Bayes to the training set.
# alpha is the additive smoothing parameter; 0.1 is the value the alpha sweep further down reports as best.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=0.1)
mnb.fit(X_train_array,y_train)

In [None]:
# Predict labels for both splits with the Multinomial NB model.
y_train_preds_mnb = mnb.predict(X_train_array)
y_test_preds_mnb = mnb.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print training and testing scores for one classifier.

    Reports accuracy, precision, recall, F1 and ROC AUC (each as a percentage
    rounded to two decimals), followed by both confusion matrices.

    Parameters
    ----------
    actual_training, actual_testing
        True labels of the training / testing split.
    predicted_training, predicted_testing
        Labels predicted for the training / testing split.

    NOTE(review): this repeats the print_metrics definition from the Gaussian
    NB section; a single definition would be enough.
    """
    scorers = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    # Evaluate everything first (training values, then testing values) so that
    # a metric that raises does so before any output has been printed.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_matrix = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_matrix = confusion_matrix(actual_testing, predicted_testing)

    rule = '\n=====================================================\n'
    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100, 2), "%")
        print('Testing {} is : '.format(label), round(test_value*100, 2), "%")
        print(rule)

    print('CONFUSION MATRIX')
    for caption, matrix in (('Training Confusion_matrix is : ', train_matrix),
                            ('Testing Confusion_matrix is : ', test_matrix)):
        print(caption)
        print(matrix)

In [None]:
# Print the training and testing metrics for the Multinomial NB model.
print_metrics(y_train,y_train_preds_mnb , y_test,y_test_preds_mnb)

ACCURACY SCORE
Training accuracy_score is :  77.0 %
Testing accuracy_score is :  75.5 %


PRECISION SCORE
Training precision_score is :  81.1 %
Testing precision_score is :  79.12 %


RECALL SCORE
Training recall_score is :  70.1 %
Testing recall_score is :  70.59 %


F1 SCORE
Training f1_score is :  75.2 %
Testing f1_score is :  74.61 %


ROC_AUC_SCORE
Training roc_auc_score is :  76.97 %
Testing roc_auc_score is :  75.6 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[337  65]
 [119 279]]
Testing Confusion_matrix is : 
[[79 19]
 [30 72]]


### Hyperparameter Tuning for Multinomial NB Classifier

In [None]:
# Sweep the MultinomialNB smoothing parameter alpha over 0.1, 0.2, ..., 1.0
# and report the value that gives the highest accuracy.
# NOTE(review): each candidate is scored on the test split, so the winning
# accuracy is an optimistic estimate.
best_accuracy, alpha_val = 0.0, 0.0
for i in np.arange(0.1,1.1,0.1):
    temp_classifier_MNB = MultinomialNB(alpha=i)
    temp_y_pred_MNB = temp_classifier_MNB.fit(X_train_array,y_train).predict(X_test_array)
    score_MNB = accuracy_score(y_test,temp_y_pred_MNB)

    print(f"Accuracy socre for Alpha={round(i,1)} is: {round(score_MNB*100,2)}%")
    # Strict '>' keeps the earliest alpha when several tie for the best score.
    if score_MNB>best_accuracy:
        best_accuracy, alpha_val = score_MNB, i
print('-------------------------------------------')
print(f'The best accuracy is {round(best_accuracy*100,2)}% with alpha value as {round(alpha_val,1)}')


Accuracy socre for Alpha=0.1 is: 75.5%
Accuracy socre for Alpha=0.2 is: 75.0%
Accuracy socre for Alpha=0.3 is: 74.0%
Accuracy socre for Alpha=0.4 is: 74.5%
Accuracy socre for Alpha=0.5 is: 74.5%
Accuracy socre for Alpha=0.6 is: 74.5%
Accuracy socre for Alpha=0.7 is: 74.5%
Accuracy socre for Alpha=0.8 is: 74.5%
Accuracy socre for Alpha=0.9 is: 74.5%
Accuracy socre for Alpha=1.0 is: 74.5%
-------------------------------------------
The best accuracy is 75.5% with alpha value as 0.1


###PREDICT NEW REVIEW WITH THE MULTINOMIAL NB model

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label of a single raw review.

    The review is cleaned with text_process, the same function that produced
    the 'tokenized_Review' column the vectorizer was fitted on, so its words
    line up with the learned vocabulary. The earlier version used a separate
    regex + Porter-stemming pipeline; stems that differ from the original word
    (e.g. 'delicious' -> 'delici') are not in that vocabulary and were
    silently ignored at prediction time.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : fitted classifier, optional
        Any object with a ``predict`` method trained on the vectorizer's
        features. Defaults to the Multinomial NB model ``mnb``.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label for the review.
    """
    if model is None:
        model = mnb
    cleaned_review = text_process(sample_review)
    features = vectorizer.transform([cleaned_review]).toarray()
    return model.predict(features)




def predict_review(sample_review):
    """Print whether predict_sentiment judges the review positive or negative."""
    # A truthy prediction (label 1) is reported as positive, anything else as negative.
    verdict = 'this is a positive review' if predict_sentiment(sample_review) else 'this is a negative review'
    print(verdict)


# Try the Multinomial NB predictor on two hand-written reviews.
sample_review1  = 'Samma da, food was very nice'
sample_review2 = 'Thuu, food is like shit'
predict_review(sample_review1)
predict_review(sample_review2)


this is a positive review
this is a negative review


# BERNOULLI NAIVE BAYES

In [None]:
# Bernoulli NB

# Fitting Bernoulli Naive Bayes to the Training set
# alpha is the additive smoothing parameter; 0.3 is the value the alpha sweep further down reports as best.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB(alpha=0.3)
bnb.fit(X_train_array, y_train)

In [None]:
# Predict labels for both splits with the Bernoulli NB model.
y_train_preds_bnb = bnb.predict(X_train_array)
y_test_preds_bnb = bnb.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print training and testing scores for one classifier.

    Reports accuracy, precision, recall, F1 and ROC AUC (each as a percentage
    rounded to two decimals), followed by both confusion matrices.

    Parameters
    ----------
    actual_training, actual_testing
        True labels of the training / testing split.
    predicted_training, predicted_testing
        Labels predicted for the training / testing split.

    NOTE(review): this repeats the print_metrics definition from the Gaussian
    NB section; a single definition would be enough.
    """
    scorers = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    # Evaluate everything first (training values, then testing values) so that
    # a metric that raises does so before any output has been printed.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_matrix = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_matrix = confusion_matrix(actual_testing, predicted_testing)

    rule = '\n=====================================================\n'
    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100, 2), "%")
        print('Testing {} is : '.format(label), round(test_value*100, 2), "%")
        print(rule)

    print('CONFUSION MATRIX')
    for caption, matrix in (('Training Confusion_matrix is : ', train_matrix),
                            ('Testing Confusion_matrix is : ', test_matrix)):
        print(caption)
        print(matrix)

In [None]:
# Print the training and testing metrics for the Bernoulli NB model.
print_metrics(y_train,y_train_preds_bnb , y_test,y_test_preds_bnb)

ACCURACY SCORE
Training accuracy_score is :  77.25 %
Testing accuracy_score is :  76.0 %


PRECISION SCORE
Training precision_score is :  82.53 %
Testing precision_score is :  79.35 %


RECALL SCORE
Training recall_score is :  68.84 %
Testing recall_score is :  71.57 %


F1 SCORE
Training f1_score is :  75.07 %
Testing f1_score is :  75.26 %


ROC_AUC_SCORE
Training roc_auc_score is :  77.21 %
Testing roc_auc_score is :  76.09 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[344  58]
 [124 274]]
Testing Confusion_matrix is : 
[[79 19]
 [29 73]]


### hyperparameter Tuning for Bernoulli NB

In [None]:
# Sweep the BernoulliNB smoothing parameter alpha over 0.1, 0.2, ..., 1.0
# and report the value that gives the highest accuracy.
# NOTE(review): each candidate is scored on the test split, so the winning
# accuracy is an optimistic estimate.
best_accuracy, alpha_val = 0.0, 0.0
for i in np.arange(0.1,1.1,0.1):
    temp_classifier_BNB = BernoulliNB(alpha=i)
    temp_y_pred_BNB = temp_classifier_BNB.fit(X_train_array,y_train).predict(X_test_array)
    score_BNB = accuracy_score(y_test,temp_y_pred_BNB)

    print(f"Accuracy socre for Alpha={round(i,1)} is: {round(score_BNB*100,2)}%")
    # Strict '>' keeps the earliest alpha when several tie for the best score.
    if score_BNB>best_accuracy:
        best_accuracy, alpha_val = score_BNB, i
print('-------------------------------------------')
print(f'The best accuracy is {round(best_accuracy*100,2)}% with alpha value as {round(alpha_val,1)}')

Accuracy socre for Alpha=0.1 is: 75.5%
Accuracy socre for Alpha=0.2 is: 75.5%
Accuracy socre for Alpha=0.3 is: 76.0%
Accuracy socre for Alpha=0.4 is: 76.0%
Accuracy socre for Alpha=0.5 is: 76.0%
Accuracy socre for Alpha=0.6 is: 76.0%
Accuracy socre for Alpha=0.7 is: 75.5%
Accuracy socre for Alpha=0.8 is: 75.5%
Accuracy socre for Alpha=0.9 is: 75.5%
Accuracy socre for Alpha=1.0 is: 75.5%
-------------------------------------------
The best accuracy is 76.0% with alpha value as 0.3


### Predict New Review with this Model

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label of a single raw review.

    The review is cleaned with text_process, the same function that produced
    the 'tokenized_Review' column the vectorizer was fitted on, so its words
    line up with the learned vocabulary. The earlier version used a separate
    regex + Porter-stemming pipeline; stems that differ from the original word
    (e.g. 'delicious' -> 'delici') are not in that vocabulary and were
    silently ignored at prediction time.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : fitted classifier, optional
        Any object with a ``predict`` method trained on the vectorizer's
        features. Defaults to the Bernoulli NB model ``bnb``.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label for the review.
    """
    if model is None:
        model = bnb
    cleaned_review = text_process(sample_review)
    features = vectorizer.transform([cleaned_review]).toarray()
    return model.predict(features)




def predict_review(sample_review):
    """Print whether predict_sentiment judges the review positive or negative."""
    # A truthy prediction (label 1) is reported as positive, anything else as negative.
    verdict = 'this is a positive review' if predict_sentiment(sample_review) else 'this is a negative review'
    print(verdict)


# Try the Bernoulli NB predictor on two hand-written reviews.
sample_review1  = 'Samma da, food was very nice'
sample_review2 = 'Thuu, food is like shit'
predict_review(sample_review1)
predict_review(sample_review2)

this is a positive review
this is a negative review


#Complement Naive Bayes

In [None]:
# Complement NB
from sklearn.naive_bayes import ComplementNB

# Create a Complement Naive Bayes classifier.
# alpha is the additive smoothing parameter; 0.4 is the value the alpha sweep further down reports as best.
cnb = ComplementNB(alpha=0.4)

# Fit the model on the training data
cnb.fit(X_train_array, y_train)

In [None]:
# Predict labels for both the training and the test split with the Complement NB model.
y_train_preds_cnb = cnb.predict(X_train_array)
y_test_preds_cnb = cnb.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print training and testing scores for one classifier.

    Reports accuracy, precision, recall, F1 and ROC AUC (each as a percentage
    rounded to two decimals), followed by both confusion matrices.

    Parameters
    ----------
    actual_training, actual_testing
        True labels of the training / testing split.
    predicted_training, predicted_testing
        Labels predicted for the training / testing split.

    NOTE(review): this repeats the print_metrics definition from the Gaussian
    NB section; a single definition would be enough.
    """
    scorers = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    # Evaluate everything first (training values, then testing values) so that
    # a metric that raises does so before any output has been printed.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_matrix = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_matrix = confusion_matrix(actual_testing, predicted_testing)

    rule = '\n=====================================================\n'
    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100, 2), "%")
        print('Testing {} is : '.format(label), round(test_value*100, 2), "%")
        print(rule)

    print('CONFUSION MATRIX')
    for caption, matrix in (('Training Confusion_matrix is : ', train_matrix),
                            ('Testing Confusion_matrix is : ', test_matrix)):
        print(caption)
        print(matrix)

In [None]:
# Print the training and testing metrics for the Complement NB model.
print_metrics(y_train,y_train_preds_cnb , y_test,y_test_preds_cnb)

ACCURACY SCORE
Training accuracy_score is :  76.62 %
Testing accuracy_score is :  75.5 %


PRECISION SCORE
Training precision_score is :  80.4 %
Testing precision_score is :  78.49 %


RECALL SCORE
Training recall_score is :  70.1 %
Testing recall_score is :  71.57 %


F1 SCORE
Training f1_score is :  74.9 %
Testing f1_score is :  74.87 %


ROC_AUC_SCORE
Training roc_auc_score is :  76.59 %
Testing roc_auc_score is :  75.58 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[334  68]
 [119 279]]
Testing Confusion_matrix is : 
[[78 20]
 [29 73]]


###Hyperparameter Tuning for Complement NB

In [None]:
# Sweep the ComplementNB smoothing parameter alpha over 0.1, 0.2, ..., 1.0
# and report the value that gives the highest accuracy.
# NOTE(review): each candidate is scored on the test split, so the winning
# accuracy is an optimistic estimate.
best_accuracy, alpha_val = 0.0, 0.0
for i in np.arange(0.1,1.1,0.1):
    temp_classifier_CNB = ComplementNB(alpha=i)
    temp_y_pred_CNB = temp_classifier_CNB.fit(X_train_array,y_train).predict(X_test_array)
    score_CNB = accuracy_score(y_test,temp_y_pred_CNB)

    print(f"Accuracy socre for Alpha={round(i,1)} is: {round(score_CNB*100,2)}%")
    # Strict '>' keeps the earliest alpha when several tie for the best score.
    if score_CNB>best_accuracy:
        best_accuracy, alpha_val = score_CNB, i
print('-------------------------------------------')
print(f'The best accuracy is {round(best_accuracy*100,2)}% with alpha value as {round(alpha_val,1)}')

Accuracy socre for Alpha=0.1 is: 75.0%
Accuracy socre for Alpha=0.2 is: 75.0%
Accuracy socre for Alpha=0.3 is: 75.0%
Accuracy socre for Alpha=0.4 is: 75.5%
Accuracy socre for Alpha=0.5 is: 75.0%
Accuracy socre for Alpha=0.6 is: 74.5%
Accuracy socre for Alpha=0.7 is: 74.5%
Accuracy socre for Alpha=0.8 is: 74.5%
Accuracy socre for Alpha=0.9 is: 74.5%
Accuracy socre for Alpha=1.0 is: 75.0%
-------------------------------------------
The best accuracy is 75.5% with alpha value as 0.4


### PREDICTING THE NEW REVIEW WITH THIS MODEL

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label of a single raw review.

    The review is cleaned with text_process, the same function that produced
    the 'tokenized_Review' column the vectorizer was fitted on, so its words
    line up with the learned vocabulary. The earlier version used a separate
    regex + Porter-stemming pipeline; stems that differ from the original word
    (e.g. 'delicious' -> 'delici') are not in that vocabulary and were
    silently ignored at prediction time.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : fitted classifier, optional
        Any object with a ``predict`` method trained on the vectorizer's
        features. Defaults to the Complement NB model ``cnb``.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label for the review.
    """
    if model is None:
        model = cnb
    cleaned_review = text_process(sample_review)
    features = vectorizer.transform([cleaned_review]).toarray()
    return model.predict(features)




def predict_review(sample_review):
    """Print whether predict_sentiment judges the review positive or negative."""
    # A truthy prediction (label 1) is reported as positive, anything else as negative.
    verdict = 'this is a positive review' if predict_sentiment(sample_review) else 'this is a negative review'
    print(verdict)


# Try the Complement NB predictor on two hand-written reviews.
sample_review1  = 'Samma da, food was very nice'
sample_review2 = 'Thuu, food is like shit'
predict_review(sample_review1)
predict_review(sample_review2)

this is a positive review
this is a negative review


#LOGISTIC REGRESSION

In [None]:
# Import the Logistic Regression class
from sklearn.linear_model import LogisticRegression
# C is the inverse of the regularisation strength (smaller C = stronger regularisation).
lr = LogisticRegression(C=1.0)

# Fit the Logistic Regression model to the training data
lr.fit(X_train_array, y_train)


In [None]:
# Predict labels for both the training and the test split with the Logistic Regression model.
y_train_preds_lr = lr.predict(X_train_array)
y_test_preds_lr = lr.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print training and testing scores for one classifier.

    Reports accuracy, precision, recall, F1 and ROC AUC (each as a percentage
    rounded to two decimals), followed by both confusion matrices.

    Parameters
    ----------
    actual_training, actual_testing
        True labels of the training / testing split.
    predicted_training, predicted_testing
        Labels predicted for the training / testing split.

    NOTE(review): this repeats the print_metrics definition from the Gaussian
    NB section; a single definition would be enough.
    """
    scorers = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    # Evaluate everything first (training values, then testing values) so that
    # a metric that raises does so before any output has been printed.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_matrix = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_matrix = confusion_matrix(actual_testing, predicted_testing)

    rule = '\n=====================================================\n'
    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100, 2), "%")
        print('Testing {} is : '.format(label), round(test_value*100, 2), "%")
        print(rule)

    print('CONFUSION MATRIX')
    for caption, matrix in (('Training Confusion_matrix is : ', train_matrix),
                            ('Testing Confusion_matrix is : ', test_matrix)):
        print(caption)
        print(matrix)

In [None]:
# Print the training and testing metrics for the Logistic Regression model.
print_metrics(y_train,y_train_preds_lr , y_test,y_test_preds_lr)

ACCURACY SCORE
Training accuracy_score is :  78.75 %
Testing accuracy_score is :  77.5 %


PRECISION SCORE
Training precision_score is :  85.62 %
Testing precision_score is :  80.65 %


RECALL SCORE
Training recall_score is :  68.84 %
Testing recall_score is :  73.53 %


F1 SCORE
Training f1_score is :  76.32 %
Testing f1_score is :  76.92 %


ROC_AUC_SCORE
Training roc_auc_score is :  78.7 %
Testing roc_auc_score is :  77.58 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[356  46]
 [124 274]]
Testing Confusion_matrix is : 
[[80 18]
 [27 75]]


###Hyperparameter Tuning for Logistic Regression

In [None]:
# Sweep the LogisticRegression inverse regularisation strength C over
# 0.1, 0.2, ..., 1.0 and report the value that gives the highest accuracy.
# NOTE(review): each candidate is scored on the test split, so the winning
# accuracy is an optimistic estimate; a validation split or cross-validation
# would be sounder.
best_accuracy = 0.0
alpha_val = 0.0  # holds the best C; the name is carried over from the NB tuning cells
for i in np.arange(0.1,1.1,0.1):
    temp_classifier_LR = LogisticRegression(C=i)

    temp_classifier_LR.fit(X_train_array,y_train)

    temp_y_pred_LR = temp_classifier_LR.predict(X_test_array)

    score_LR = accuracy_score(y_test,temp_y_pred_LR)

    print("Accuracy socre for C={} is: {}%".format(round(i,1),round(score_LR*100,2)))
    # Compare this model's own score. The previous condition tested score_CNB,
    # a stale value left over from the Complement NB cell, so the "best" result
    # froze at the first C regardless of later, higher accuracies.
    if score_LR>best_accuracy:
        best_accuracy = score_LR
        alpha_val = i
print('-------------------------------------------')
print('The best accuracy is {}% with C value as {}'.format(round(best_accuracy*100,2),round(alpha_val,1)))

Accuracy socre for C=0.1 is: 75.5%
Accuracy socre for C=0.2 is: 76.5%
Accuracy socre for C=0.3 is: 77.0%
Accuracy socre for C=0.4 is: 77.0%
Accuracy socre for C=0.5 is: 77.0%
Accuracy socre for C=0.6 is: 77.0%
Accuracy socre for C=0.7 is: 77.0%
Accuracy socre for C=0.8 is: 77.5%
Accuracy socre for C=0.9 is: 77.5%
Accuracy socre for C=1.0 is: 77.5%
-------------------------------------------
The best accuracy is 75.5% with C value as 0.1


###PREDICTING WITH CUSTOM REVIEW WITH THIS MODEL

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label of a single raw review.

    The review is cleaned with text_process, the same function that produced
    the 'tokenized_Review' column the vectorizer was fitted on, so its words
    line up with the learned vocabulary. The earlier version used a separate
    regex + Porter-stemming pipeline; stems that differ from the original word
    (e.g. 'delicious' -> 'delici') are not in that vocabulary and were
    silently ignored at prediction time.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : fitted classifier, optional
        Any object with a ``predict`` method trained on the vectorizer's
        features. Defaults to the Logistic Regression model ``lr``.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted label for the review.
    """
    if model is None:
        model = lr
    cleaned_review = text_process(sample_review)
    features = vectorizer.transform([cleaned_review]).toarray()
    return model.predict(features)




def predict_review(sample_review):
    """Print whether predict_sentiment judges the review positive or negative."""
    # A truthy prediction (label 1) is reported as positive, anything else as negative.
    verdict = 'this is a positive review' if predict_sentiment(sample_review) else 'this is a negative review'
    print(verdict)


# Try the Logistic Regression predictor on two hand-written reviews.
sample_review1  = 'Mame, sapadu was fine, they could have kept more side dish'
sample_review2 = 'Dei sambar ah da was like dog shit'
predict_review(sample_review1)
predict_review(sample_review2)

this is a negative review
this is a negative review


# K NEAREST NEIGHBOUR (KNN)

In [None]:
# Import the KNeighborsClassifier class from scikit-learn
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier that uses the 3 nearest neighbours (n_neighbors=3)
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the KNN classifier to the training data
knn.fit(X_train_array, y_train)

In [None]:
# Predict labels for the training split with the KNN model
y_train_preds_knn = knn.predict(X_train_array)

# Predict labels for the held-out test split with the KNN model
y_test_preds_knn = knn.predict(X_test_array)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print train/test classification metrics and confusion matrices.

    Accuracy, precision, recall, F1 and ROC AUC are computed from the given
    labels and predictions for both splits, then printed as percentages
    rounded to two decimals, followed by both confusion matrices.

    Parameters
    ----------
    actual_training, predicted_training
        True and predicted labels for the training split.
    actual_testing, predicted_testing
        True and predicted labels for the testing split.
    """
    scorers = [
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    ]
    separator = '\n=====================================================\n'

    # Everything is computed before anything is printed, so a failing metric
    # raises without leaving partial output behind.
    train_values = [scorer(actual_training, predicted_training) for _, _, scorer in scorers]
    train_confusion = confusion_matrix(actual_training, predicted_training)
    test_values = [scorer(actual_testing, predicted_testing) for _, _, scorer in scorers]
    test_confusion = confusion_matrix(actual_testing, predicted_testing)

    for (heading, label, _), train_value, test_value in zip(scorers, train_values, test_values):
        print(heading)
        print('Training {} is : '.format(label), round(train_value*100,2), "%")
        print('Testing {} is : '.format(label), round(test_value*100,2), "%")
        print(separator)

    print('CONFUSION MATRIX')
    print('Training Confusion_matrix is : ')
    print(train_confusion)
    print('Testing Confusion_matrix is : ')
    print(test_confusion)

In [None]:
# Report train/test metrics for the KNN predictions
print_metrics(
    actual_training=y_train,
    predicted_training=y_train_preds_knn,
    actual_testing=y_test,
    predicted_testing=y_test_preds_knn,
)

ACCURACY SCORE
Training accuracy_score is :  81.25 %
Testing accuracy_score is :  72.0 %


PRECISION SCORE
Training precision_score is :  87.58 %
Testing precision_score is :  78.75 %


RECALL SCORE
Training recall_score is :  72.61 %
Testing recall_score is :  61.76 %


F1 SCORE
Training f1_score is :  79.4 %
Testing f1_score is :  69.23 %


ROC_AUC_SCORE
Training roc_auc_score is :  81.21 %
Testing roc_auc_score is :  72.21 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[361  41]
 [109 289]]
Testing Confusion_matrix is : 
[[81 17]
 [39 63]]


### Hyperparameter Tuning for KNN

In [None]:
# Try n_neighbors = 1..10 and keep the value with the highest test accuracy.
# NOTE(review): the best value is chosen on the test split, so the reported
# best accuracy is an optimistic estimate; consider a validation split or
# cross-validation.
best_accuracy = 0.0
alpha_val = 0.0  # despite its name, this holds the best n_neighbors found so far
for i in np.arange(1,11,1):
    temp_classifier_KNN = KNeighborsClassifier(n_neighbors=i)
    temp_classifier_KNN.fit(X_train_array,y_train)

    temp_y_pred_KNN = temp_classifier_KNN.predict(X_test_array)
    score_KNN = accuracy_score(y_test,temp_y_pred_KNN)

    print("Accuracy score for n_neighbors={} is: {}%".format(round(i,1),round(score_KNN*100,2)))
    # Strict '>' keeps the smallest n_neighbors when several values tie.
    if score_KNN>best_accuracy:
        best_accuracy = score_KNN
        alpha_val = i
print('-------------------------------------------')
print('The best accuracy is {}% with n_neighbors value as {}'.format(round(best_accuracy*100,2),round(alpha_val,1)))

Accuracy socre for n_neighbors=1 is: 67.5%
Accuracy socre for n_neighbors=2 is: 66.5%
Accuracy socre for n_neighbors=3 is: 72.0%
Accuracy socre for n_neighbors=4 is: 63.0%
Accuracy socre for n_neighbors=5 is: 67.5%
Accuracy socre for n_neighbors=6 is: 65.5%
Accuracy socre for n_neighbors=7 is: 67.0%
Accuracy socre for n_neighbors=8 is: 65.5%
Accuracy socre for n_neighbors=9 is: 69.5%
Accuracy socre for n_neighbors=10 is: 69.0%
-------------------------------------------
The best accuracy is 72.0% with n_neighbors value as 3


### PREDICTING NEW REVIEW WITH KNN

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label for a single raw review string.

    The review is reduced to letters only, lower-cased, stripped of English
    stop words, stemmed with ``PorterStemmer``, vectorized with the
    notebook-level ``vectorizer`` and passed to ``model.predict``.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : object with a ``predict`` method, optional
        Fitted classifier to use. Defaults to the notebook-level ``knn``
        model, so existing one-argument calls behave as before.

    Returns
    -------
    Whatever ``model.predict`` returns for the single vectorized review.
    """
    if model is None:
        model = knn

    # NOTE(review): the training text appears to be cleaned by text_process,
    # which does not stem; confirm that stemming here matches the vocabulary
    # `vectorizer` was fitted on, otherwise stemmed tokens may be ignored.
    letters_only = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review)

    # Build the stop-word set once rather than once per token.
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()
    final_review = ' '.join(
        ps.stem(word)
        for word in letters_only.lower().split()
        if word not in english_stopwords
    )

    temp = vectorizer.transform([final_review]).toarray()
    return model.predict(temp)




def predict_review(sample_review):
    """Print a one-line verdict for the review based on ``predict_sentiment``."""
    verdicts = {
        True: 'this is a positive review',
        False: 'this is a negative review',
    }
    print(verdicts[bool(predict_sentiment(sample_review))])


# Hand-written review; predict_review prints the predicted sentiment for it
sample_review_knn = 'I am going to be honest , the food was bad , sorry'
predict_review(sample_review_knn)


this is a negative review


# DECISION TREES

In [None]:
# Import the Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier


# Create a Decision Tree classifier with max_depth=11 and a fixed
# random_state=14
dt = DecisionTreeClassifier(max_depth=11, random_state=14)

# Fit the Decision Tree classifier to the training data
dt.fit(X_train_array, y_train)

In [None]:
# Predict labels for the training split first, then the test split,
# using the fitted decision tree for both.
y_train_preds_dt, y_test_preds_dt = (
    dt.predict(features) for features in (X_train_array, X_test_array)
)

In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

def print_metrics(actual_training,predicted_training , actual_testing, predicted_testing):
    """Print classification metrics for the training and testing splits.

    For each of accuracy, precision, recall, F1 and ROC AUC the training and
    testing values are printed as percentages rounded to two decimals; the
    two confusion matrices are printed last.

    Parameters
    ----------
    actual_training, predicted_training
        True and predicted labels for the training split.
    actual_testing, predicted_testing
        True and predicted labels for the testing split.
    """
    metric_table = (
        ('ACCURACY SCORE', 'accuracy_score', accuracy_score),
        ('PRECISION SCORE', 'precision_score', precision_score),
        ('RECALL SCORE', 'recall_score', recall_score),
        ('F1 SCORE', 'f1_score', f1_score),
        ('ROC_AUC_SCORE', 'roc_auc_score', roc_auc_score),
    )

    def evaluate(actual, predicted):
        # Scores in table order, then the confusion matrix.
        scores = [metric(actual, predicted) for _, _, metric in metric_table]
        return scores, confusion_matrix(actual, predicted)

    # Both splits are fully evaluated before the first line is printed.
    training_scores, training_confusion = evaluate(actual_training, predicted_training)
    testing_scores, testing_confusion = evaluate(actual_testing, predicted_testing)

    for position, (heading, label, _) in enumerate(metric_table):
        print(heading)
        print('Training {} is : '.format(label), round(training_scores[position]*100,2), "%")
        print('Testing {} is : '.format(label), round(testing_scores[position]*100,2), "%")
        print('\n=====================================================\n')

    print('CONFUSION MATRIX')
    print('Training Confusion_matrix is : ')
    print(training_confusion)
    print('Testing Confusion_matrix is : ')
    print(testing_confusion)

In [None]:
# Report train/test metrics for the decision tree predictions
print_metrics(
    actual_training=y_train,
    predicted_training=y_train_preds_dt,
    actual_testing=y_test,
    predicted_testing=y_test_preds_dt,
)

ACCURACY SCORE
Training accuracy_score is :  74.88 %
Testing accuracy_score is :  70.0 %


PRECISION SCORE
Training precision_score is :  94.17 %
Testing precision_score is :  87.5 %


RECALL SCORE
Training recall_score is :  52.76 %
Testing recall_score is :  48.04 %


F1 SCORE
Training f1_score is :  67.63 %
Testing f1_score is :  62.03 %


ROC_AUC_SCORE
Training roc_auc_score is :  74.76 %
Testing roc_auc_score is :  70.45 %


CONFUSION MATRIX
Training Confusion_matrix is : 
[[389  13]
 [188 210]]
Testing Confusion_matrix is : 
[[91  7]
 [53 49]]


### HyperParameter Tuning for Decision Tree

In [None]:
# Grid over max_depth (3..9) and random_state (10..24), keeping the pair with
# the highest test accuracy.
# NOTE(review): the best pair is chosen on the test split, so the reported
# best accuracy is an optimistic estimate; sweeping random_state also treats
# the seed as a hyperparameter. Confirm both are intended.
best_accuracy = 0.0
best_max_depth = 0
best_random_state = 0

for i in np.arange(3, 10, 1):
    for j in np.arange(10, 25, 1):
        temp_classifier_DT = DecisionTreeClassifier(max_depth=i, random_state=j)
        temp_classifier_DT.fit(X_train_array, y_train)
        temp_y_pred_DT = temp_classifier_DT.predict(X_test_array)
        score_DT = accuracy_score(y_test, temp_y_pred_DT)
        # Report and compare this tree's own score (score_DT), not a score
        # variable belonging to another model's section.
        print("Accuracy score for max_depth={} and random_state={} is: {}%".format(round(i, 1), round(j, 1), round(score_DT * 100, 2)))
        if score_DT > best_accuracy:
            best_accuracy = score_DT
            best_max_depth = i
            best_random_state = j

print('-------------------------------------------')
print('The best accuracy is {}% with max_depth={} and random_state={}'.format(round(best_accuracy * 100, 2), best_max_depth, best_random_state))


Accuracy score for max_depth=3 and random_state=10 is: 74.0%
Accuracy score for max_depth=3 and random_state=11 is: 74.0%
Accuracy score for max_depth=3 and random_state=12 is: 74.0%
Accuracy score for max_depth=3 and random_state=13 is: 74.0%
Accuracy score for max_depth=3 and random_state=14 is: 74.0%
Accuracy score for max_depth=3 and random_state=15 is: 74.0%
Accuracy score for max_depth=3 and random_state=16 is: 74.0%
Accuracy score for max_depth=3 and random_state=17 is: 74.0%
Accuracy score for max_depth=3 and random_state=18 is: 74.0%
Accuracy score for max_depth=3 and random_state=19 is: 74.0%
Accuracy score for max_depth=3 and random_state=20 is: 74.0%
Accuracy score for max_depth=3 and random_state=21 is: 74.0%
Accuracy score for max_depth=3 and random_state=22 is: 74.0%
Accuracy score for max_depth=3 and random_state=23 is: 74.0%
Accuracy score for max_depth=3 and random_state=24 is: 74.0%
Accuracy score for max_depth=4 and random_state=10 is: 74.0%
Accuracy score for max_d

### PREDICTING NEW REVIEWS WITH DECISION TREE


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

def predict_sentiment(sample_review, model=None):
    """Predict the sentiment label for a single raw review string.

    The review is reduced to letters only, lower-cased, stripped of English
    stop words, stemmed with ``PorterStemmer``, vectorized with the
    notebook-level ``vectorizer`` and passed to ``model.predict``.

    Parameters
    ----------
    sample_review : str
        Raw review text.
    model : object with a ``predict`` method, optional
        Fitted classifier to use. Defaults to the notebook-level decision
        tree ``dt`` fitted in this section, so existing one-argument calls
        keep working.

    Returns
    -------
    Whatever ``model.predict`` returns for the single vectorized review.
    """
    if model is None:
        # This section evaluates the decision tree, so predict with `dt`.
        model = dt

    # NOTE(review): the training text appears to be cleaned by text_process,
    # which does not stem; confirm that stemming here matches the vocabulary
    # `vectorizer` was fitted on, otherwise stemmed tokens may be ignored.
    letters_only = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review)

    # Build the stop-word set once rather than once per token.
    english_stopwords = set(stopwords.words('english'))
    ps = PorterStemmer()
    final_review = ' '.join(
        ps.stem(word)
        for word in letters_only.lower().split()
        if word not in english_stopwords
    )

    temp = vectorizer.transform([final_review]).toarray()
    return model.predict(temp)

def predict_review(sample_review):
    """Print the positive/negative verdict that ``predict_sentiment`` gives the review."""
    # Guard clause: handle the positive case and leave early.
    if predict_sentiment(sample_review):
        print('this is a positive review')
        return
    print('this is a negative review')


# Hand-written review; predict_review prints the predicted sentiment for it
sample_review_dt  = 'nice nice nice '
predict_review(sample_review_dt)


this is a positive review
