In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install beautifulsoup4
# !pip install textblob
# !pip install mlxtend
# !pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git --upgrade --force-reinstall

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import preprocess_kgptalkie as ps
# nltk.download('punkt')


In [2]:
import os
data = pd.read_csv(os.listdir()[2])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
# check the balance of the sentiment column
data['sentiment'].value_counts().reset_index()

Unnamed: 0,sentiment,count
0,positive,25000
1,negative,25000


## Preprocessing

In [5]:

def remove_stop_words(text):
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens =[token for token in tokens if token.lower() not in stop_words]
    return " ".join(tokens)

In [6]:
data['review'] = data['review'].apply(remove_stop_words)

In [7]:
data

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production . < br / > < br / ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically 's family little boy ( Jake ) thinks...,negative
4,Petter Mattei 's `` Love Time Money '' visuall...,positive
...,...,...
49995,thought movie right good job . n't creative or...,positive
49996,"Bad plot , bad dialogue , bad acting , idiotic...",negative
49997,Catholic taught parochial elementary schools n...,negative
49998,'m going disagree previous comment side Maltin...,negative


In [8]:
data['review'] = data['review'].apply(lambda x: remove_stop_words(x))

In [9]:
data

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production . < br / > < br / ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically 's family little boy ( Jake ) thinks...,negative
4,Petter Mattei 's `` Love Time Money `` visuall...,positive
...,...,...
49995,thought movie right good job . n't creative or...,positive
49996,"Bad plot , bad dialogue , bad acting , idiotic...",negative
49997,Catholic taught parochial elementary schools n...,negative
49998,'m going disagree previous comment side Maltin...,negative


In [10]:

def lemmatize_text(text):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [11]:
data['review'] = data['review'].apply(lemmatize_text)

In [12]:
data

Unnamed: 0,review,sentiment
0,One reviewer mentioned watching 1 Oz episode '...,positive
1,wonderful little production . < br / > < br / ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically 's family little boy ( Jake ) think ...,negative
4,Petter Mattei 's `` Love Time Money `` visuall...,positive
...,...,...
49995,thought movie right good job . n't creative or...,positive
49996,"Bad plot , bad dialogue , bad acting , idiotic...",negative
49997,Catholic taught parochial elementary school nu...,negative
49998,'m going disagree previous comment side Maltin...,negative


In [13]:
def clean_text2(text):
    import re
    text = re.sub(r'[^a-zA-Z0-9\s]','', text) # remove special characters
    text = re.sub(r'\s+', ' ', text) # remove extra whit spaces
    text = re.sub(r'<[^>]+>', '', text) # remove html tags
    return text

In [14]:
data['review']= data['review'].apply(lambda x: clean_text2(x))
data

Unnamed: 0,review,sentiment
0,One reviewer mentioned watching 1 Oz episode l...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically s family little boy Jake think s zom...,negative
4,Petter Mattei s Love Time Money visually stunn...,positive
...,...,...
49995,thought movie right good job nt creative origi...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,Catholic taught parochial elementary school nu...,negative
49998,m going disagree previous comment side Maltin ...,negative


In [15]:
print(f"{data['review'][9000]} -> {data['sentiment'][9000]}") # this pass mark

movie try hard something s good movie want fooled begining end failsFrom start get interesting fall apart re hoping ending give clue going nt br br  -> negative


## Training dataset and evaluate
#### Using Pipeline to containarize two functions

In [16]:
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.25, random_state=42)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Container for TfidfVectorizer and the MultinomialNB
clf = Pipeline([('tfidf', TfidfVectorizer()), ('nb_classifier', MultinomialNB())])
clf.fit(X_train, y_train)

In [18]:
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('nb_classifier', MultinomialNB())])

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Evaluate the performance of the model using classification, confussion matrix and accuracy score
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.85      0.89      0.87      6157
    positive       0.89      0.85      0.87      6343

    accuracy                           0.87     12500
   macro avg       0.87      0.87      0.87     12500
weighted avg       0.87      0.87      0.87     12500

0.86984


In [20]:
from sklearn.ensemble import RandomForestClassifier

# create an object of the class and train the model
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)
rf_clf = Pipeline([('tfidf', TfidfVectorizer()), ('rf_classifier', rf_clf)])
rf_clf.fit(X_train, y_train)

# Evaluate the performance of the model using classification, confussion matrix and accuracy score
y_pred = rf_clf.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))
print(y_pred)

Accuracy Score: 0.76664
Classification Report:               precision    recall  f1-score   support

    negative       0.73      0.84      0.78      6157
    positive       0.82      0.69      0.75      6343

    accuracy                           0.77     12500
   macro avg       0.77      0.77      0.77     12500
weighted avg       0.77      0.77      0.77     12500

Confussion Matrix: [[5188  969]
 [1948 4395]]
Accuracy Score: 0.76664
Classification Report:               precision    recall  f1-score   support

    negative       0.73      0.84      0.78      6157
    positive       0.82      0.69      0.75      6343

    accuracy                           0.77     12500
   macro avg       0.77      0.77      0.77     12500
weighted avg       0.77      0.77      0.77     12500

Confussion Matrix: [[5188  969]
 [1948 4395]]
['negative' 'positive' 'negative' ... 'negative' 'positive' 'positive']


In [None]:
from sklearn.linear_model import LogisticRegression

# create an object of the class and train the model
log_reg = Pipeline([('tfidf', TfidfVectorizer()), ('model', LogisticRegression(random_state=42))])
log_reg.fit(X_train.values, y_train.values)

# Evaluate the performance of the model using classification, confussion matrix and accuracy score
y_pred = log_reg.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))

from joblib import dump, load

# Save the model
dump(log_reg, 'logistic_reg.pkl')


# Hyperparameter tuning to adjust the model parameters in order to improve its performance
# Using GridSearchCV class of sklearn

from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__C': [0.1, 0.5, 1, 2, 3, 5, 10], # regularization strength
    'model__penalty': [None, 'l2'], # regularization type
    'model__max_iter': [1000, 5000, 10000]
    }
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train.values, y_train.values)

# get the best model with optimized hyperparameters
# best_model = grid_search.best_estimator_

# make prediction with best model
y_predict = grid_search.predict(X_test)

# evaluate the best model
print('best model ', grid_search.best_params_)
print('best score ', grid_search.best_score_)
print('Confussion Matrix:', confusion_matrix(y_test, y_predict))
print('Accuracy Score:', accuracy_score(y_test, y_predict))
print('Classification Report:', classification_report(y_test, y_predict))

Accuracy Score: 0.8988
Classification Report:               precision    recall  f1-score   support

    negative       0.91      0.89      0.90      6157
    positive       0.89      0.91      0.90      6343

    accuracy                           0.90     12500
   macro avg       0.90      0.90      0.90     12500
weighted avg       0.90      0.90      0.90     12500

Confussion Matrix: [[5456  701]
 [ 564 5779]]




In [None]:
# evaluate the best model
print('best model ', grid_search.best_params_)
print('best score ', grid_search.best_score_)
print('Confussion Matrix:', confusion_matrix(y_test, y_predict))
print('Accuracy Score:', accuracy_score(y_test, y_predict))
print('Classification Report:', classification_report(y_test, y_predict))
y_predict

In [None]:
# train a classifier with the optimal hyperparameters
log_reg = LogisticRegression(**grid_search.best_params_)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# evaluate the classifier on the test data
print('accuracy score :', accuracy_score(y_test, y_pred))
print('classification: ', classification_report(y_test, y_pred))
print('confusion matrix: ', confusion_matrix(y_test, y_pred))

In [None]:
# pip install --upgrade jupyter

In [None]:
from sklearn.tree import DecisionTreeClassifier

# create an object of the class and train the model
decision_tree = Pipeline([('tfidf', TfidfVectorizer()), ('DT_classifier', DecisionTreeClassifier(random_state=42))])
decision_tree.fit(X_train.values, y_train.values)

# Evaluate the performance of the model using classification, confussion matrix and accuracy score
y_pred = decision_tree.predict(X_test)

print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(kernel='linear', C=1, random_state=42)


# create an object of the class and train the model
svm_clf = Pipeline([('tfidf', TfidfVectorizer()), ('svm_classifier', SVC(kernel='linear', C=1, random_state=42))])
svm_clf.fit(X_train.values, y_train.values)

# Evaluate the performance of the model using classification, confussion matrix and accuracy score
y_pred = svm_clf.predict(X_test)

print('Accuracy Score:\n', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confussion Matrix:\n', confusion_matrix(y_test, y_pred))

## Visualizing results

In [None]:
# Confusion matrix visualization
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

In [None]:
# classificatio report visualization
class_report = classification_report(y_test, y_pred, output_dict=True)
sns.heatmap(pd.DataFrame(class_report).iloc[:-1, :-1], annot=True, cmap='Blues')
# plt.figure(figsize=(50, 170))
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

In [None]:
# Roc auc visualization
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_test_num = le.fit_transform(y_test)
y_pred_num = le.transform(y_pred)

auc = roc_auc_score(y_test_num, y_pred_num)
fpr, tpr, thresholds = roc_curve(y_test_num, y_pred_num)
plt.plot(fpr, tpr, label='AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
# Precision recall visualization
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test_num, y_pred_num)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
plt.show()

## Saving the model

In [None]:
from joblib import dump, load

# Save the model
dump(log_reg, 'logistic_reg.pkl')

In [None]:
# Unseen data 
unseen_data = [
    "thought movie right good job nt creative origi.",
    "This movie is hate; I like it!",
]
# Xs = vectorizer.transform(unseen_data)
y_pred = log_reg.predict(unseen_data)
y_pred

In [None]:
# Load the Model
load_model = load('logistic_reg.pkl')
load_model.predict(unseen_data)

## Second Method

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
data['review'] = data['review'].str.lower()
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['review'])
y = data['sentiment']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [None]:
y_pred = nb_classifier.predict(X_test)
y_pred

""" to solve this problem 'AttributeError                            Traceback (most recent call last)
C:\Users\HOLART~1\AppData\Local\Temp/ipykernel_9900/1904468047.py in <module>
----> 1 nb_classifier.predict(['Wow, this is amazing lesson'].reshape(-1, 1))
AttributeError: list object has no attribute reshape"""

data

## Model Evaluation

In [None]:
from sklearn .metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.linear_model import LogisticRegression

# create an object of the class
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# evaluate model performance
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# create an object of the class
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# create an object of the class
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))
print(y_pred)

In [None]:
from sklearn.svm import SVC

# create an SVM classifier object
svm_clf = SVC(kernel='linear', C=1, random_state=42)
svm_clf.fit(X_train, y_train)
svm_clf.predict(X_test)

y_pred = svm_clf.predict(X_test)
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))
print('Confussion Matrix:', confusion_matrix(y_test, y_pred))
print(y_pred)

exploring other SVM kernels to see which one works best for our dataset

In [None]:
# define a list of kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# loop through each kernel and train an SVM model
for kernel in kernels:
    # create an object of svm
    svm_clf = SVC(kernel=kernel, C=1, random_state=42)
    
    # train model on the data
    svm_clf.fit(X_train, y_train)
    
    # make prediction on the testing data
    y_pred = svm_clf.predict(X_test)
    
    # evaluate the model's peformance
    print('Kernel: ', kernel)
    print('Accuracy Score:', accuracy_score(y_test, y_pred))
    print('classification report:', classification_report(y_test, y_pred))
    print('confussion matrix:', confusion_matrix(y_test, y_pred))

tune other hyperparameters like C and degree

In [None]:
# define a list of kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# define a dictionary of hyperparameters to try for each kernel
param_grid = {
    'linear': {'C':[0.1,1,10]},
    'poly': {'C':[0.1, 1, 10], 
             'degree':[2, 3, 4]},
    'rbf':{'C':[0.1, 1, 10],
          'gamma':['scale', 'auto']},
    'sigmoid':{'C':[0.1, 1, 10]}
             }

# loop through each kernel and train an SVM model
for kernel in kernels:
    # create an object of svm
    svm_clf = SVC(kernel=kernel, C=1, random_state=42)
    
    # perform grid search over the hyperparameters for the current kernel
    grid_search = GridSearchCV(svm_clf, param_grid[kernel], cv=5, scoring='f1_macro')
    
    
    # train model on the data
    grid_search.fit(X_train, y_train)
    
    # make prediction on the testing data
    y_pred = grid_search.predict(X_test)
    
    # evaluate the model's peformance
    print('Kernel: ', kernel)
    print('Accuracy Score: ', accuracy_score(y_test, y_pred))
    print('classification: ', classification_report(y_test, y_pred))
    print('confusion matrix: ', confusion_matrix(y_test, y_pred))
    print('best params ',grid_search.best_params_)
    print('best score ',grid_search.best_score_)
    

In [None]:
# train an sm classifier with the optimal hyperparameters
svm_clf = SVC(**grid_search.best_params_)
svm_clf.fit(X_train, y_train)
y_pred = svm_clf.predict(X_test)

# evaluate the classifier on the test data
print('accuracy score :', accuracy_score(y_test, y_pred))
print('classification: ', classification_report(y_test, y_pred))
print('confusion matrix: ', confusion_matrix(y_test, y_pred))

## Optimizing the model performance

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1, 2, 3, 5, 10]}
grid_search = GridSearchCV(nb_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:
print('Best params:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)

## Visualizing the Results

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, cmap='Blues')
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

## Classification Report 

In [None]:
class_report = classification_report(y_test, y_pred, output_dict=True)
sns.heatmap(pd.DataFrame(class_report).iloc[:-1, :-1], annot=True, cmap='Blues')
# plt.figure(figsize=(50, 170))
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.show()

## ROC-AUC Curve Visualization

coverting the data type to numeric value


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_test_num = le.fit_transform(y_test)
y_pred_num = le.transform(y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

auc = roc_auc_score(y_test_num, y_pred_num)
fpr, tpr, thresholds = roc_curve(y_test_num, y_pred_num)
plt.plot(fpr, tpr, label='AUC')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.legend()
plt.show()

## Precision Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test_num, y_pred_num)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.grid()
plt.show()

## Feature Importance Plot

In [None]:
from sklearn.metrics import f1_score
import numpy as np
def calculate_feature_importance(X, y):
    feature_importances = []
    for feature in range(X.shape[1]):
        feature_importances.append(np.abs(X[y==0, feature].mean()-X[y==1, feature].mean()))
        return feature_importances
    
feature_importance = calculate_feature_importance(X_train, y_train)
# feature_names = X_train.columns
# plt.figure(figsize = (10, 6))
# sns.barplot(x=feature_names, y=feature_importances)
# plt.xlabel('Feature')
# plt.ylabel('Importance')
# plt.show()
feature_importance

In [None]:
# calculate correlation matrix
import seaborn as sns
corr_matrix = X_train.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', square=True)
plt.show()

In [None]:
nb_classifier = MultinomialNB(**grid_search.best_params_)
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

In [None]:
y_pred

In [None]:
unseen_data = [
    "thought movie right good job nt creative origi.",
    "This movie is hate; I like it!",
]
Xs = vectorizer.transform(unseen_data)
y_pred = nb_classifier.predict(Xs)
y_pred

## Save the Model

In [None]:
from joblib import dump, load

# Save the model
dump(nb_classifier, 'nb_classifier.pkl')

In [None]:
# Load the Model
load_model = load('nb_classifier.pkl')
load_model.predict(Xs)

In [None]:
# Save vectorizer
dump()

In [None]:
RFC_model = RandomForestClassifier()

RFC_model.fit(X_train_cv, y_train)