In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pickle
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veluru_abhilash/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [93]:
dataset1 = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [2]:
d1 = pd.read_csv("main_data.csv")

In [3]:
d1.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,John Lasseter,Tom Hanks,Tim Allen,Don Rickles,Animation Comedy Family,toy story,Tom Hanks Tim Allen Don Rickles John Lasseter ...
1,Joe Johnston,Robin Williams,Jonathan Hyde,Kirsten Dunst,Adventure Fantasy Family,jumanji,Robin Williams Jonathan Hyde Kirsten Dunst Joe...
2,Howard Deutch,Walter Matthau,Jack Lemmon,Ann-Margret,Romance Comedy,grumpier old men,Walter Matthau Jack Lemmon Ann-Margret Howard ...
3,Forest Whitaker,Whitney Houston,Angela Bassett,Loretta Devine,Comedy Drama Romance,waiting to exhale,Whitney Houston Angela Bassett Loretta Devine ...
4,Charles Shyer,Steve Martin,Diane Keaton,Martin Short,Comedy,father of the bride part ii,Steve Martin Diane Keaton Martin Short Charles...


In [95]:
dataset1

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [4]:
d = pd.read_csv('train.csv')
#0 - negative

#1 - somewhat negative

#2 - neutral

#3 - somewhat positive

#4 - positive

In [5]:
d

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [6]:
new_df1 = d[['Phrase', 'Sentiment']]

In [7]:
new_df = new_df1.head(8000)

In [8]:
new_df

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2
...,...,...
7995,to laugh at it,2
7996,laugh at it,2
7997,at it,2
7998,Perhaps a better celebration of these unfairly...,1


In [9]:
# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [10]:
# Apply the cleaning function to the Phrase column
new_df['Phrase'] = new_df['Phrase'].apply(clean_text)

In [11]:
stopset = set(stopwords.words('english'))

In [12]:
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopset])

In [13]:
new_df['Phrase'] = new_df['Phrase'].apply(remove_stopwords)

In [14]:
vectorizer = TfidfVectorizer(max_features=5000)

In [15]:
x = vectorizer.fit_transform(new_df['Phrase']).toarray()
y = new_df['Sentiment']
#pickle.dump(vectorizer, open('tranform.pkl', 'wb'))

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

In [17]:
clf = MultinomialNB()
clf.fit(x_train,y_train)

MultinomialNB()

In [18]:
accuracy_score(y_test,clf.predict(x_test))*100

62.20833333333333

In [19]:
knn = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree', n_jobs=-1)
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='kd_tree', n_jobs=-1)

In [20]:
accuracy_score(y_test, knn.predict(x_test)) * 100

62.291666666666664

In [21]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

RandomForestClassifier()

In [22]:
accuracy_score(y_test,model.predict(x_test))*100

65.58333333333334

In [23]:
des_trees = DecisionTreeClassifier(random_state=42)
des_trees.fit(x_train, y_train)

DecisionTreeClassifier(random_state=42)

In [24]:
accuracy_score(y_test,des_trees.predict(x_test))*100

63.583333333333336

In [25]:
svm = SVC(kernel='rbf', random_state=42)
svm.fit(x_train, y_train)

SVC(random_state=42)

In [26]:
accuracy_score(y_test,svm.predict(x_test))*100

66.0

In [27]:
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg.fit(x_train, y_train)

LogisticRegression(multi_class='multinomial')

In [28]:
accuracy_score(y_test,log_reg.predict(x_test))*100

63.541666666666664

In [29]:
X = new_df['Phrase']  
y = new_df['Sentiment']

# Converting text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Applying SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_tfidf, y)

# Creating a DataFrame from the resampled data
X_smote_dense = X_smote.toarray()
resampled_df = pd.DataFrame(X_smote_dense, columns=vectorizer.get_feature_names_out())
resampled_df['Sentiment'] = y_smote

print(resampled_df['Sentiment'].value_counts())
pickle.dump(vectorizer, open('tranform.pkl', 'wb'))

1    4530
2    4530
3    4530
4    4530
0    4530
Name: Sentiment, dtype: int64


In [30]:
x_train, x_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.3, random_state=42)

In [31]:
# Training the Naive Bayes classifier
classifier1 = MultinomialNB()
classifier1.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier1.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 64.00%
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.91      0.78      1372
           1       0.59      0.56      0.58      1323
           2       0.57      0.29      0.38      1347
           3       0.62      0.52      0.57      1368
           4       0.67      0.91      0.77      1385

    accuracy                           0.64      6795
   macro avg       0.63      0.64      0.61      6795
weighted avg       0.63      0.64      0.62      6795



In [32]:
# Training the Naive Bayes classifier
classifier2 = KNeighborsClassifier(n_neighbors=5)
classifier2.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier2.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 76.51%
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.98      0.91      1372
           1       0.72      0.72      0.72      1323
           2       0.61      0.50      0.55      1347
           3       0.71      0.67      0.69      1368
           4       0.87      0.95      0.91      1385

    accuracy                           0.77      6795
   macro avg       0.75      0.76      0.76      6795
weighted avg       0.75      0.77      0.76      6795



In [33]:
classifier3 = RandomForestClassifier(random_state=42)
classifier3.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier3.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 83.38%
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.93      1372
           1       0.81      0.79      0.80      1323
           2       0.71      0.75      0.73      1347
           3       0.79      0.77      0.78      1368
           4       0.93      0.93      0.93      1385

    accuracy                           0.83      6795
   macro avg       0.83      0.83      0.83      6795
weighted avg       0.83      0.83      0.83      6795



In [34]:
# Training the Decision Tree classifier
classifier4 = DecisionTreeClassifier(random_state=42)
classifier4.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier4.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 82.19%
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      1372
           1       0.79      0.78      0.79      1323
           2       0.69      0.74      0.72      1347
           3       0.77      0.73      0.75      1368
           4       0.93      0.91      0.92      1385

    accuracy                           0.82      6795
   macro avg       0.82      0.82      0.82      6795
weighted avg       0.82      0.82      0.82      6795



In [35]:
classifier5 = SVC(kernel='rbf', random_state=42)
classifier5.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier5.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 84.61%
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      1372
           1       0.78      0.81      0.80      1323
           2       0.80      0.74      0.77      1347
           3       0.83      0.73      0.78      1368
           4       0.89      0.96      0.92      1385

    accuracy                           0.85      6795
   macro avg       0.84      0.85      0.84      6795
weighted avg       0.84      0.85      0.84      6795



In [36]:
# Training the Naive Bayes classifier
classifier6 = LogisticRegression(multi_class='multinomial', solver='lbfgs')
classifier6.fit(x_train, y_train)

# Making predictions on the test set
y_pred = classifier6.predict(x_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", report)

Accuracy: 73.35%
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.95      0.90      1372
           1       0.70      0.61      0.65      1323
           2       0.55      0.57      0.56      1347
           3       0.69      0.60      0.64      1368
           4       0.84      0.92      0.88      1385

    accuracy                           0.73      6795
   macro avg       0.73      0.73      0.73      6795
weighted avg       0.73      0.73      0.73      6795



In [37]:
filename = 'nlp_model.pkl'
pickle.dump(classifier5, open(filename, 'wb'))