In [1]:
# 1 import basic library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# 2. NLP & Text Processing
import re  # For text cleaning
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
# remove html tags
import re
def remove_html_regex(text):
    return re.sub(r'<.*?>', '', text)

df['review'] = df['review'].apply(remove_html_regex)

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# tokenization of sentence
import nltk


from nltk.tokenize import sent_tokenize

def tokenize_sentences(text):
    return sent_tokenize(text)

df['tokenized_sentences'] = df['review'].apply(tokenize_sentences)

print(df[['review', 'tokenized_sentences']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. The filming tec...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                 tokenized_sentences  
0  [One of the other reviewers has mentioned that...  
1  [A wonderful little production., The filming t...  
2  [I thought this was a wonderful way to spend t...  
3  [Basically there's a family where a little boy...  
4  [Petter Mattei's "Love in the Time of Money" i...  


In [8]:
# remove stop words
from nltk.corpus import stopwords
# Download stopwords 
#nltk.download('stopwords')
# Load English stopwords
stop_words = set(stopwords.words('english'))

In [9]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words) 


In [10]:
df['cleaned_review'] = df['review'].apply(remove_stopwords)

print(df[['review', 'cleaned_review']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. The filming tec...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  One reviewers mentioned watching 1 Oz episode ...  
1  wonderful little production. filming technique...  
2  thought wonderful way spend time hot summer we...  
3  Basically there's family little boy (Jake) thi...  
4  Petter Mattei's "Love Time Money" visually stu...  


In [11]:
# word tokenized
from nltk.tokenize import word_tokenize

def remove_stopwords_and_tokenize(text):
    words = word_tokenize(text) 
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return filtered_words 


In [12]:
df['tokenized_words'] = df['review'].apply(remove_stopwords_and_tokenize)

# Print first few rows
print(df[['review', 'tokenized_words']].head())


                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. The filming tec...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                     tokenized_words  
0  [One, reviewers, mentioned, watching, 1, Oz, e...  
1  [wonderful, little, production, ., filming, te...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [Basically, 's, family, little, boy, (, Jake, ...  
4  [Petter, Mattei, 's, ``, Love, Time, Money, ''...  


In [13]:
# use word embedding(word2Vec)
#!pip install gensim numpy nltk

from gensim.models import Word2Vec
reviews = df['tokenized_words'].tolist()

word2vec_model = Word2Vec(sentences=reviews, vector_size=100, window=5, min_count=1, workers=4)

word_vector = word2vec_model.wv['wonderful'] 
print(word_vector)

[ 2.141266    0.44812045  0.7965468  -0.55993086 -0.45098642  1.1238328
  1.0250868   2.9376867   0.43996313 -1.3816146   2.1923113  -1.1568161
  1.0837629   2.6234047  -1.5302914  -0.9083638  -0.5117327   1.777478
 -0.46710727 -0.90856266  0.721203   -3.4194717  -2.1320288  -0.37314343
 -1.1324222  -0.22327313  0.75541145 -1.2586553   0.17559348 -1.5052001
 -0.9356716   0.17586544  0.84579915  0.14343198 -1.1787841  -1.4240793
  2.815342   -0.64938015 -1.5322659   1.1596559  -0.16240345 -0.4900868
  1.0820887  -1.4792577   0.6545315   0.53689367  0.18038182  1.9251276
 -0.7024451  -0.515174    0.22734095  0.4577177   0.13182044  1.3936839
 -1.9978437   0.23743246  2.4867368  -1.0799055  -0.58304363 -0.19325909
  1.3435607   0.5216389  -1.4937156  -0.16611275  0.20091897  1.3620743
  1.1202123  -1.5793463  -1.1495384   0.18709421  0.761489   -0.3597924
  0.7805278  -0.33039725  2.5047946   1.7251036   1.4187278   0.38232225
  1.0753374   0.6679003  -0.6901265   0.49880332 -1.7897638  -

In [14]:
def get_review_vector(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
    
df['review_embedding'] = df['tokenized_words'].apply(lambda tokens: get_review_vector(tokens, word2vec_model))

print(df[['review', 'review_embedding']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. The filming tec...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                    review_embedding  
0  [0.11308393, 0.3685359, 0.10288127, 0.57589424...  
1  [0.07165608, 0.27100298, 0.31562686, 0.3812131...  
2  [0.12705946, 0.118480854, -0.13264547, 0.43205...  
3  [0.017053263, 0.049639154, -0.005259812, 0.506...  
4  [0.2621307, 0.25827566, 0.011479268, 0.4058862...  


In [15]:
# Convert sentiment column to numeric (1 for positive, 0 for negative)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print(df[['review', 'sentiment']].head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. The filming tec...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [16]:
# train test split
from sklearn.model_selection import train_test_split

X = np.vstack(df['review_embedding'].values)
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

Training samples: 40000
Testing samples: 10000


In [17]:
# train a machine learning model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [18]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8269
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.82      4961
           1       0.82      0.84      0.83      5039

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



In [19]:
# train svm model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


svm_clf = SVC(kernel='linear', random_state=42)
svm_clf.fit(X_train, y_train)


y_pred_svm = svm_clf.predict(X_test)

In [20]:
# Evaluate performance
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.8535
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      4961
           1       0.85      0.86      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [21]:
# train by gridsearchcv
from sklearn.model_selection import GridSearchCV


param_grid = {
    'C': [0.1, 1],  # Regularization strength
    'kernel': ['linear', 'rbf', 'poly'],  # Different kernels
    'gamma': ['scale', 'auto']  # Gamma settings
}


svm_grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', verbose=2)
svm_grid.fit(X_train, y_train)

print("Best Parameters:", svm_grid.best_params_)
best_svm = svm_grid.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.9min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.9min
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time= 1.6min
[CV] END ....................C=0.1, gamma=scale,

In [22]:
y_pred_best_svm = best_svm.predict(X_test)
print("Tuned SVM Accuracy:", accuracy_score(y_test, y_pred_best_svm))

Tuned SVM Accuracy: 0.8574


In [32]:
import pickle

with open('svm_sentiment_model_pkl', 'wb') as model_file:
    pickle.dump(svm_clf, model_file)

print("SVM Model saved as 'svm_sentiment_model_pkl' ")

SVM Model saved as 'svm_sentiment_model_pkl' 


In [34]:
with open('svm_sentiment_model_pkl','rb') as model_file:
   mp = pickle.load(model_file)  


In [35]:
import os

# Get desktop path
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop", "svm_sentiment_model.pkl")

# Save the model to the desktop
with open(desktop_path, 'wb') as model_file:
    pickle.dump(svm_clf, model_file)
