# IMDB Datasets from Kaggle

In [1]:
import pandas as pd
from sklearn.svm import SVC
import joblib

data = pd.read_csv('IMDB_Dataset.csv')
data.sample(n=10)

Unnamed: 0,review,sentiment
16310,I saw this movie in sixth grade around Christm...,positive
11301,Refreshing `lost' gem! Featuring effective dia...,positive
48601,"I love old ""monster movies"" for the pure camp ...",negative
48454,This film gets off to a bad start. An incredib...,positive
16928,Meatballs has been a main staple in my family ...,positive
19705,I think Dark Angel is great! First season was ...,positive
47795,It is hard to judge 'Imaginary Heroes' without...,positive
49953,It's really rare that you get an inside view a...,positive
1442,I purchased a DVD of this film for a dollar at...,negative
44777,Not too many people seem to know about this mo...,positive


In [2]:
data['label'] = data['sentiment'].replace(['positive', 'negative'],['1', '0'])

In [3]:
data.sample(n=10)

Unnamed: 0,review,sentiment,label
46250,"Slow, odd film that drags and plods (I mean re...",negative,0
9219,A young couple Mandy Pullman (Mitch Martin) an...,negative,0
1111,I saw this movie a fews years ago and was lite...,positive,1
47901,I've never seen many online movies in most of ...,positive,1
15437,This is a VERY entertaining movie. A few of th...,positive,1
37808,I think the majority of the people seem not th...,positive,1
400,What can you say about the film White Fire. Am...,positive,1
11784,"Aunt Cora had always been tactless, and her we...",positive,1
44297,Ruth Gordon is one of the more sympathetic kil...,positive,1
5683,I wish that all the mockumentaries and horror ...,negative,0


In [4]:
data['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

In [5]:
data.to_csv('IMDB_Dataset_label.csv')

# Preprocessing

In [6]:
#Text Preprocessing for sentiment analysis
import string
import emoji
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

stopwords = nltk.corpus.stopwords.words('english')

class AntonymReplacer(object):
    # Class for replacing negations with their antonyms
    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue

            words.append(word)
            i += 1

        return words

def preprocess(text):
    
    #1. Generating the list of words in the tweet (hastags and other punctuations removed)
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    
    #2. clean the number 
    text = re.sub(r'[0-9]', '', text)
    
    #3. lower the text
    text = text.lower()
    
    #4. conver the emoji to text form
    text = emoji.demojize(text)
    
    #5. remove punctuation 
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    #6. tokenize the text
    text = word_tokenize(text)
    
    #7. remove empty token
    text = [t for t in text if len(t) > 0]
    
    #8. remove non-alphabetical token
    text = [t for t in text if t.isalpha()]
    
    #9. replace the negation token
    replacer  = AntonymReplacer()
    text = replacer.replace_negations(text)
    
    #10. remove the stopwords
    text = [i for i in text if i not in stopwords]
    
    #11. stem the text
    porter_stemmer = PorterStemmer()
    text = [porter_stemmer.stem(w) for w in text]
    
    return text


In [7]:
# Example of text preprocessing

print(data['review'].iloc[12])
print(preprocess(data['review'].iloc[12]))

So im not a big fan of Boll's work but then again not many are. I enjoyed his movie Postal (maybe im the only one). Boll apparently bought the rights to use Far Cry long ago even before the game itself was even finsished. <br /><br />People who have enjoyed killing mercs and infiltrating secret research labs located on a tropical island should be warned, that this is not Far Cry... This is something Mr Boll have schemed together along with his legion of schmucks.. Feeling loneley on the set Mr Boll invites three of his countrymen to play with. These players go by the names of Til Schweiger, Udo Kier and Ralf Moeller.<br /><br />Three names that actually have made them selfs pretty big in the movie biz. So the tale goes like this, Jack Carver played by Til Schweiger (yes Carver is German all hail the bratwurst eating dudes!!) However I find that Tils acting in this movie is pretty badass.. People have complained about how he's not really staying true to the whole Carver agenda but we on

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer Parameters:
# - min_df=2: Discard words appearing in fewer than 2 documents.
# - max_df=0.9: Discard words appearing in more than 90% of the documents.
# - sublinear_tf=True: Use sublinear weighting for term frequency scaling.
# - use_idf=True: Enable Inverse Document Frequency (IDF) weighting.

vec = TfidfVectorizer(
    analyzer=preprocess,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    use_idf=True
)

tfidf_model = vec.fit(data['review'])
train_vec = vec.transform(data['review'])

In [9]:
import pickle
#save vectorizer
vectorizer_file = 'tfidf_vectorizer.sav'
joblib.dump(tfidf_model, open(vectorizer_file, 'wb'))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer



# Create CountVectorizer instance with desired parameters
count_vec = CountVectorizer(
    analyzer=preprocess,
    min_df=2,
    max_df=0.9
)

# Fit CountVectorizer on the training data
count_model = count_vec.fit(data['review'])
train_vec_count = count_vec.transform(data['review'])

# Save the CountVectorizer model
count_vectorizer_file = 'count_vectorizer.sav'
joblib.dump(count_model, open(count_vectorizer_file, 'wb'))

# Combine CountVectorizer and TF-IDF features
from scipy.sparse import hstack
combined_train_vec = hstack([train_vec_count, train_vec])

In [11]:
train_vec = vec.transform(data['review'])
train_vec_count = count_vec.transform(data['review'])


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Split data
SEED = 4000
X_train_count, X_test_count, y_train, y_test = train_test_split(train_vec_count, data.label, test_size=0.2, random_state=SEED)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(train_vec, data.label, test_size=0.2, random_state=SEED)


# Model Training and Evaluation

In [13]:
# Train and evaluate SVM with CountVectorizer features
clf_svc_count = SVC(kernel='linear', random_state=39)
clf_svc_count.fit(X_train_count, y_train)
predictions_count = clf_svc_count.predict(X_test_count)
print('Results with CountVectorizer:')
print(classification_report(predictions_count, y_test))
print('Confusion matrix: \n', confusion_matrix(predictions_count, y_test))
print('Accuracy score: ', accuracy_score(predictions_count, y_test))

# Train and evaluate SVM with TfidfVectorizer features
clf_svc_tfidf = SVC(kernel='linear', random_state=39)
clf_svc_tfidf.fit(X_train_tfidf, y_train)
predictions_tfidf = clf_svc_tfidf.predict(X_test_tfidf)
print('Results with TfidfVectorizer:')
print(classification_report(predictions_tfidf, y_test))
print('Confusion matrix: \n', confusion_matrix(predictions_tfidf, y_test))
print('Accuracy score: ', accuracy_score(predictions_tfidf, y_test))

# Save the models
joblib.dump(clf_svc_count, open('SVM_count_model.sav', 'wb'))
joblib.dump(clf_svc_tfidf, open('SVM_tfidf_model.sav', 'wb'))

Results with CountVectorizer:
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      4921
           1       0.87      0.85      0.86      5079

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion matrix: 
 [[4257  664]
 [ 749 4330]]
Accuracy score:  0.8587
Results with TfidfVectorizer:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      4946
           1       0.90      0.89      0.90      5054

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion matrix: 
 [[4449  497]
 [ 557 4497]]
Accuracy score:  0.8946


# Model Saving

In [14]:
# Load saved models
filename_svc_count = 'SVM_count_model.sav'
filename_svc_tfidf = 'SVM_tfidf_model.sav'
loaded_clf_svc_count = joblib.load(open(filename_svc_count, 'rb'))
loaded_clf_svc_tfidf = joblib.load(open(filename_svc_tfidf, 'rb'))

# Load saved vectorizers
count_vectorizer_file = 'count_vectorizer.sav'
tfidf_vectorizer_file = 'tfidf_vectorizer.sav'
loaded_count_vectorizer = joblib.load(open(count_vectorizer_file, 'rb'))
loaded_tfidf_vectorizer = joblib.load(open(tfidf_vectorizer_file, 'rb'))