In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_excel(r"C:\Users\stach\Documents\IMDB-Movie-Reviews-Large-Dataset-50k\train.xlsx")

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [5]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,"When I first tuned in on this morning news, I ...",neg
1,"Mere thoughts of ""Going Overboard"" (aka ""Babes...",neg
2,Why does this movie fall WELL below standards?...,neg
3,Wow and I thought that any Steven Segal movie ...,neg
4,"The story is seen before, but that does'n matt...",neg


In [7]:
import re
from unidecode import unidecode

# Custom contraction mapping
CONTRACTION_MAP = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'t": " not",
    "'ve": " have",
    "'m": " am"
}

def cont_exp(x):
    for key in CONTRACTION_MAP.keys():
        x = x.replace(key, CONTRACTION_MAP[key])
    return x

def remove_emails(x):
    return re.sub(r'\S+@\S+', '', x)

def remove_urls(x):
    return re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE)

def remove_html_tags(x):
    return re.sub(r'<.*?>', '', x)

def remove_rt(x):
    return re.sub(r'\brt\b', '', x)

def remove_accented_chars(x):
    return unidecode(x)

def remove_special_chars(x):
    return re.sub(r'[^a-zA-Z0-9\s]', '', x)

def get_clean(x):
    x = str(x).lower().replace('\\', '').replace('_', ' ')
    x = cont_exp(x)
    x = remove_emails(x)
    x = remove_urls(x)
    x = remove_html_tags(x)
    x = remove_rt(x)
    x = remove_accented_chars(x)
    x = remove_special_chars(x)
    x = re.sub(r"(.)\1{2,}", r"\1", x)
    return x


In [8]:
df['Reviews'] = df['Reviews'].apply(lambda x: get_clean(x))

In [9]:
df.head()

Unnamed: 0,Reviews,Sentiment
0,when i first tuned in on this morning news i t...,neg
1,mere thoughts of going overboard aka babes aho...,neg
2,why does this movie fall well below standards ...,neg
3,wow and i thought that any steven segal movie ...,neg
4,the story is seen before but that doesn matter...,neg


In [14]:
tfidf = TfidfVectorizer(max_features=5000)
x = df['Reviews']
y = df['Sentiment']

x = tfidf.fit_transform(x)

In [17]:
x

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2841255 stored elements in Compressed Sparse Row format>

In [18]:
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=0)

In [19]:
clf = LinearSVC()
clf.fit(x_train, y_train)

In [29]:
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         neg       0.88      0.87      0.87      2480
         pos       0.87      0.88      0.88      2520

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [30]:
x='This movie is really bad'
x=get_clean(x)
vec= tfidf.transform([x])

clf.predict(vec)

array(['neg'], dtype=object)

In [31]:
import pickle

In [32]:
pickle.dump(clf, open('model','wb'))