In [3]:
### Data Loaders
import pandas as pd

### Reproducability
import numpy as np
random_seed = 42
np.random.seed(random_seed)

### Plotting
import matplotlib.pyplot as plt

### Feature Extractors
from sklearn.feature_extraction.text import TfidfVectorizer

### Models
from sklearn.svm import LinearSVC

### Pipelining
from sklearn.pipeline import Pipeline

### Hyperparameter tuning & Model selection
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

### Persistence
from joblib import dump, load

In [4]:
data = pd.read_excel('../Downloads/datasets.xls',sheet_name="IMDb", header=None)
data.columns=["X",'y']
data = data.sample(frac=1) #Shuffle

In [5]:
data.sample(n=20)

Unnamed: 0,X,y
37392,A young couple decides to runaway to sunny Cal...,NEGATIVE
32610,The first von Trier movie i've ever seen was b...,POSITIVE
34985,This interesting film noir features three very...,POSITIVE
18583,Sometime I fail to understand what do the dire...,NEGATIVE
44015,I saw it at Cinema MK2 Hautefeuille just one n...,POSITIVE
342,Directed by Diane Keaton and adapted from a bo...,NEGATIVE
28434,This is comedy as it once was and comparing th...,POSITIVE
37054,This movie was a heart-felt piece of cinema th...,POSITIVE
13264,After reading tons of good reviews about this ...,POSITIVE
37142,publicity got me to the theatre<br /><br />adv...,NEGATIVE


In [6]:
### Remove <br /> tokens
X = data['X'].str.replace('<br />','')

In [7]:
### Convert y from Positive & Negative labels to 1 & 0 labels
y = data['y'].map(lambda x: 1 if x=="POSITIVE" else 0)

In [8]:
y.value_counts()

y
1    25000
0    25000
Name: count, dtype: int64

In [9]:
pipe = Pipeline(steps=[
    ('vec',TfidfVectorizer()),
    ('svc',LinearSVC())])
params ={
    "vec__stop_words":[None],
    "vec__ngram_range":[(1,1),(1,2)],
    "vec__min_df":[0.01,0.001,0.0001,0],
    "vec__max_df":[0.99,0.999,0.9999,1],
    "svc__penalty":['l1','l2'],
    "svc__C":[1,2,5,10],
    "svc__max_iter":[6000],
    "svc__random_state":[42],
}

In [None]:
search = GridSearchCV(pipe, params, scoring='f1', cv=5, verbose=2)
search.fit(X,y)

In [28]:
print("Best estimator: ", search.best_estimator_)
print("Best score: ", search.best_score_)
print("Best params: ", search.best_params_)

Best estimator:  Pipeline(steps=[('vec', TfidfVectorizer(ngram_range=(1, 2))),
                ('svc', LinearSVC(max_iter=10000, random_state=42))])
Best score:  0.9107523444173637
Best params:  {'svc__max_iter': 10000, 'svc__random_state': 42, 'vec__ngram_range': (1, 2), 'vec__stop_words': None}


In [29]:
res = pd.DataFrame(search.cv_results_)
res.to_csv("results_SVC.csv")

In [30]:
res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__max_iter,param_svc__random_state,param_vec__ngram_range,param_vec__stop_words,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,5.273558,0.30415,4.347243,0.209705,10000,42,"(1, 1)",,"{'svc__max_iter': 10000, 'svc__random_state': ...",0.895524,0.893407,0.894466,0.001058,2
1,22.020233,0.952747,13.625245,0.244632,10000,42,"(1, 2)",,"{'svc__max_iter': 10000, 'svc__random_state': ...",0.91183,0.909675,0.910752,0.001077,1


In [96]:
best_params = {'svc__C': 1, 'svc__max_iter': 10000, 'svc__penalty': 'l2', 'svc__random_state': 42, 'vec__max_df': 0.9, 'vec__min_df': 0.0, 'vec__ngram_range': (1, 2), 'vec__stop_words': None}

In [97]:
vectorizer = TfidfVectorizer(**{ i[5:]:best_params[i] for i in best_params if i.find('vec')!=-1})
svc = LinearSVC(**{ i[5:]:best_params[i] for i in best_params if i.find('svc')!=-1})
X_ = vectorizer.fit_transform(X)
svc.fit(X_,y)



In [98]:
y_ = svc.predict(X_)

In [99]:
pd.Series(y_).value_counts()

0    25003
1    24997
Name: count, dtype: int64

In [100]:
test = "I like this movie, but it has a bad plot"
if(svc.predict(vectorizer.transform([test]))==1):
    print("Positive")
else:
    print("Negative")

Negative


In [17]:
dump(svc, 'sentiment_2.joblib')

['sentiment_2.joblib']

In [18]:
dump(vectorizer, 'tfidf_2.joblib')

['tfidf_2.joblib']