In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
df = pd.read_csv("../data/cleaned_data.csv")
df

Unnamed: 0,sentiment,review
0,1,reviewers mentioned watching oz episode youll ...
1,1,wonderful little production filming technique ...
2,1,thought wonderful way spend hot summer weekend...
3,0,basically theres family little boy jake thinks...
4,1,petter matteis love money visually stunning wa...
...,...,...
49995,1,thought right job wasnt creative original firs...
49996,0,bad plot bad dialogue bad acting idiotic direc...
49997,0,catholic taught parochial elementary schools n...
49998,0,going disagree previous comment side maltin se...


In [6]:
tfidf = TfidfVectorizer(max_features=10000)

In [7]:
X = df["review"]
y = df["sentiment"]

X = tfidf.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# SVC

In [6]:
svc = LinearSVC()
svc.fit(X_train, y_train)

LinearSVC()

In [7]:
y_pred = svc.predict(X_test)

In [8]:
accuracy_score = accuracy_score(y_test, y_pred)

In [10]:
print('SVC model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_pred)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, y_pred))

SVC model accuracy is 87.77%
------------------------------------------------
Confusion Matrix:
      0     1
0  7189  1092
1   926  7293
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      8281
           1       0.87      0.89      0.88      8219

    accuracy                           0.88     16500
   macro avg       0.88      0.88      0.88     16500
weighted avg       0.88      0.88      0.88     16500



In [14]:
import pickle
import os

files = os.listdir("../models/")
filename = f"../models/model_{len(files)}.pkl"

with open(filename, 'wb') as fout:
    pickle.dump((tfidf, svc), fout)