In [2]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [3]:
# Location of the input CSV, given relative to the notebook's working directory.
data = Path('../data/vader_emolex.csv')
# Read the whole file into a DataFrame; later cells use its 'stars' and 'text' columns.
df = pd.read_csv(data)

In [4]:
# Collapse the numeric 'stars' column into three sentiment classes.
# Intervals are right-closed: (0, 1.5] -> Negative, (1.5, 4.5] -> Neutral,
# (4.5, 5] -> Positive. Values outside (0, 5] (or missing) become NaN.
# Presumably 'stars' holds ratings from 1 to 5 -- verify against the data.
labels = ['Negative', 'Neutral', 'Positive']
bins = [0, 1.5, 4.5, 5]
df['bins'] = pd.cut(df['stars'], bins, right=True, labels=labels)

In [5]:
# Maximum number of rows kept per sentiment class.
n = 50000
# Take the first `n` rows of each class in file order (this is not a random
# sample). 'bins' is categorical, so `observed=True` is passed explicitly:
# it avoids pandas' FutureWarning about the changing default and does not
# alter which rows `.head()` returns.
subset = df.groupby('bins', observed=True).head(n)

In [6]:
# Shuffle the class-capped subset with a fixed seed so the row order is
# reproducible. Despite its name, `test` is the entire shuffled subset,
# not a held-out test set; the train/test split happens in a later cell.
test = subset.sample(frac=1, random_state=1)

# Parallel Python lists: raw review text and its sentiment label.
text = test['text'].to_list()
target = test['bins'].to_list()

In [7]:
# TF-IDF vectoriser: breaks the text into single words, bi-grams and tri-grams
# (ngram_range=(1, 3)) and turns the documents into a sparse TF-IDF matrix.
# NOTE(review): the vectoriser is fitted on every document before the
# train/test split, so its vocabulary and IDF weights also reflect the
# held-out rows. Consider fitting on the training texts only.
vectoriser = TfidfVectorizer(ngram_range=(1,3))
vectors = vectoriser.fit_transform(text)

In [8]:
# Hold out 25% of the rows (scikit-learn's default, spelled out here) for
# evaluation; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    target,
    test_size=0.25,
    random_state=42,
)

In [9]:
# Linear SVM trained on the TF-IDF features. The seed is fixed because
# LinearSVC shuffles the data internally (dual coordinate descent), so without
# it the fitted model and the reported metrics can vary between runs.
classifier = LinearSVC(random_state=42)
classifier.fit(X_train, y_train)
# Predicted labels for the held-out rows; these feed the classification report.
predictions = classifier.predict(X_test)

In [10]:
# This cell used to call train_test_split(vectors, target, random_state=42)
# again. With identical inputs and seed that reproduces the earlier split
# exactly, so the existing partitions are reused instead of recomputed.
X_null, X_full_test, y_null, y_full_test = X_train, X_test, y_train, y_test
# NOTE(review): X_full_test is the same held-out set as X_test, so
# `predict_all` matches `predictions`. If the intent was to score the whole
# dataset, this should predict on `vectors` and compare against `target`.
predict_all = classifier.predict(X_full_test)

In [11]:
# Per-class precision / recall / F1 on the held-out rows. The report is a
# pre-formatted string, so it is printed rather than shown as a cell value.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

    Negative       0.86      0.92      0.89     12563
     Neutral       0.75      0.65      0.69     12491
    Positive       0.79      0.84      0.82     12446

    accuracy                           0.80     37500
   macro avg       0.80      0.80      0.80     37500
weighted avg       0.80      0.80      0.80     37500



In [12]:
# Persist both fitted artefacts to the current working directory. They belong
# together: new text must go through this exact vectoriser before it can be
# passed to the classifier. (`dump` is imported in the notebook's import cell.)
dump(classifier, 'model_svm.joblib')
dump(vectoriser, 'vectorizer_svm.joblib')

['vectorizer_svm.joblib']