In [1]:
from Scripts import loading as dl, preprocessing as prep
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest,f_regression, f_classif
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [2]:
movie_link = '../Data/amazon_movie.pkl'
df = dl.load_sampled(movie_link, 10000)

In [3]:
df.head()

Unnamed: 0,text,label
0,I did not learn after the first movie. I boug...,1.0
1,I won't review the movie because this has alre...,1.0
2,never explained anything and the ending was th...,1.0
3,pathetic acting and unrealistic jujitsu fight ...,1.0
4,Incredible (as in unbelievable) that three suc...,1.0


In [4]:
text = prep.preprocess_reviews(df.text)
target = df.label

In [5]:
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.25,random_state=7, stratify=df.label)

In [None]:
print('Training target statistics: {}'.format(Counter(y_train), sorted(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test), sorted(y_test)))
vectorizer = TfidfVectorizer(min_df=5,max_df=0.9, ngram_range=(1,3))
train_vectorized = vectorizer.fit_transform(X_train)
test_vectorized = vectorizer.transform(X_test)

vectorizer_fs = SelectKBest(score_func=f_classif, k=1000)
fs_train_vectorized = vectorizer_fs.fit_transform(train_vectorized,y_train)
fs_test_vectorized = vectorizer_fs.transform(test_vectorized)
clf = LogisticRegression(C=1.0, dual=False, fit_intercept=True, random_state=0, solver='lbfgs', intercept_scaling=1, max_iter=100, multi_class='multinomial', class_weight='balanced')
clf.fit(fs_train_vectorized, y_train)
y_pred = clf.predict(fs_test_vectorized)
accuracy = str(metrics.accuracy_score(y_test, y_pred))
precision = str(metrics.precision_score(y_test, y_pred, average="macro"))
recall = str(metrics.recall_score(y_test, y_pred))
f1 = str(metrics.f1_score(y_test, y_pred, average="macro"))
print("Accuracy:" + accuracy)
print("Precision:" + precision)
print("F1:" + f1)
print(pd.crosstab(y_test, y_pred))

# Visualization of Confusion Matrix and saving
plt.rcParams['figure.facecolor'] = 'white'
title = f"Confusion matrix 75_25)"
disp = plot_confusion_matrix(clf, fs_test_vectorized, y_test,
                             display_labels=[1.0, 2.0, 3.0, 4.0, 5.0],
                             cmap=plt.cm.Blues)
disp.ax_.set_title(title)
plt.show()

title_norm = title + "_normalize"
disp_norm = plot_confusion_matrix(clf, fs_test_vectorized, y_test,
                             display_labels=[1.0, 2.0, 3.0, 4.0, 5.0],
                             cmap=plt.cm.Blues,
                             normalize='true')
disp_norm.ax_.set_title(title_norm)
plt.show()

# Tfidf
# Accuracy:0.5044
# Precision:0.49877848211061393
# F1:0.5005075581374712

# Count
# Accuracy:0.49352
# Precision:0.48421926835444573
# F1:0.4849920759957114

In [None]:
import numpy as np

# pipeline = Pipeline([
#     ('vect', TfidfVectorizer()),
#     ('select',SelectKBest()),
#     ('clf',LogisticRegression(fit_intercept=True, dual=False, random_state=123, verbose=3))
# ])
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf',LogisticRegression())
])
# parameters = [{
#     'vect__max_df': (0.5, 0.75, 0.8, 0.9, 1.0),
#     'vect__ngram_range': ((1,1), (1,2), (1,3)),
#     'vect__min_df': (2,3,4,5),
#     'vect__norm': ('l1','l2'),
#     'vect__sublinear_tf': (True, False),
#     'select__score_func': (f_classif, f_regression),
#     'select__k': (500,1000,1500,2000,2500,3000,3500,4000,4500,5000),
#     'clf__C': (np.logspace(-4,4,20),1.0),
#     'clf__solver': ('liblinear', 'lbfgs', 'saga'),
#     'clf__max_iter': (100,200,500,1000),
#     'clf__class_weight': (None, 'balanced'),
#     'clf__multi_class': ('ovr', 'multinomial')
# },{
#     'vect': (CountVectorizer(),),
#     'vect__max_df': (0.5, 0.75, 0.8, 0.9, 1.0),
#     'vect__ngram_range': ((1,1), (1,2), (1,3)),
#     'vect__min_df': (2,3,4,5),
#     'vect__norm': ('l1','l2'),
#     'vect__sublinear_tf': (True, False),
#     'select__score_func': (f_classif, f_regression),
#     'select__k': (500,1000,1500,2000,2500,3000,3500,4000,4500,5000),
#     'clf__C': (np.logspace(-4,4,20),1.0),
#     'clf__solver': ('liblinear', 'lbfgs', 'saga'),
#     'clf__max_iter': (100,200,500,1000),
#     'clf__class_weight': (None, 'balanced'),
#     'clf__multi_class': ('ovr', 'multinomial')
# }]
parameters = [{
    'vect__max_df': (0.5, 0.75, 0.8, 0.9, 1.0),
    'vect__ngram_range': ((1,1), (1,2), (1,3)),
    'vect__min_df': (2,3,4,5),
    'vect__norm': ('l1','l2'),
    'vect__sublinear_tf': (True, False),
    'clf__multi_class': ('ovr', 'multinomial')
},{
    'vect': (CountVectorizer(),),
    'vect__max_df': (0.5, 0.75, 0.8, 0.9, 1.0),
    'vect__ngram_range': ((1,1), (1,2), (1,3)),
    'vect__min_df': (2,3,4,5),
    'vect__norm': ('l1','l2'),
    'vect__sublinear_tf': (True, False),
    'clf__multi_class': ('ovr', 'multinomial')
}]
grid_search = GridSearchCV(
    estimator = pipeline,
    param_grid = parameters,
    cv = 5,
    n_jobs = -1,
    verbose = 3
)

grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 960 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 30.9min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 56.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 97.0min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed: 142.2min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed: 210.5min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed: 299.4min


In [10]:
grid_search.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'scorer_'