In [2]:
import gzip, json, random, pandas as pd
from Scripts import preprocessing as prep, cleaning as clean, loading as dl
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import model_selection
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import chi2, SelectKBest, f_classif, f_regression
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [2]:
amazon_input = '../Data/reviews_Movies_and_TV_5.json.gz'
data = []
with gzip.open(amazon_input) as f:
    for l in f:
        data.append(json.loads(l.strip()))
df = pd.DataFrame.from_dict(data)

In [25]:
ama_df = df[['reviewText', 'overall']]
len(ama_df)

1697533

In [26]:
ama_df = clean.remove_duplicates(ama_df)

Before deleting duplicate entries:  1697533
After deleting duplicate entries:  1697533


In [27]:
ama_df.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
ama_df = clean.remove_dup_text(ama_df)

In [None]:
# WITH PREPROCESSING => IMPROVING PERFORMANCE
clean_text = prep.preprocess_reviews(ama_df['reviewText'])
vectorizer = CountVectorizer(ngram_range=(1,3))
text_count = vectorizer.fit_transform(clean_text)
# ros = RandomOverSampler(random_state=777)
undersampler = RandomUnderSampler(random_state=None)
text_count_res, target_res = undersampler.fit_sample(text_count, ama_df['overall'])
print("finished balancing")
clf = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(text_count_res, target_res, test_size=0.25, random_state=109)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = str(metrics.accuracy_score(y_test, y_pred))
precision = str(metrics.precision_score(y_test, y_pred, average="macro"))
f1 = str(metrics.f1_score(y_test, y_pred, average="macro"))
print(accuracy)
print(precision)
print(f1)
print(metrics.classification_report(y_test, y_pred, labels = [1.0, 2.0, 3.0, 4.0, 5.0]))
disp_norm = plot_confusion_matrix(clf, X_test, y_test,
                                      display_labels=[1.0, 2.0, 3.0, 4.0, 5.0],
                                      cmap=plt.cm.Blues,
                                      normalize='true')
vocab_3 = vectorizer.get_feature_names()

In [None]:
# Training target statistics: Counter({5.0: 18577, 4.0: 4243, 3.0: 993, 1.0: 246, 2.0: 243})

In [None]:
df_1 = ama_df[ama_df['overall']==1.0].values.tolist()
df_2 = ama_df[ama_df['overall']==2.0].values.tolist()
df_3 = ama_df[ama_df['overall']==3.0].values.tolist()
df_4 = ama_df[ama_df['overall']==4.0].values.tolist()
df_5 = ama_df[ama_df['overall']==5.0].values.tolist()
df1 = random.sample(df_1, 8000)
df2 = random.sample(df_2, 8000)
df3 = random.sample(df_3, 8000)
df4 = random.sample(df_4, 8000)
df5 = random.sample(df_5, 8000)
df11 = pd.DataFrame(df1[1:], columns=['reviewText', 'overall'])
df12 = pd.DataFrame(df2[1:], columns=['reviewText', 'overall'])
df13 = pd.DataFrame(df3[1:], columns=['reviewText', 'overall'])
df14 = pd.DataFrame(df4[1:], columns=['reviewText', 'overall'])
df15 = pd.DataFrame(df5[1:], columns=['reviewText', 'overall'])
amazon = pd.concat([df11, df12, df13, df14, df15], ignore_index=True)
amazon.reset_index(drop=True)

text = amazon['reviewText']
target = amazon['overall']

clean_text = pd.Series(prep.preprocess_reviews(text))




In [None]:
# cv = CountVectorizer(min_df=3, max_df=0.8, ngram_range=(1,5))

# clf = MultinomialNB()
# w = {1:2, 2:3, 3: 2, 4: 2, 5: 1 }
clf = LogisticRegression(dual=False, random_state=0, solver='lbfgs', multi_class='multinomial')
# clf = SVC(decision_function_shape="ovo")
# clf = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)


list_test = [0.1, 0.2, 0.3, 0.4, 0.5]
for i in list_test:
    # Split dataset into training set and test set
    test_size = i
    train_size = 1 - i
    X_train, X_test, y_train, y_test = train_test_split(clean_text, target, test_size=i,random_state=None)

    # vectorizer = TfidfVectorizer(min_df=2 ,max_df=0.8, use_idf=True,sublinear_tf=True, ngram_range=(1,3))
    vectorizer = CountVectorizer(min_df=2 ,max_df=0.8, ngram_range=(1,3))

    train_vectorized = vectorizer.fit_transform(X_train)
    vocab = vectorizer.get_feature_names()

    print ("Feature size of TF-IDF: ", len(vocab))
    test_vectorized = vectorizer.transform(X_test)

    vectorizer_chi2 = SelectKBest(score_func=f_regression,k=3500)

    chi_train_vectorized = vectorizer_chi2.fit_transform(train_vectorized,y_train)
    chi_test_vectorized = vectorizer_chi2.transform(test_vectorized)

    # Train the model using the training sets
    # nb.fit(X_train, y_train)
    clf.fit(chi_train_vectorized, y_train)
    #Predict the response for test dataset
    y_pred = clf.predict(chi_test_vectorized)

    print("Training and prediction done")
    print('Training target statistics: {}'.format(Counter(y_train), sorted(y_train)))
    print('Testing target statistics: {}'.format(Counter(y_test), sorted(y_test)))

    # Model Accuracy, how often is the classifier correct?
    print("train: " + str(train_size) + "/ test: " + str(test_size))
    accuracy = str(metrics.accuracy_score(y_test, y_pred))
    precision = str(metrics.precision_score(y_test, y_pred, average="macro"))
    f1 = str(metrics.f1_score(y_test, y_pred, average="macro"))
    print("Accuracy:" + accuracy)
    print("Precision:" + precision)
    print("F1:" + f1)
    print(pd.crosstab(y_test, y_pred))

    # Visualization of Confusion Matrix and saving
    plt.rcParams['figure.facecolor'] = 'white'
    title = f"Confusion matrix - Baseline(MNB, count(1,3), manually balanced, unprocessed, {train_size}_{test_size})"
    disp = plot_confusion_matrix(clf, chi_test_vectorized, y_test,
                                 display_labels=[1.0, 2.0, 3.0, 4.0, 5.0],
                                 cmap=plt.cm.Blues)
    disp.ax_.set_title(title)
    plt.show()

    title_norm = title + "_normalize"
    disp_norm = plot_confusion_matrix(clf, chi_test_vectorized, y_test,
                                 display_labels=[1.0, 2.0, 3.0, 4.0, 5.0],
                                 cmap=plt.cm.Blues,
                                 normalize='true')
    disp_norm.ax_.set_title(title_norm)
    plt.show()

In [5]:
# Get Features after Feature Selection
# mask = vectorizer_chi2.get_support()
# train_vectorized
# new_features = []
# mask = vectorizer_chi2.get_support() #list of booleans
# new_features = [] # The list of your K best features
#
# for bool, feature in zip(mask, feature_names):
#     if bool:
#         new_features.append(feature)

# vectorizer_chi2.fit(train_vectorized, y_train)
# # Get columns to keep and create new dataframe with those only
# cols = vectorizer_chi2.get_support(indices=True)
# features_df_new = train_vectorized[:,cols]
import numpy as np
np.asarray(vectorizer.get_feature_names())[vectorizer_chi2.get_support()]
top_ranked_features = sorted(enumerate(vectorizer_chi2.scores_),key=lambda x:x[1])[:100]
top_ranked_features_indices = list(map(list,zip(*top_ranked_features)))[0]
for feature_pvalue in zip(np.asarray(vectorizer.get_feature_names())[top_ranked_features_indices], vectorizer_chi2.pvalues_[top_ranked_features_indices]):
    print(feature_pvalue)

('live', 0.995997268663658)
('lancast', 0.9951785407935824)
('boxset', 0.994616183867685)
('caesar', 0.9944513193035504)
('hay', 0.9944513193035504)
('orang', 0.9944513193035504)
('palmer', 0.9944513193035504)
('patrol', 0.9944513193035504)
('pq', 0.9944513193035504)
('propheci', 0.9944513193035504)
('clarkson', 0.9942119522789737)
('eleanor', 0.9942119522789737)
('orphanag', 0.9942119522789737)
('person feel', 0.9942119522789737)
('godzilla', 0.9940517323956107)
('adjust', 0.9938602381998565)
('altman', 0.9938602381998565)
('amigo', 0.9938602381998565)
('back littl', 0.9938602381998565)
('beck', 0.9938602381998565)
('bob dylan', 0.9938602381998565)
('book howev', 0.9938602381998565)
('burt lancast', 0.9938602381998565)
('but turn', 0.9938602381998565)
('car wash', 0.9938602381998565)
('charact whose', 0.9938602381998565)
('cigarett', 0.9938602381998565)
('confer', 0.9938602381998565)
('csi', 0.9938602381998565)
('danc wolv', 0.9938602381998565)
('elf', 0.9938602381998565)
('elit', 0.9

In [96]:
print(features_df_new[0])
# print(mask[])
# print(vectorizer.get_feature_names()[0])

  (0, 828)	1
  (0, 1809)	2
  (0, 1844)	1
  (0, 2301)	1
  (0, 2722)	1
  (0, 3636)	1
  (0, 3640)	1
  (0, 4100)	1
  (0, 4105)	1
  (0, 4148)	1
  (0, 4488)	1
  (0, 4800)	3
  (0, 4829)	2


In [None]:
print('Training target statistics: {}'.format(Counter(target), sorted(target)))

In [None]:
from sklearn.metrics import roc_curve, auc

In [60]:
models = []
models.append(('LR', LogisticRegression(dual=False, random_state=10)))
models.append(('SDG', SGDClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('MNB', MultinomialNB()))
models.append(('GNB', GaussianNB()))
models.append(('SVM', SVC()))
results = []
names = []
scoring = 'f1_macro'
vectorizer = CountVectorizer(min_df=2 ,max_df=0.8, ngram_range=(1,3))
text_vectorized = vectorizer.fit_transform(clean_text)
# vectorizer_chi2 = SelectKBest(chi2,k=3000)
vectorizer_chi2 = SelectKBest(score_func=f_classif, k=3500)
chi_text_vectorized = vectorizer_chi2.fit_transform(text_vectorized,target)
seed = 7
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, chi_text_vectorized, target, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LR: 0.095058 (0.025834)
SDG: 0.098031 (0.026703)
LDA: nan (nan)
KNN: 0.047061 (0.028415)
CART: 0.063673 (0.017715)
MNB: 0.104914 (0.031478)
GNB: nan (nan)


KeyboardInterrupt: 