In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from joblib import Parallel, delayed, cpu_count

import pandas as pd




In [2]:
def train(i, random_state=0):
    """Train and score a bag-of-words review classifier on ``i * 1000`` reviews.

    Reads ``Reviews.csv``, draws ``i * 1000`` rows, lemmatizes the lower-cased,
    whitespace-split ``Text`` column, and fits a binary ``CountVectorizer`` +
    ``LogisticRegression`` pipeline that predicts ``Score``. The sampled rows
    are split 80/20 into train and test sets with a fixed split seed.

    Parameters
    ----------
    i : int
        Sample-size multiplier; ``i * 1000`` rows are drawn from the file.
    random_state : int or None, default 0
        Seed for the row sampling so that repeated runs score the same rows.
        Pass ``None`` to draw an unseeded sample.

    Returns
    -------
    tuple
        ``((sample_size, accuracy_test), (sample_size, accuracy_train))``,
        where both scores are balanced accuracy and ``sample_size`` is
        ``i * 1000``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``i * 1000`` exceeds the
        number of rows in the file.
    """
    sample_size = i * 1000

    # Only the two columns used below are parsed. The file is re-read on
    # every call, so keeping the parse narrow matters when many sizes are run.
    df = pd.read_csv('Reviews.csv', usecols=['Text', 'Score']).sample(
        sample_size, random_state=random_state
    )

    # A stemmed variant of the text used to be computed here as well, but it
    # was never passed to the model, so that pass has been removed.
    X = df['Text']
    lemmatizer = WordNetLemmatizer()
    X_lemmatized = X.apply(
        lambda y: ' '.join([lemmatizer.lemmatize(q) for q in y.lower().split()])
    )
    y = df['Score']
    X_train, X_test, y_train, y_test = train_test_split(
        X_lemmatized, y, test_size=0.2, random_state=0
    )

    # binary=True records word presence/absence rather than word counts.
    pipeline = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                         ('model', LogisticRegression(max_iter=1000))])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy_test = balanced_accuracy_score(y_test, y_pred)
    # The train-set score is reported too so the two can be compared.
    y_pred_train = pipeline.predict(X_train)
    accuracy_train = balanced_accuracy_score(y_train, y_pred_train)
    return (sample_size, accuracy_test), (sample_size, accuracy_train)

In [None]:
# Run `train` once per sample-size multiplier (1..568), with one worker
# per CPU reported by joblib.
num_processors = cpu_count()
worker_pool = Parallel(n_jobs=num_processors)

# The progress bar wraps the multipliers as they are handed to the pool.
sample_multipliers = tqdm(range(1, 569))
results = worker_pool(
    delayed(train)(multiplier) for multiplier in sample_multipliers
)
print("All processes are done")

In [None]:
# Reference score drawn as a flat horizontal line for comparison.
# NOTE(review): 0.7982 presumably comes from an earlier experiment — confirm its source.
PREVIOUS_ACCURACY = 0.7982

# Each entry of `results` is
# ((sample_size, accuracy_test), (sample_size, accuracy_train)).
x_list_test = [test_point[0] for test_point, _ in results]
y_list_test = [test_point[1] for test_point, _ in results]
x_list_train = [train_point[0] for _, train_point in results]
y_list_train = [train_point[1] for _, train_point in results]
x_list_previous = list(x_list_test)
y_list_previous = [PREVIOUS_ACCURACY] * len(results)

# Build the table in one shot instead of appending row by row. The leading
# (0, 0.0, 1.0) row is the seed row the saved CSV has always started with.
data = {
    'sample_size': [0] + x_list_test,
    'accuracy_test': [0.0] + y_list_test,
    'accuracy_train': [1.0] + y_list_train
}
df_to_save = pd.DataFrame(data)

df_to_save.to_csv('results_AI.csv', index=False)
print('CSV file "results_AI.csv" has been updated successfully.')

# Explicit figure/axes, with a label per curve so the plot is readable on
# its own.
fig, ax = plt.subplots()
ax.plot(x_list_test, y_list_test, 'r', label='Test')
ax.plot(x_list_train, y_list_train, 'b', label='Train')
ax.plot(x_list_previous, y_list_previous, 'g', label='Previous')

ax.set_xlabel('Sample Size')
ax.set_ylabel('Mean Accuracy')
ax.legend()

fig.savefig('graph.png')

In [None]:
# Using accuracy_score:

# X mean score: 0.6140000000000001
# X_stemmed mean score: 0.615
# X_lemmatized mean score: 0.61375
# X_lemmatized of X_stemmed mean score: 0.6147500000000001

# Using balanced_accuracy_score:

# X_stemmed mean score: 0.19892086330935252
# X_lemmatized mean score: 0.1998561151079137
# X_lemmatized of X_stemmed mean score: 0.19899280575539569

# Without Topic Modeling
# X_lemmatized mean score: 0.23873887902664884

# Mean test-set score over every run collected in `results`; each entry is
# ((sample_size, accuracy_test), (sample_size, accuracy_train)).
test_scores = [test_point[1] for test_point, _ in results]
sum_accuracies = sum(test_scores)

# Divide by the actual number of runs rather than a fixed 10, and avoid a
# ZeroDivisionError when no runs were collected.
mean_accuracy = sum_accuracies / len(test_scores) if test_scores else float('nan')
print(mean_accuracy)