In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from joblib import Parallel, delayed, cpu_count

import pandas as pd

In [6]:
def evaluate(csv_path='Reviews.csv', n_samples=568000, random_state=None):
    """Train and score a bag-of-words logistic-regression review classifier.

    Reads the ``Text`` and ``Score`` columns of ``csv_path``, draws
    ``n_samples`` rows at random, lower-cases and lemmatizes every
    whitespace-separated token, and fits a binary ``CountVectorizer`` +
    ``LogisticRegression`` pipeline on an 80/20 train/test split (the split
    itself is seeded with ``random_state=0``).

    Parameters
    ----------
    csv_path : str, default 'Reviews.csv'
        CSV file containing at least the ``Text`` and ``Score`` columns.
    n_samples : int, default 568000
        Number of rows drawn from the file.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        behaviour (a different sample on every call); pass an int to make
        the run reproducible.

    Returns
    -------
    tuple
        ``(accuracy_test, accuracy_train, pipeline)``: the balanced accuracy
        on the test split, the balanced accuracy on the training split, and
        the fitted pipeline.
    """
    # Only the two columns used below are parsed; sampling picks rows, so
    # the selection is unaffected by which columns are loaded.
    reviews = pd.read_csv(csv_path, usecols=['Text', 'Score']).sample(
        n_samples, random_state=random_state)

    lemmatizer = WordNetLemmatizer()

    def lemmatize_review(text):
        # Lower-case, split on whitespace, lemmatize each token, re-join.
        return ' '.join(lemmatizer.lemmatize(token) for token in text.lower().split())

    X_lemmatized = reviews['Text'].apply(lemmatize_review)
    y = reviews['Score']
    X_train, X_test, y_train, y_test = train_test_split(
        X_lemmatized, y, test_size=0.2, random_state=0)

    pipeline = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                         ('model', LogisticRegression(max_iter=1000))])
    pipeline.fit(X_train, y_train)

    accuracy_test = balanced_accuracy_score(y_test, pipeline.predict(X_test))
    accuracy_train = balanced_accuracy_score(y_train, pipeline.predict(X_train))
    return accuracy_test, accuracy_train, pipeline

In [None]:
# evaluate() returns (accuracy_test, accuracy_train, pipeline).
evaluation_results = evaluate()
# Keep the fitted pipeline (last element) for the inspection cells below.
pipeline = evaluation_results[-1]
# Show the test score followed by the train score.
print(*evaluation_results[:2])

In [None]:
# Vocabulary, coefficient matrix and class labels of the fitted pipeline.
words = pipeline.named_steps['vectorizer'].get_feature_names_out()
coefs = pipeline.named_steps['model'].coef_
classes = pipeline.named_steps['model'].classes_

# Per coefficient row: vocabulary indices ordered from smallest to largest coefficient.
sorted_coef_indexes = np.argsort(coefs, axis=1)
print(classes)

# Only coefficient row 0 is inspected. Despite the "top5" names, each slice
# holds 20 entries.

# The 20 smallest coefficients of row 0 and the words they belong to.
top5words = words[sorted_coef_indexes[0][:20]]
top5coefs = coefs[0][sorted_coef_indexes[0][:20]]
print(top5words)
print(top5coefs)

# The 20 largest coefficients of row 0 (ascending) and the words they belong to.
top5words = words[sorted_coef_indexes[0][-20:]]
top5coefs = coefs[0][sorted_coef_indexes[0][-20:]]
print(top5words)
print(top5coefs)

In [2]:
def train(i, csv_path='Reviews.csv', random_state=None):
    """Fit the bag-of-words classifier on ``i * 1000`` randomly drawn reviews.

    Reads the ``Text`` and ``Score`` columns of ``csv_path``, samples
    ``i * 1000`` rows, lower-cases and lemmatizes every whitespace-separated
    token, and fits a binary ``CountVectorizer`` + ``LogisticRegression``
    pipeline on an 80/20 train/test split (the split is seeded with
    ``random_state=0``).

    Parameters
    ----------
    i : int
        Sample size expressed in thousands of rows.
    csv_path : str, default 'Reviews.csv'
        CSV file containing at least the ``Text`` and ``Score`` columns.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        behaviour (a different sample on every call).

    Returns
    -------
    tuple
        ``((n_samples, accuracy_test), (n_samples, accuracy_train))`` where
        ``n_samples == i * 1000`` and both scores are balanced accuracies.
    """
    n_samples = i * 1000

    # This function is invoked once per sample size, so only the two columns
    # actually used are parsed to keep each call's load time and memory down.
    reviews = pd.read_csv(csv_path, usecols=['Text', 'Score']).sample(
        n_samples, random_state=random_state)

    lemmatizer = WordNetLemmatizer()

    def lemmatize_review(text):
        # Lower-case, split on whitespace, lemmatize each token, re-join.
        return ' '.join(lemmatizer.lemmatize(token) for token in text.lower().split())

    X_lemmatized = reviews['Text'].apply(lemmatize_review)
    y = reviews['Score']
    X_train, X_test, y_train, y_test = train_test_split(
        X_lemmatized, y, test_size=0.2, random_state=0)

    pipeline = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                         ('model', LogisticRegression(max_iter=1000))])
    pipeline.fit(X_train, y_train)

    accuracy_test = balanced_accuracy_score(y_test, pipeline.predict(X_test))
    accuracy_train = balanced_accuracy_score(y_train, pipeline.predict(X_train))
    return (n_samples, accuracy_test), (n_samples, accuracy_train)

In [None]:
# One worker per CPU reported by joblib.
num_processors = cpu_count()
# Run train(i) for i = 1 .. 568, i.e. sample sizes of 1,000 up to 568,000 rows.
# tqdm wraps the task iterator, so the bar advances as tasks are consumed.
results = Parallel(n_jobs=num_processors)(map(delayed(train), tqdm(range(1, 569))))
print("All processes are done")

In [None]:
# Height of the constant green reference line drawn on the plot.
# NOTE(review): presumably the score of an earlier experiment — confirm its origin.
PREVIOUS_ACCURACY = 0.7982

# Each entry of `results` is ((n_samples, accuracy_test), (n_samples, accuracy_train)).
x_list_test = [test_point[0] for test_point, train_point in results]
y_list_test = [test_point[1] for test_point, train_point in results]
x_list_train = [train_point[0] for test_point, train_point in results]
y_list_train = [train_point[1] for test_point, train_point in results]
x_list_previous = list(x_list_test)
y_list_previous = [PREVIOUS_ACCURACY] * len(x_list_test)

# Build the table in one go instead of appending row by row with `.loc`:
# that was quadratic in the number of rows and could upcast `sample_size`
# to float. The first row (0, 0.0, 1.0) is the placeholder row the CSV
# has always started with; it is kept so the file layout is unchanged.
data = {
    'sample_size': [0] + x_list_test,
    'accuracy_test': [0.0] + y_list_test,
    'accuracy_train': [1.0] + y_list_train,
}
df_to_save = pd.DataFrame(data)

df_to_save.to_csv('results_AI.csv', index=False)
print('CSV file "results_AI.csv" has been updated successfully.')

# Learning curves: test (red), train (blue) and the constant reference (green).
fig, ax = plt.subplots()
ax.plot(x_list_test, y_list_test, 'r', label='test')
ax.plot(x_list_train, y_list_train, 'b', label='train')
ax.plot(x_list_previous, y_list_previous, 'g', label='previous')

ax.set_xlabel('Sample Size')
ax.set_ylabel('Mean Accuracy')
ax.legend()

fig.savefig('graph.png')

In [11]:
def topic(csv_path='Reviews.csv', n_samples=568000, n_components=20, random_state=None):
    """Train and score a classifier that works on NMF topic features.

    Reads the ``Text`` and ``Score`` columns of ``csv_path``, draws
    ``n_samples`` rows at random, lower-cases and lemmatizes every
    whitespace-separated token, and fits a pipeline of binary
    ``CountVectorizer`` -> ``NMF`` -> ``LogisticRegression`` on an 80/20
    train/test split (the split is seeded with ``random_state=0``).

    Parameters
    ----------
    csv_path : str, default 'Reviews.csv'
        CSV file containing at least the ``Text`` and ``Score`` columns.
    n_samples : int, default 568000
        Number of rows drawn from the file.
    n_components : int, default 20
        Number of NMF components (topics) fed to the classifier.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        behaviour (a different sample on every call).

    Returns
    -------
    tuple
        ``(accuracy_test, accuracy_train, pipeline)``: the balanced accuracy
        on the test split, the balanced accuracy on the training split, and
        the fitted pipeline (steps ``'vectorizer'``, ``'nmf'``, ``'model'``).
    """
    # Only the two columns used below are parsed; sampling picks rows, so
    # the selection is unaffected by which columns are loaded.
    reviews = pd.read_csv(csv_path, usecols=['Text', 'Score']).sample(
        n_samples, random_state=random_state)

    lemmatizer = WordNetLemmatizer()

    def lemmatize_review(text):
        # Lower-case, split on whitespace, lemmatize each token, re-join.
        return ' '.join(lemmatizer.lemmatize(token) for token in text.lower().split())

    X_lemmatized = reviews['Text'].apply(lemmatize_review)
    y = reviews['Score']
    X_train, X_test, y_train, y_test = train_test_split(
        X_lemmatized, y, test_size=0.2, random_state=0)

    pipeline = Pipeline([('vectorizer', CountVectorizer(binary=True)),
                         ('nmf', NMF(n_components=n_components)),
                         ('model', LogisticRegression(max_iter=1000))])
    pipeline.fit(X_train, y_train)

    accuracy_test = balanced_accuracy_score(y_test, pipeline.predict(X_test))
    accuracy_train = balanced_accuracy_score(y_train, pipeline.predict(X_train))
    return accuracy_test, accuracy_train, pipeline

In [None]:
# topic() returns (accuracy_test, accuracy_train, pipeline).
topic_model_results = topic()
# Rebinds `pipeline` to the fitted topic-model pipeline (last element).
pipeline = topic_model_results[-1]
# Show the test score followed by the train score.
print(*topic_model_results[:2])

In [None]:
def print_words_in_topics(nmf, vectorizer, n_top_words=6):
    """Print the highest-weighted vocabulary words of every topic.

    For each row of ``nmf.components_`` a ``Topic <idx>`` header is printed,
    followed by the ``n_top_words`` words with the largest weights (one per
    line, from the smallest to the largest weight of the selection) and a
    blank line. One additional blank line is printed after the last topic.

    Parameters
    ----------
    nmf : object
        Fitted decomposition exposing ``components_`` (one row per topic).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()``.
    n_top_words : int, default 6
        Number of words shown per topic; must be at least 1.

    Raises
    ------
    ValueError
        If ``n_top_words`` is smaller than 1.
    """
    # A slice of ``[-0:]`` would select the whole vocabulary, so reject
    # non-positive values explicitly.
    if n_top_words < 1:
        raise ValueError("n_top_words must be a positive integer")

    vocabulary = vectorizer.get_feature_names_out()
    for topic_idx, weights in enumerate(nmf.components_):
        print(f"Topic {topic_idx}")
        # argsort is ascending, so the trailing entries carry the largest weights.
        for word_idx in weights.argsort()[-n_top_words:]:
            print(vocabulary[word_idx])
        print()
    print()
    
# Show the top words per topic; `pipeline` must be the fitted topic-model
# pipeline (it needs the 'nmf' and 'vectorizer' steps).
print_words_in_topics(pipeline.named_steps['nmf'], pipeline.named_steps['vectorizer'])