In [3]:
import json
import string
import time
import gc
import glob
from io import StringIO

import joblib
import numpy as np
import pandas as pd
from IPython.display import display
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

pd.set_option('future.no_silent_downcasting', True)
verbose = True

In [4]:
# Rebuild the full review dataset from the per-chunk CSV files and persist a
# single combined copy next to them.
chunk_files = sorted(glob.glob('files/review_chunks/zip_cleaned_data_0*.csv'))

# Fail with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the glob matches nothing.
if not chunk_files:
    raise FileNotFoundError(
        "No chunk files matched 'files/review_chunks/zip_cleaned_data_0*.csv'"
    )

# ignore_index=True gives the combined frame one continuous 0..n-1 index.
zip_df = pd.concat((pd.read_csv(file) for file in chunk_files), ignore_index=True)

# index=False keeps the saved file to the data columns only; otherwise the
# RangeIndex is written as an extra unnamed column that shows up on reload.
zip_df.to_csv('files/zip_cleaned_data_FULL.csv', index=False)

In [7]:
# Sanity-check the combined frame: dimensions first, then the label balance.
# NOTE(review): the recorded output lists only label 1; the classifiers
# trained later need at least two classes, so confirm the loaded chunks
# contain every label.
frame_shape = zip_df.shape
print(frame_shape)

label_counts = zip_df['label'].value_counts()
print(label_counts)

(2100000, 3)
label
1    2100000
Name: count, dtype: int64


In [None]:
# Structure of the dataframe should be [text, rating, label].
# Vectorize the review text with TF-IDF.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in a later cell, so its vocabulary and IDF weights also see the
# held-out documents -- consider fitting on the training rows only.
vizer = TfidfVectorizer()

# An empty text field read back from CSV arrives as NaN, which TfidfVectorizer
# rejects as an invalid document. Blank those out instead of dropping the rows
# so x_text stays aligned with the rating and label columns.
x_text = vizer.fit_transform(zip_df['text'].fillna(''))

if verbose:
    print(x_text.shape)

In [None]:
# Assemble the model inputs: TF-IDF text features plus the rating column.

# Ratings as a column vector of shape (n_rows, 1) so they can be stacked
# next to the text matrix.
rate_feature = zip_df['rating'].values.reshape((-1, 1))

# Target labels.
y = zip_df['label']

# Feature matrix: vectorized text first, rating appended as the last column.
X = hstack((x_text, rate_feature))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# (and the seeded model below) repeatable across runs.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Regularisation strengths (C) to try for both model families.
c_params = [1.0, 0.7, 0.5, 0.3]

# One factory per model family, keyed by the prefix used to build the model's
# name (e.g. 'clf_svm1.0'). Dict order fixes the training order: SVM first,
# then logistic regression, for every C value.
model_factories = {
    # Seeded so repeated runs of the comparison give the same SVM.
    'clf_svm': lambda c: LinearSVC(C=c, random_state=RANDOM_STATE),
    'clf_log': lambda c: LogisticRegression(C=c, max_iter=1000),
}

clf_names = []  # model names in training order; the evaluation cell iterates this
models = {}     # model name -> fitted estimator

# Train the thing: try different C values on SVM and Logistic Regression.
for c in c_params:
    for prefix, make_model in model_factories.items():
        name = prefix + str(c)
        clf_names.append(name)
        models[name] = make_model(c)

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long fit.
        t0 = time.perf_counter()
        if verbose:
            print(f'Training {name}')
        # actual training
        models[name].fit(X_train, y_train)
        if verbose:
            print(f'finished after {time.perf_counter() - t0} seconds')


if verbose:
    print('Finished training')

In [None]:
# Score every trained model on the held-out split, in training order.
for name in clf_names:
    fitted_model = models[name]
    y_pred = fitted_model.predict(X_test)
    report_text = classification_report(y_test, y_pred)
    # One call, newline-separated: header line followed by the report table.
    print(f'Classification report for {name}', report_text, sep='\n')

In [None]:
# Nothing really beat the SVM with C=1.0,
# so I'm going to save that model and the fitted TF-IDF vectorizer.
# The vectorizer is important because the model was trained on its output;
# any new words it hasn't seen might give me garbage.
# joblib.dump(models['clf_svm1.0'], 'models/support_svm.pkl')
# joblib.dump(models['clf_log1.0'], 'models/support_log.pkl')
# joblib.dump(models['clf_for'], 'models/support_for.pkl')
# joblib.dump(vizer, 'models/support_vectorizer.pkl')