In [5]:
import pandas as pd
import numpy as np
import os
import glob
import duckdb as db

from helpers import load_full_df, delete_columns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

CSV_DIR_STRING = 'data/csv_files/'
JSON_DIR_STRING = 'data/json_files/'
TARGET_COL_NAME = 'label'

In [6]:
df = load_full_df(CSV_DIR_STRING, JSON_DIR_STRING)
delete_columns(df, columns_to_delete=['id_x', 'id_y', 'Unnamed: 0'])

In [7]:
df['vine'] = df['vine'].apply(lambda x: True if x == 'Y' else False)
df['verified_purchase'] = df['verified_purchase'].apply(lambda x: True if x == 'Y' else False)

In [18]:
df.head()

Unnamed: 0,product_id,product_parent,product_title,vine,verified_purchase,review_headline,review_body,review_date,marketplace_id,product_category_id,country_marketplace,category_name,label
0,B001N2MZT8,903886718,Green Zone [DVD],False,True,green zone,I found at first it was a little difficult to ...,2010-11-15,1,3,UK,Video DVD,False
1,B005ZC7BLO,501966104,Bones - Season 7 [DVD],False,True,Bones Season 7,If you already like Bones this may not be quit...,2012-12-12,1,3,UK,Video DVD,False
2,B000NTPCHE,671553677,To Catch A Thief [DVD] [1955],False,False,Lovely film,"I've always enjoyed this film, spectacular set...",2012-04-30,1,3,UK,Video DVD,True
3,B008BR79C6,410760130,Captain America [Blu-ray] [Region Free],False,True,Best of the Rest,"Surprised how much I enjoyed this movie, Capt ...",2014-03-12,1,3,UK,Video DVD,False
4,B007I1QUYE,38597042,Spider-Man Trilogy [Blu-ray] [Region Free],False,True,AWESOME 3D,"WHAT A FANTASTIC MOVIE, WOW 3D IS AWESOME",2014-10-06,1,3,UK,Video DVD,False


In [13]:
df['review_date'] = df['review_date'].astype(str)

In [17]:
df['review_date'].min()

'1997-01-02'

In [19]:
qad_df = df.copy()
delete_columns(qad_df, columns_to_delete=['country_marketplace', 'category_name', 'product_id', 'product_title', 'review_headline', 'review_body', 'review_date'])

In [20]:
qad_df.head()

Unnamed: 0,product_parent,vine,verified_purchase,marketplace_id,product_category_id,label
0,903886718,False,True,1,3,False
1,501966104,False,True,1,3,False
2,671553677,False,False,1,3,True
3,410760130,False,True,1,3,False
4,38597042,False,True,1,3,False


In [21]:
y = qad_df.pop('label')
X = qad_df

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [23]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred = clf.predict(X_test)

In [25]:
accuracy_score(y_test, y_pred)

0.605735896627797

In [27]:
test_df = pd.read_csv('test_hidden.csv')
validation_df = pd.read_csv('validation_hidden.csv')

In [28]:
test_df['vine'] = test_df['vine'].apply(lambda x: True if x == 'Y' else False)
test_df['verified_purchase'] = test_df['verified_purchase'].apply(lambda x: True if x == 'Y' else False)
validation_df['vine'] = validation_df['vine'].apply(lambda x: True if x == 'Y' else False)
validation_df['verified_purchase'] = validation_df['verified_purchase'].apply(lambda x: True if x == 'Y' else False)

In [30]:
test_df = test_df[['product_parent', 'vine', 'verified_purchase', 'marketplace_id', 'product_category_id']]

In [31]:
validation_df = validation_df[['product_parent', 'vine', 'verified_purchase', 'marketplace_id', 'product_category_id']]

In [32]:
y_pred_test = clf.predict(test_df)

In [33]:
y_pred_val = clf.predict(validation_df)

In [40]:
y_pred_test = y_pred_test.astype(str)
y_pred_val = y_pred_val.astype(str)

In [42]:
np.savetxt("quick_and_dirty_test.txt", y_pred_test, delimiter=",", fmt='%s')
np.savetxt("quick_and_dirty_val.txt", y_pred_val, delimiter=",", fmt='%s')