In [1]:
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
df = pd.read_csv('train.csv')

In [3]:
features = list(df.columns[range(668)+range(673,676)])

In [4]:
features_trigram_removed = []
for feature in features:
    if len(feature.split('_')) != 3:
        features_trigram_removed.append(feature)

In [5]:
len(features_trigram_removed)

552

In [6]:
df_X = df.loc[:, features_trigram_removed]

In [7]:
df_y = df.loc[:, ['IsFoodGood', 'IsServiceGood', 'IsAmbianceGood', 'IsDealsGood','IsPriceGood']]

In [8]:
df_X

Unnamed: 0,back_try,not_good,about_place,portions,come_back,the_food,really_like,food_just,good_thing,ingredients,...,tequila,cheap,reasonably_priced,service_good,really_liked,chinese_food,will_definitely,IsRatingBad,IsRatingModerate,IsRatingGood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
df_y

Unnamed: 0,IsFoodGood,IsServiceGood,IsAmbianceGood,IsDealsGood,IsPriceGood
0,0,1,0,0,1
1,0,0,1,1,0
2,0,0,0,0,0
3,0,0,1,0,1
4,0,1,0,0,1
5,1,0,0,0,0
6,1,0,0,0,0
7,0,0,0,0,0
8,0,1,0,1,0
9,0,0,0,0,0


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [11]:
classifier_DTC = DecisionTreeClassifier()
classifier_RF = RandomForestClassifier()
classifier_KNN = KNeighborsClassifier(n_neighbors=5)

In [12]:
classifier_DTC = classifier_DTC.fit(df_X, df_y)

In [13]:
classifier_RF = classifier_RF.fit(df_X, df_y)

In [14]:
classifier_KNN = classifier_KNN.fit(df_X, df_y)

In [16]:
df_test = pd.read_csv('test.csv')

In [18]:
test_X = df_test.loc[:, features_trigram_removed]
test_y = df_test.loc[:, ['IsFoodGood', 'IsServiceGood', 'IsAmbianceGood','IsDealsGood', 'IsPriceGood']]

In [19]:
df_report = pd.DataFrame(columns=['algorithm', 'precision', 'recall', 'harmonic mean'])

In [20]:
result_DTC = classifier_DTC.predict(test_X)

In [21]:
import classify_helper as helper

In [22]:
report_DTC = helper.report_precision_and_recall(np.array(test_y), np.array(result_DTC), 'DecisionTree')

In [23]:
df_report = df_report.append(report_DTC, ignore_index=True)

In [24]:
result_RF = classifier_RF.predict(test_X)

In [25]:
report_RF = helper.report_precision_and_recall(np.array(test_y), np.array(result_RF), 'RandomForest')

In [26]:
df_report = df_report.append(report_RF, ignore_index=True)

In [27]:
result_KNN = classifier_KNN.predict(test_X)

In [28]:
report_KNN = helper.report_precision_and_recall(np.array(test_y), np.array(result_KNN), 'KNeighbors')

In [29]:
df_report = df_report.append(report_KNN, ignore_index=True)

In [30]:
df_report

Unnamed: 0,algorithm,precision,recall,harmonic mean
0,DecisionTree,0.598044,0.561386,0.579136
1,RandomForest,0.658206,0.503614,0.570625
2,KNeighbors,0.640136,0.400281,0.49256


In [31]:
import pickle
with open('df_report_trigram_removed.pkl', 'wb') as f:
    pickle.dump(df_report, f)