In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
%matplotlib inline

In [2]:
df_raw = pd.read_csv('epi_r.csv', low_memory=False)
df_raw.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df_raw['rating'].median()

4.375

In [4]:
df = df_raw.drop('title', 1)

df['rating'] = np.where(df['rating'] >= 3.71, 1, 0)
df = df.dropna()

In [5]:
X = df.drop('rating', 1)
y = df.rating

rfc = ensemble.RandomForestClassifier(100)
rfc.fit(X, y)
variables = sorted(zip(map(lambda x: round(x, 4), rfc.feature_importances_), X.columns), 
             reverse=True)

In [6]:
title_var = []
for titles in variables[:30]:
    title_var.append(titles[1])

In [7]:
new_X = df[title_var]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.3)

In [9]:
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
print('R2 Train Score: ', svc.score(X_train, y_train))

print('Cross Validation :', cross_val_score(svc, X_train, y_train, cv=5))

R2 Train Score:  0.9580331412103746
Cross Validation : [0.81728173 0.81773177 0.82162162 0.81621622 0.81711712]


In [16]:
y_pred = svc.predict(X_test)
print(f1_score(y_test, y_pred))

y_pred2 = svc.predict(X_train)
print(f1_score(y_train, y_pred2))

0.9044013251301467
0.9744909130720385


In [14]:
print('R2 Test Score: ', svc.score(X_test, y_test))

print('Cross Validation :', cross_val_score(svc, X_test, y_test, cv=5))

R2 Test Score:  0.8302521008403362
Cross Validation : [0.81951731 0.82056663 0.81092437 0.82229232 0.82439537]


In [17]:
nutrition = new_X.drop(['calories', 'sodium', 'fat', 'protein'], 1)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(nutrition, y, test_size=0.3)

In [19]:
svc.fit(X_train, y_train)

print('R2 Train Score: ', svc.score(X_train, y_train))

print('Cross Validation :', cross_val_score(svc, X_train, y_train, cv=5))

R2 Train Score:  0.8101585014409222
Cross Validation : [0.81053105 0.80594327 0.809095   0.81441441 0.81081081]


In [20]:
print('R2 Test Score: ', svc.score(X_test, y_test))

print('Cross Validation :', cross_val_score(svc, X_test, y_test, cv=5))

R2 Test Score:  0.8153361344537815
Cross Validation : [0.81427072 0.81951731 0.81512605 0.81808623 0.81913775]


In [21]:
y_pred = svc.predict(X_test)
print(f1_score(y_test, y_pred))

y_pred2 = svc.predict(X_train)
print(f1_score(y_train, y_pred2))

0.8969640135974681
0.8936856969941497


In [22]:
nutr = new_X[['calories', 'sodium', 'fat', 'protein']]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(nutr, y, test_size=0.3)

In [24]:
svc.fit(X_train, y_train)

print('R2 Train Score: ', svc.score(X_train, y_train))

print('Cross Validation :', cross_val_score(svc, X_train, y_train, cv=5))

R2 Train Score:  0.9893731988472623
Cross Validation : [0.82440342 0.82530392 0.82125169 0.82530392 0.82162162]


In [25]:
print('R2 Test Score: ', svc.score(X_test, y_test))

print('Cross Validation :', cross_val_score(svc, X_test, y_test, cv=5))

R2 Test Score:  0.8325630252100841
Cross Validation : [0.81322141 0.81722689 0.81722689 0.81302521 0.81598318]


In [27]:
y_pred = svc.predict(X_test)
print('F1 Test Score: ', f1_score(y_test, y_pred))

y_pred2 = svc.predict(X_train)
print('F1 Train Score: ',f1_score(y_train, y_pred2))

F1 Test Score:  0.9053556584728655
F1 Train Score:  0.9934495392472521


Would use the third and final model as it has the highest F1 test score with the fewest number of features.