# Challenge
Transform this regression problem into a binary classifier and clean up the feature set. You can choose whether or not to include nutritional information, but try to cut your feature set down to the 30 most valuable features.

When you've finished that, also take a moment to think about bias. Is there anything in this dataset that makes you think it could be biased, perhaps extremely so?

There is. Several things in fact, but most glaringly is that we don't actually have a random sample. It could be, and probably is, that the people more likely to choose some kinds of recipes are more likely to give high reviews.

In [0]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import svm
from sklearn.model_selection import cross_val_score
import time
import warnings
warnings.filterwarnings(
    action="ignore")

In [0]:
df = pd.read_csv("https://www.dropbox.com/s/us8f0ltbaxgllw3/epi_r.csv?dl=1")

In [3]:
df.head(5)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
df.rating.describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [5]:
 set(['rating', 'title'])

{'rating', 'title'}

In [0]:
X = df[list(set(list(df.columns)) - set(['rating', 'title']))]
y = df['rating']

In [7]:
# Classify high ratings (>4) and low ratings (<4)
y_class = y.copy()
y_class[y_class < 4] = 0
y_class[y_class > 4] = 1
y_class.value_counts()

1.0    10738
0.0     9314
Name: rating, dtype: int64

In [8]:
#Reduce features
X.describe()

Unnamed: 0,lamb chop,cambridge,custard,easter,onion,coffee grinder,sauce,rhode island,salsa,grill/barbecue,...,muffin,thanksgiving,ginger,garlic,brown rice,cookie,gourmet,leek,yuca,mustard
count,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,0.002244,0.000199,0.0001,0.01167,0.11161,0.000848,0.053112,0.000399,0.000299,0.055057,...,0.000249,0.072711,0.057151,0.081937,0.001147,0.003441,0.331538,0.013266,0.000299,0.029922
std,0.047321,0.014123,0.009987,0.107397,0.314894,0.029105,0.224262,0.019971,0.017296,0.228097,...,0.015789,0.259668,0.232137,0.274275,0.033849,0.058561,0.470778,0.114412,0.017296,0.170377
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
X_null = X.isnull().sum()
null_cols = list(X_null[X_null > 0].index)
print ("We drop {} columns due to NaNs\n{}".format(len(null_cols), null_cols))
X_feat = X.drop(columns = null_cols)

We drop 4 columns due to NaNs
['protein', 'sodium', 'calories', 'fat']


In [10]:
low_means = []
for col in X_feat.columns:
    if X_feat[col].mean() < 0.1:
        low_means.append(col)
print ("We drop {} columns because they have an insignificant mean".format(len(low_means)))
X_feat = X_feat.drop(columns = low_means)

We drop 648 columns because they have an insignificant mean


In [11]:
X.describe()

Unnamed: 0,lamb chop,cambridge,custard,easter,onion,coffee grinder,sauce,rhode island,salsa,grill/barbecue,...,muffin,thanksgiving,ginger,garlic,brown rice,cookie,gourmet,leek,yuca,mustard
count,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,0.002244,0.000199,0.0001,0.01167,0.11161,0.000848,0.053112,0.000399,0.000299,0.055057,...,0.000249,0.072711,0.057151,0.081937,0.001147,0.003441,0.331538,0.013266,0.000299,0.029922
std,0.047321,0.014123,0.009987,0.107397,0.314894,0.029105,0.224262,0.019971,0.017296,0.228097,...,0.015789,0.259668,0.232137,0.274275,0.033849,0.058561,0.470778,0.114412,0.017296,0.170377
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
start = time.time()
svc_model = svm.SVC()
fit = svc_model.fit(X_feat, y_class)
y_pred = svc_model.predict(X_feat)
print ("Runtime: %0.2f seconds" % (time.time() - start))

Runtime: 70.51 seconds


In [13]:
start = time.time()
svc_score = svc_model.score(X_feat, y_class)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Baseline Score: %0.3f" % (svc_score))

Runtime: 29.44 seconds
Baseline Score: 0.579


In [14]:
start = time.time()
svc_iter_model = svm.SVC(C = 100)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Iterated Score: %0.3f" % (svc_score))

Runtime: 0.00 seconds
Iterated Score: 0.579


In [0]:
# Add nutritional information
X_nut = df[list(set(list(df.columns)) - set(['rating', 'title']))]
X_nut.drop(columns = low_means, inplace = True)
X_nut = X_nut.fillna(X_nut.median())

In [16]:
majority_class_prediction = np.ones(y_class.shape)

from sklearn.metrics import accuracy_score

accuracy_score(y_class, majority_class_prediction)

0.5355076800319171

In [17]:
svc_iter_cross_val_scores = cross_val_score(svc_iter_model, X_nut, y_class, cv=5)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Mean Accuracy: %0.3f (+/- %0.3f)" % (svc_iter_cross_val_scores.mean(), svc_iter_cross_val_scores.std()))

Runtime: 534.87 seconds
Mean Accuracy: 0.603 (+/- 0.007)


We beat the majority class classifier by 0.07 accuracy. However, the dataset is biased since people are more likely to choose recipes that are more likely to give high reviews. In order to create a more accurate model, we will need to account for that bias.