##Decision Tree Classifier - Optimized

In [None]:
import pandas as pd
import numpy as np

## Download the Data:

In [53]:
full_df = pd.read_json("train.json")
full_df.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [None]:
test_df = pd.read_json("test.json")
test_df.head()

### Validation: Holdout Method

In a supervised learning problem one wants to determine how good the trained model is. Before the model is trained, the data is split into a training data set and a test data set. The training data set is used to train the model. The test data set is kept apart and used to determine how good the model is.

In [55]:
#This function returns a list of 100 id's for each cusine from input dataframe.
def build_df(indf):
    
    validdf_ids=[]
    random_ids=[]
    
    for k, v in indf.groupby('cuisine'):
        l=[]
        l.append([e for e in v.id])
        for i in xrange(len(l)):
            random_ids.append(np.random.choice(l[i], size=100, replace=False))
    
    validdf_ids = [item for sublist in random_ids for item in sublist]
    
    return validdf_ids

In [56]:
validation_df=full_df[full_df.id.isin(build_df(full_df))].reset_index()
train_df=full_df[~full_df.id.isin(build_df(full_df))].reset_index()

In [57]:
print "Full Dataframe"
print full_df.count()
print
print "Validation Dataframe"
print validation_df.count()
print
print "Training Dataframe"
print train_df.count()

Full Dataframe
cuisine        39774
id             39774
ingredients    39774
dtype: int64

Validation Dataframe
index          2000
cuisine        2000
id             2000
ingredients    2000
dtype: int64

Training Dataframe
index          37774
cuisine        37774
id             37774
ingredients    37774
dtype: int64


### Ingredients Map

In [58]:
#list of all possible ingredients

ingredients_all=[]

for i in full_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])
        
for i in test_df['ingredients']:
    for j in xrange(len(i)):
        ingredients_all.append(i[j])

features=np.unique(ingredients_all) #create a feature list of unique ingrediants.

ingredients_map={v:k for k,v in enumerate(np.unique(ingredients_all).tolist())} #assign key to each ingredient

### Term-Document Matrix

In [59]:
def build_designmatrix(indf):
    
    designmatrix=np.zeros((len(indf), len(features))) 
    all_ids = indf['id'] #list of all id's from input set.
    
    #build design matrix
    for i in xrange(len(indf)):
        for j in indf['ingredients'][indf['id']==all_ids[i]]:
                for k in xrange(len(j)):
                        designmatrix[i, ingredients_map[j[k]]] = 1
                        
    return designmatrix

In [60]:
%%time
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(build_designmatrix(train_df), train_df['cuisine'])
clf_predict=clf.predict(build_designmatrix(validation_df))

CPU times: user 5min 28s, sys: 3.11 s, total: 5min 31s
Wall time: 5min 33s


In [61]:
from sklearn.metrics import accuracy_score
print "Accuracy score for DecisionTreeClassifier" 
print accuracy_score(np.array(validation_df['cuisine']),clf_predict)

Accuracy score for DecisionTreeClassifier
0.943


In [67]:
predictions_as_dataframe = test_df.join(pd.DataFrame({"Prediction DT": clf_predict}))
print predictions_as_dataframe

         id                                        ingredients       cuisine  \
0     18009  [baking powder, eggs, all-purpose flour, raisi...        french   
1     28583  [sugar, egg yolks, corn starch, cream of tarta...   southern_us   
2     41580  [sausage links, fennel bulb, fronds, olive oil...       spanish   
3     29752  [meat cuts, file powder, smoked sausage, okra,...  cajun_creole   
4     35687  [ground black pepper, salt, sausage casings, l...       italian   
5     38527  [baking powder, all-purpose flour, peach slice...        french   
6     19666             [grape juice, orange, white zinfandel]       italian   
7     41217  [ground ginger, white pepper, green onions, or...       chinese   
8     28753  [diced onions, taco seasoning mix, all-purpose...       mexican   
9     22659  [eggs, cherries, dates, dark muscovado sugar, ...       british   
10    21749  [pasta, olive oil, crushed red pepper, cherry ...       italian   
11    44967  [water, butter, ground suma

In [64]:
#Predict on test set and write the result out.
final_result=clf.predict(build_designmatrix(test_df))
test_df['cuisine']=final_result
test_df.to_csv("testsubmission.csv")

In [35]:
%%time
from sklearn import preprocessing
from sklearn.linear_model import Ridge
est = Ridge(alpha=10)

labels = train_df['cuisine']
le = preprocessing.LabelEncoder()
labels_fea = le.fit_transform(labels)
est_fit=est.fit(build_designmatrix(train_df), labels_fea)
est_predict=est_fit.predict(build_designmatrix(test_df))
labels_converted = le.inverse_transform(est_predict.astype('I'))



IndexError: index 28 is out of bounds for axis 0 with size 20

In [32]:
predictions_as_dataframe = test_df.join(pd.DataFrame({"Prediction R": labels_converted}))
print predictions_as_dataframe

ValueError: LabelEncoder was not fitted yet.