In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('data_part1.csv')

In [3]:
data

Unnamed: 0.1,Unnamed: 0,id,gender,season,age,tripletsOfMonths,commonRed,commonGreen,commonBlue
0,0,15970,Men,Fall,2,4,254,254,90
1,1,39386,Men,Summer,2,3,40,53,59
2,2,59263,Women,Winter,2,1,234,234,234
3,3,21379,Men,Fall,2,4,50,50,52
4,4,53759,Men,Summer,2,3,0,0,0
...,...,...,...,...,...,...,...,...,...
44441,44441,17036,Men,Summer,2,3,241,242,234
44442,44442,6461,Men,Summer,2,3,223,220,213
44443,44443,18842,Men,Fall,2,4,144,191,221
44444,44444,46694,Women,Spring,2,2,253,253,253


In [15]:
gender_encoder = OneHotEncoder()
genders = np.unique(data['gender'])

gender_encoder.fit([genders])
gender_encoder.categories_

[array(['Boys'], dtype=object),
 array(['Girls'], dtype=object),
 array(['Men'], dtype=object),
 array(['Unisex'], dtype=object),
 array(['Women'], dtype=object)]

In [44]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

data_train, data_test = train_test_split(data, test_size=.2)

In [26]:
# going to try to predict gender from the other categories
X = data_train[['age', 'tripletsOfMonths', 'commonRed', 'commonGreen', 'commonBlue']]
Y = data_train[['gender']].values.reshape(-1,)

In [28]:
# First going to see how KNeighbors performs 
for n in range(1, 8):
    K = KNeighborsClassifier(n_neighbors=n)
    cv = cross_val_score(K, X, Y, cv=5)
    
    print(f'Average Cross Val score with {n} neighbors is {cv.mean()}')

Average Cros Val score with 1 neighbors is 0.5608332374990883
Average Cros Val score with 2 neighbors is 0.5580768257926415
Average Cros Val score with 3 neighbors is 0.5714083038699055
Average Cros Val score with 4 neighbors is 0.5768645165311305
Average Cros Val score with 5 neighbors is 0.5813083478297459
Average Cros Val score with 6 neighbors is 0.5817300949636957
Average Cros Val score with 7 neighbors is 0.5831646268341053


In [33]:
from sklearn.tree import DecisionTreeClassifier

# Try out Decision Tree Classifier to see what we get
for n in range(1, 8):
    T = DecisionTreeClassifier(max_depth=n)
    
    cv = cross_val_score(T, X, Y, cv=5)
    
    print(f'Average cross Val score with tree of {n} depth is {cv.mean()}')

Average cross Val score with tree of 1 depth is 0.5457306793021285
Average cross Val score with tree of 2 depth is 0.6180109587974967
Average cross Val score with tree of 3 depth is 0.6220609627600516
Average cross Val score with tree of 4 depth is 0.6380356349950702
Average cross Val score with tree of 5 depth is 0.6378668625850821
Average cross Val score with tree of 6 depth is 0.640707729702821
Average cross Val score with tree of 7 depth is 0.649257614155986


In [52]:
# try optimizing params
# will also throw RandomForests in there for giggles
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
param_grid = [{
        'estimator': [KNeighborsClassifier()],
        'estimator__n_neighbors': list(range(1, 10)),
        'estimator__n_jobs':[-1]
    },
    {
        'estimator': [DecisionTreeClassifier()],
        'estimator__max_depth': list(range(1, 10))
    },
    {
        'estimator': [RandomForestClassifier()],
        'estimator__n_estimators':[10, 50, 100, 300],
        'estimator__max_depth': list(range(1, 8)),
        'estimator__n_jobs':[-1]
    }
]

In [53]:
# initilize pipeline for use with grid search
pipe = Pipeline(steps=[('estimator', KNeighborsClassifier())])

# initilize grid search
grid = GridSearchCV(pipe, param_grid)

In [54]:
# find best estimator with params
grid.fit(X,Y)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('estimator',
                                        KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             metric='minkowski',
                                                             metric_params=None,
                                                             n_jobs=None,
                                                             n_neighbors=5, p=2,
                                                             weights='uniform'))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'estimator': [KNeighborsClassifier(algorithm='auto',
                                                             leaf_size=30,
                                                             m.

In [55]:
# print best params and score
print(grid.best_params_)
print(grid.best_score_)

{'estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=7, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False), 'estimator__max_depth': 7, 'estimator__n_estimators': 100, 'estimator__n_jobs': -1}
0.6596636827020164


We can see that the best score of ~.66 comes from a RandomForestClassifier with depth 7 and 100 trees. 

The best score for KNeighbors was ~.58 with 7 neighbors. This implies that around half the time, the model was able to predict what age group the item was for, based on color and season. 

The best score for DecisionTreeClassifier was ~.65 with a tree of depth 8. A little over half the time the model was able to predict what the age was. 

Honestly, these score are actually a lot better than I thought they would be. I didn't really see much relationship between color and season, to age group. One thing to note is that the 'Age' catgory only inclues 3 values. Meaning that even with random guessing, the model could still achieve a score of around ~.33 farily easy. Also adding more range in parameters for GridSearchCV to search through could alter these scores.

For the ranking of models, RandomForestClassifier in first, DecisionTrees in second, and Kneighbors in third, I do believe this to make some sense based on their underlying theory. Since the data has little correlation, and some variables have few possible outcomes i.e 4 values for TripletOfMonths and 3 for Age, it would be easier to make accuarate "Cuts" on the data opposed to finding the nearest data points. This would lead to Decision Trees outperforming Kneighbors and Random Forest, aka a bunch of Decision Trees, outperforming a single Tree.