In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, make_scorer
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('data_part1.csv')

In [3]:
data

Unnamed: 0.1,Unnamed: 0,id,gender,season,age,tripletsOfMonths,commonRed,commonGreen,commonBlue
0,0,15970,Men,Fall,2,4,254,254,90
1,1,39386,Men,Summer,2,3,40,53,59
2,2,59263,Women,Winter,2,1,234,234,234
3,3,21379,Men,Fall,2,4,50,50,52
4,4,53759,Men,Summer,2,3,0,0,0
...,...,...,...,...,...,...,...,...,...
44441,44441,17036,Men,Summer,2,3,241,242,234
44442,44442,6461,Men,Summer,2,3,223,220,213
44443,44443,18842,Men,Fall,2,4,144,191,221
44444,44444,46694,Women,Spring,2,2,253,253,253


In [4]:
train, test = train_test_split(data)

In [5]:
gender_encoder = LabelEncoder()
genders = np.unique(data['gender'])
gender_encoder.fit(genders)
gender_encoder.transform(['Men'])

array([2], dtype=int64)

In [6]:
X = train[['age', 'tripletsOfMonths', 'commonRed', 'commonGreen' ,'commonBlue']]
Y = train['gender'].values
Y_transform = gender_encoder.transform(Y)

In [9]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
param_grid = [
    {
        'estimator': [LinearRegression()]
    },
    {
        'estimator':[KNeighborsRegressor()],
        'estimator__n_neighbors': [10, 50, 100],
    },
    {
        'estimator': [DecisionTreeRegressor()],
        'estimator__max_depth': [5, 10, 100],
    }
]

for metric in [r2_score, mean_squared_error, explained_variance_score]:
    pipe = Pipeline(steps=[('estimator', LinearRegression())])
    grid = GridSearchCV(pipe, param_grid, scoring=make_scorer(metric))
    
    grid.fit(X,Y_transform)
    
    print(grid.best_score_)
    print(grid.best_params_)

0.2515066653966933
{'estimator': DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=10,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'), 'estimator__max_depth': 10}
1.2790276395667655
{'estimator': DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=100,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best'), 'estimator__max_depth': 100}
0.25231524555451185
{'estimator': DecisionTreeRegressor(ccp_alpha=0.0, crite

#### Summary

Looks like DecisionTreeRegressor of max depth 10 performed the best out of the three compared. For scores, R^2 and Explained Variance Score are almost the same with Explained Variance having a little bit higher. This could be because it uses biased variance. 

Mean squared Error on the other hand had a value of ~1.28 which isn't too bad compared to the scores of the other metrics in respect of whats considered 'good'. 

Since DecisionTreeRegressor performed the best, I will try to improve those scores using that model.

In [30]:
tree_param_grid = {
    'max_depth': [1, 5, 10, 50, 100, 300],
    'max_features': list(range(1,len(X.columns))),
    'min_samples_leaf': [5, 10, 50, 75, 100, 300],
    'min_samples_split': list(range(2,5)),
}

estimators = []

for metric in [r2_score, mean_squared_error, explained_variance_score]:
    grid_tree = GridSearchCV(DecisionTreeRegressor(), tree_param_grid, scoring=make_scorer(metric))
    
    grid_tree.fit(X, Y_transform)
    estimators.append(('DecisionTree', grid_tree.best_estimator_))
    print(grid_tree.best_params_)
    print(grid_tree.best_score_)

{'max_depth': 10, 'max_features': 4, 'min_samples_leaf': 75, 'min_samples_split': 2}
0.26574687836630156
{'max_depth': 1, 'max_features': 1, 'min_samples_leaf': 5, 'min_samples_split': 2}
1.1137710980707893
{'max_depth': 100, 'max_features': 4, 'min_samples_leaf': 100, 'min_samples_split': 2}
0.2666691875474271


Well, I managed to improve improve the R^2 by around ~.015, the Explained Variance by ~.013 and the MSE by ~.16, so not that much overall.

It is interesting that the best parameters do not mach for each metric. 

For R^2, a tree of depth 10, Max Features (the number of variables allowed to use to predict values) of 4, min sample leaf (the lowest its allowed to go for a leaf) of 75, and min samples split (min samples required to split an undetermined node) of 4. 

Explained Variance had almost the same parameters except for a max depth of 300 and a min sample split of 2. The max depth is interesting since it is really far off from the R^2's best params. 

MSE had the simpliest parameters of 1 for max depth and max features, 3 for min sample leaf, and 4 for min samples split. The theory behind the smaller values could be the graph was taking into account too much noise. As I learned from doing PCA, most of the data lives in 1 dimension making a single cut pretty effective. 

Will try to improve some more with an ensomble 

In [25]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor

In [36]:
ensemble_params = [
    {
        # bagger
        'estimator': [RandomForestRegressor()],
        'estimator__n_estimators': [50, 100, 300, 1000],
        'estimator__max_depth': [5, 50, 100, 300],
        'estimator__max_features': [2, 3, 4],
        'estimator__min_samples_leaf': [50, 100, 300],
        'estimator__n_jobs':[-1]
    }
 #   {
        # takes wayyyyy to long to run all of them, just test the bagger 
#         # booster
#         'estimator': [GradientBoostingRegressor()],
#         'estimator__learning_rate': [.01, .001],
#         'estimator__n_estimators': [50, 100, 300, 1000],
#         'estimator__min_samples_leaf': [50, 100, 300],
#         'estimator__max_depth': [5, 50, 100, 300],
#         'estimator__max_features': [2, 3, 4]
#     },
#     {
#         # Voter
#         'estimator':[VotingRegressor(estimators=estimators)],
#         'estimator__n_jobs': [-1]
#     }
]
for metric in [r2_score, mean_squared_error, explained_variance_score]:
    pipe = Pipeline(steps=[('estimator', RandomForestRegressor())])
    
    grid_ensemble = GridSearchCV(pipe, ensemble_params, scoring=make_scorer(metric))
    
    #sorry computer for what im about to do to you
    grid_ensemble.fit(X, Y_transform)
    
    print(grid_ensemble.best_score_)
    print(grid_ensemble.best_params_)


0.2797269738130592
{'estimator': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=300, max_features=4, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=50,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False), 'estimator__max_depth': 300, 'estimator__max_features': 4, 'estimator__min_samples_leaf': 50, 'estimator__n_estimators': 1000, 'estimator__n_jobs': -1}
0.9012617713983566
{'estimator': RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=5, max_features=2, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=300,
            

#### Summary
    
Looks like I was able, to no surprise, increase the scores a little more using an Ensemble method like RandomForest

R^2 and Explined Varince are almost exactly the same which is suppose to happen as MSE gets smaller, which it has. 

MSE got to ~.9 which is interesting but could be misleading. For example, Since the data is is so dense in a small space, as I confirmed using PCA, its easy to make a relative guess as to what the value is suppose to be but being accurate can be really hard. This is why the value is relatively good but will probably not get much better. 

It is worth noting that the max depth parameter, for the best estimator, for each metric, is almsot completly opposite as to what it was for a single DecisionTree. I would expect it to be lower as more models are being used. It may be because the model is overfitting. For R^2, 1000 estimators of depth 300, seems like a lot. But then again, there is a lot of data. 