In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
%matplotlib inline
#import catboost
#from catboost import CatBoostRegressor


from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LogisticRegression
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor

import xgboost as xgb

In [2]:
X_train = pd.read_csv('/Users/pol.molinas/Downloads/X_train.csv')
Y_train = pd.read_csv('/Users/pol.molinas/Downloads/Y_train.csv')
X_test = pd.read_csv('/Users/pol.molinas/Downloads/X_test.csv')

In [3]:
test = pd.read_csv('/Users/pol.molinas/Downloads/test.csv')

In [27]:
Y_train = np.asarray(Y_train.item_cnt_month)

In [None]:
Y_train.shape

In [None]:
gb = make_pipeline(RobustScaler(), GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5))
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.5, random_state=1,max_iter=5000))
model_xgb = make_pipeline(RobustScaler(), xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1))

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1) 

# Sub 1

In [28]:
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
rf_model.fit(X_train, Y_train)
rf_test_pred = rf_model.predict(X_test)

In [None]:
param_grid = {'n_estimators':[20,50,100],'max_depth':[3,7,9]}
search_rf = GridSearchCV(RandomForestRegressor(),
                       param_grid = param_grid
                       ,cv=3)
search_rf.fit(X_train,Y_train)
search_rf.best_params_
rf_search_pred = search_rf.predict(X_test)

In [None]:
param_grid = {'penalty':['l1','l2']}
search_logreg = GridSearchCV( LogisticRegression() , param_grid = param_grid, cv=3 )
search_logreg.fit(X_train,Y_train)
search_logreg.best_params_
logreg_search_pred = search_logreg.predict(X_test)

# Sub 2

In [None]:
from sklearn.preprocessing import StandardScaler
ENet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
ENet.fit(X_train,Y_train)
enet_pred = ENet.predict(X_test)

# Sub 3

In [None]:
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
ENet = make_pipeline(StandardScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
avg = AveragingModels(models = [rf_model,ENet])
avg.fit(X_train,Y_train)
rf_enet_pred = avg.predict(X_test)

# Sub 4

In [6]:
pca = PCA()
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
pca.explained_variance_ratio_.cumsum()

array([0.85261804, 0.97450666, 0.9992311 , 0.99998641, 0.99999894,
       0.99999994, 0.99999998, 0.99999999, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        ])

In [30]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

In [None]:
xgb_model = xgb.XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0)
xgb_model.fit(X_train_pca,Y_train)
xgb_pred = xgb_model.predict(X_test_pca)

# Sub 5

In [23]:
knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1)
knn_model.fit(X_train_pca, Y_train)

knn_pred = knn_model.predict(X_test_pca)

# Sub 6

In [None]:
knn_model = KNeighborsRegressor(n_neighbors=9, leaf_size=13, n_jobs=-1)
knn_model.fit(X_train, Y_train)

knn_pred = knn_model.predict(X_test)

In [None]:
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = rf_test_pred
prediction_df.to_csv('submission.csv', index=False)
prediction_df.head(10)

In [None]:
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = rf_enet_pred
prediction_df.to_csv('submission3.csv', index=False)
prediction_df.head(10)

In [None]:
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = xgb_pred
prediction_df.to_csv('submission4.csv', index=False)
prediction_df.head(10)

In [24]:
prediction_df = pd.DataFrame(test['ID'], columns=['ID'])
prediction_df['item_cnt_month'] = knn_pred
prediction_df.to_csv('submission5.csv', index=False)
prediction_df.head(10)

Unnamed: 0,ID,item_cnt_month
0,0,0.777778
1,1,0.0
2,2,2.777778
3,3,0.0
4,4,0.0
5,5,1.888889
6,6,1.666667
7,7,0.222222
8,8,3.0
9,9,0.0
