# Prediction with various Machine Learning Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn import *



In [3]:
# Reading in pre-processed data
train = pd.read_csv('clean_train.csv', encoding="ISO-8859-1")

In [4]:
train.drop(labels=['shop_name','item_name','item_category_name'], axis=1, inplace=True)

In [5]:
# Reading in Test data (data whose sales amount is what we need to predict)
test = pd.read_csv('clean_test.csv', encoding="ISO-8859-1")

In [6]:
test.drop(labels=['shop_name','item_name','item_category_name'], axis=1, inplace=True)

In [7]:
col = [c for c in train.columns if c not in ['item_cnt_month','ID']]

#Validation
x1 = train[train['date_block_num']<33]
y1 = np.log1p(x1['item_cnt_month'].clip(0.,20.))
x1 = x1[col]

x2 = train[train['date_block_num']==33]
y2 = np.log1p(x2['item_cnt_month'].clip(0.,20.))
x2 = x2[col]

## Linear Regression

In [7]:
lin = linear_model.LinearRegression()
lin.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),lin.predict(x2).clip(0.,20.))))

RMSE: 0.400294496205


In [14]:
lin.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = lin.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('linear_reg.csv', index=False)

## Extra Trees Regressor

In [16]:
extratrees = ensemble.ExtraTreesRegressor(n_estimators=25, n_jobs=-1, max_depth=15, random_state=18)
extratrees.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),extratrees.predict(x2).clip(0.,20.))))

RMSE: 0.298586931919


In [17]:
extratrees.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = extratrees.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('extra_trees.csv', index=False)

## Lasso LARS

In [18]:
lassolars = linear_model.LassoLars(alpha=0.01)
lassolars.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),lassolars.predict(x2).clip(0.,20.))))

RMSE: 0.437092029485


In [19]:
lassolars.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = lassolars.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('lasso_lars.csv', index=False)

## SGD Regressor

In [15]:
sgd_reg = linear_model.SGDRegressor(random_state=42)
sgd_reg.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),sgd_reg.predict(x2).clip(0.,20.))))



RMSE: 7.81039432528


In [16]:
sgd_reg.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = sgd_reg.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('sgd_reg.csv', index=False)



## Passive Aggressive Regressor

In [17]:
pass_agg_reg = linear_model.PassiveAggressiveRegressor(random_state=42)
pass_agg_reg.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),pass_agg_reg.predict(x2).clip(0.,20.))))



RMSE: 0.469601300906


In [18]:
pass_agg_reg.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = pass_agg_reg.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('pass_agg_reg.csv', index=False)



## Decision Tree Regressor

In [8]:
dec_tree = tree.DecisionTreeRegressor(random_state=42, max_depth=3)
dec_tree.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),dec_tree.predict(x2).clip(0.,20.))))

RMSE: 0.313631350404


In [9]:
dec_tree.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = dec_tree.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('dec_tree.csv', index=False)

## AdaBoost Regressor

In [12]:
adaboost = ensemble.AdaBoostRegressor(tree.DecisionTreeRegressor(max_depth=3),
                          n_estimators=100, random_state=42)
adaboost.fit(x1,y1)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2.clip(0.,20.),adaboost.predict(x2).clip(0.,20.))))

RMSE: 0.335047754266


In [13]:
adaboost.fit(train[col],train['item_cnt_month'].clip(0.,20.))
test['item_cnt_month'] = adaboost.predict(test[col]).clip(0.,20.)
test[['ID','item_cnt_month']].to_csv('adaboost.csv', index=False)