In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

In [2]:
from matplotlib import rcParams
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]

rcParams['figure.figsize'] = (8, 3)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'



In [43]:
train = pd.read_csv('modified_train.csv', parse_dates=['Day.Index'], dayfirst=True)
test = pd.read_csv('modified_test.csv', parse_dates=['Day.Index'], dayfirst=True)
data = pd.read_csv('modified_data.csv', parse_dates=['Day.Index'], dayfirst=True)
ss = pd.read_csv('samplesubmission.csv')

In [5]:
train.head()

Unnamed: 0,Day.Index,Campaign,Impressions,Target.Clicks,Obt.Click,Min.CPC,Max.CPC,CTR,Amount.Spend,Returning.Users,...,Avg.Time.Page,Avg.Position,Total.Revenue,Obtained.Leads,Year,Month,Week,Day_Of_Week,Day,Day_Of_Year
0,2013-01-01,1,13930,225,200,28,31,1.44,6200,36,...,571,5,2401,20,2013,1,1,1,1,1
1,2013-01-02,1,11177,225,288,28,31,2.58,8352,27,...,447,3,4033,18,2013,1,1,2,2,2
2,2013-01-03,1,12968,225,211,28,31,1.63,5908,31,...,322,1,4637,13,2013,1,1,3,3,3
3,2013-01-04,1,13850,225,238,28,31,1.72,6664,38,...,534,2,3597,26,2013,1,1,4,4,4
4,2013-01-05,1,12346,225,259,28,31,2.1,7511,41,...,481,8,1923,18,2013,1,1,5,5,5


In [6]:
train.columns

Index(['Day.Index', 'Campaign', 'Impressions', 'Target.Clicks', 'Obt.Click',
       'Min.CPC', 'Max.CPC', 'CTR', 'Amount.Spend', 'Returning.Users',
       'Targeted.Leads', 'SpecialDay', 'Avg. Bounce.Rate', 'Avg.Time.Page',
       'Avg.Position', 'Total.Revenue', 'Obtained.Leads', 'Year', 'Month',
       'Week', 'Day_Of_Week', 'Day', 'Day_Of_Year'],
      dtype='object')

In [7]:
def prepare_data(df, is_train):
    df_month = pd.get_dummies(df['Month'], prefix='month')
    df = pd.concat([df, df_month], axis=1)

    if is_train:
        return df.drop(['Obtained.Leads','Day.Index'], axis=1), df['Obtained.Leads']
    return df.drop(['Day.Index'], axis=1)

In [9]:
train_features, train_target = prepare_data(train, 1)

In [10]:
test_features = prepare_data(test, 0)

In [13]:
train_features.columns

Index(['Campaign', 'Impressions', 'Target.Clicks', 'Obt.Click', 'Min.CPC',
       'Max.CPC', 'CTR', 'Amount.Spend', 'Returning.Users', 'Targeted.Leads',
       'SpecialDay', 'Avg. Bounce.Rate', 'Avg.Time.Page', 'Avg.Position',
       'Total.Revenue', 'Year', 'Month', 'Week', 'Day_Of_Week', 'Day',
       'Day_Of_Year', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
       'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12'],
      dtype='object')

In [17]:
train_features.head()

Unnamed: 0,Campaign,Impressions,Target.Clicks,Obt.Click,Min.CPC,Max.CPC,CTR,Amount.Spend,Returning.Users,Targeted.Leads,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1,13930,225,200,28,31,1.44,6200,36,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,11177,225,288,28,31,2.58,8352,27,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,12968,225,211,28,31,1.63,5908,31,14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,13850,225,238,28,31,1.72,6664,38,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,12346,225,259,28,31,2.1,7511,41,18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
train_target.head()

0    20
1    18
2    13
3    26
4    18
Name: Obtained.Leads, dtype: int64

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

In [27]:

# Tuning hyper-parameters for RMSE

# Set the parameters by cross-validation
#n_range = range(100, 150, 50)
tuned_parameters = {'n_estimators': [100,150]}

rf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring='mean_squared_error')
rf.fit(train_features, train_target)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 150]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='mean_squared_error', verbose=0)

In [30]:
print (rf.grid_scores_)
print ('Average Error = ' + str(np.sqrt(np.abs(rf.grid_scores_[0][1]))))
print ('Min Error = ' + str(np.sqrt(np.abs(rf.best_score_))))
print ('Best parameters = ')
#print (str(np.sqrt(np.abs(abrf.best_params_))))
print (rf.best_estimator_)

[mean: -6.83631, std: 1.94767, params: {'n_estimators': 100}, mean: -6.79975, std: 1.89336, params: {'n_estimators': 150}]
Average Error = 2.61463470377
Min Error = 2.60763328095
Best parameters = 
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [31]:
predicted = rf.predict(test_features)

In [34]:
predicted[0:5]

array([ 16.38      ,  12.62      ,   8.35333333,   7.92      ,   7.74      ])

In [42]:
predicted = np.round(predicted, decimals=0).astype(int)
predicted[0:5]

array([16, 13,  8,  8,  8])

In [45]:
solution = pd.DataFrame(ss['Day.Index'])

In [46]:
solution = pd.concat([solution, pd.DataFrame(predicted, columns=['Obtained.Leads'])], axis=1)

In [47]:
solution

Unnamed: 0,Day.Index,Obtained.Leads
0,01-04-2015,16
1,02-04-2015,13
2,03-04-2015,8
3,04-04-2015,8
4,05-04-2015,8
5,06-04-2015,15
6,07-04-2015,15
7,08-04-2015,9
8,09-04-2015,6
9,10-04-2015,6


In [48]:
solution.to_csv('solution.csv', index=False)

In [None]:
#no