In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.cross_validation import KFold,train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn import tree




In [2]:
datafile_train=r'C:/Users/aditya/Desktop/Shikha Desktop/Python/Project 3/counterfeit_train.csv'
datafile_test=r'C:/Users/aditya/Desktop/Shikha Desktop/Python/Project 3/counterfeit_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)

In [5]:
cd_train.dtypes

Area_dist_level_Medium                uint8
Area_dist_level_Small                 uint8
Area_dist_level_Unknown               uint8
Area_City_Type_Tier 2                 uint8
Area_City_Type_Tier 3                 uint8
Area_Type_DownTown                    uint8
Area_Type_Industrial                  uint8
Area_Type_MidTownResidential          uint8
SidEffect_Level_mild                  uint8
Medicine_Type_Antacids                uint8
Medicine_Type_Antibiotics             uint8
Medicine_Type_Antifungal              uint8
Medicine_Type_Antimalarial            uint8
Medicine_Type_Antipyretics            uint8
Medicine_Type_Antiseptics             uint8
Medicine_Type_Antiviral               uint8
Medicine_Type_Cardiac                 uint8
Medicine_Type_Hreplacements           uint8
Medicine_Type_Mstablizers             uint8
Medicine_Type_MuscleRelaxants         uint8
Medicine_Type_OralContraceptives      uint8
Medicine_Type_Statins                 uint8
Medicine_Type_Stimulants        

In [18]:
len(cd_train)

6818

In [19]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())

Medicine_ID : 1557
DistArea_ID : 10
Medicine_Type : 16
SidEffect_Level : 2
Area_Type : 4
Area_City_Type : 3
Area_dist_level : 4


In [20]:
cd_test.isnull().sum()

Medicine_ID              0
Counterfeit_Weight     297
DistArea_ID              0
Active_Since             0
Medicine_MRP             0
Medicine_Type            0
SidEffect_Level          0
Availability_rating      0
Area_Type                0
Area_City_Type           0
Area_dist_level          0
dtype: int64

In [21]:
cd_train.head(4)

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713


In [3]:
cd_train['Counterfeit_Weight']=cd_train["Counterfeit_Weight"].fillna(cd_train["Counterfeit_Weight"].mean())
cd_test['Counterfeit_Weight']=cd_test["Counterfeit_Weight"].fillna(cd_test["Counterfeit_Weight"].mean())

In [4]:
for col in ['DistArea_ID','Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level']:
    
    temp=pd.get_dummies(cd_train[col],prefix=col,drop_first=True)
    cd_train=pd.concat([temp,cd_train],1)
    cd_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(cd_test[col],prefix=col,drop_first=True)
    cd_test=pd.concat([temp,cd_test],1)
    cd_test.drop([col],1,inplace=True)


In [6]:
ld_train, ld_test = train_test_split(cd_train, test_size = 0.2,random_state=2)
ld_train.head()


ld_train.reset_index(drop=True,inplace=True)
ld_test.reset_index(drop=True,inplace=True)

In [7]:
x_train=ld_train.drop(['Medicine_ID','Counterfeit_Sales'],1)
y_train=ld_train['Counterfeit_Sales']
x_test=ld_test.drop(['Medicine_ID','Counterfeit_Sales'],1)
y_test=ld_test['Counterfeit_Sales']
x_train.shape

(5454, 37)

# Linear Regression

In [26]:
lm=LinearRegression()
lm.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
p_test=lm.predict(x_test)
print(p_test[0:10])

residual=p_test-y_test
print(y_test[0:10])

rmse_lm=np.sqrt(np.dot(residual,residual)/len(p_test))

rmse_lm

[1043.08613355  855.17923207 3887.15483343 3646.92683457 1270.41661852
 2749.29798652 1932.38727761 4276.23749614 2452.81562544 3497.40031422]
0    1047.7832
1     592.3760
2    3729.6256
3    3713.6464
4    1052.4438
5    2765.5472
6     854.0354
7    3683.0196
8    2172.9852
9    2240.8968
Name: Counterfeit_Sales, dtype: float64


1132.834263801189

In [28]:
from sklearn.metrics import r2_score
r2_score(y_test,p_test)

0.5528360623862745

In [29]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,p_test)

830.0501468413154

In [28]:
# Finding best value of penalty weight with cross validation for ridge regression
lambdas=np.linspace(.001,50,100)
# We need to reset index for cross validation to work without hitch
x_train.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
y_train.head

<bound method NDFrame.head of 0       1690.9460
1       6206.4016
2       1239.5336
3       1411.3100
4       1599.7314
5       1987.8928
6       3681.6880
7       1263.5024
8       2925.3392
9       4824.2008
10      2098.4156
11      2999.2430
12      4153.7402
13      2564.4756
14      3338.1352
15      4555.2176
16      2744.2416
17      1706.9252
18       676.9326
19      3112.4290
20      1846.0774
21      5872.1700
22       318.0664
23      2861.4224
24      5631.1504
25      2364.0698
26      2416.6680
27      1684.2880
28      2415.3364
29       652.2980
          ...    
5424    4544.5648
5425    3263.5656
5426    2222.2544
5427    1391.3360
5428    2699.6330
5429    1694.9408
5430    1338.0720
5431     468.5372
5432    1455.2528
5433    2086.4312
5434    5641.1374
5435     871.3462
5436     560.4176
5437    2258.2076
5438    1947.2790
5439    1271.4920
5440    4266.9262
5441    1003.8404
5442    1236.8704
5443    4493.2982
5444    4762.9472
5445    2455.9502
5446    4173.714

In [29]:
rmse_list=[]
for a in lambdas:
    ridge = Ridge(fit_intercept=True, alpha=a)

    # computing average RMSE across 10-fold cross validation
    kf = KFold(len(x_train), n_folds=10)
    xval_err = 0
    for train, test in kf:
        ridge.fit(x_train.loc[train], y_train[train])
        p = ridge.predict(x_train.loc[test])
        err = p - y_train[test]
        xval_err += np.dot(err,err)
    rmse_10cv = np.sqrt(xval_err/len(x_train))
    # uncomment below to print rmse values for individidual alphas
    print('{:.4f}\t {:.6f}\t '.format(a,rmse_10cv))
    rmse_list.extend([rmse_10cv])


0.0010	 1115.127130	 
0.5060	 1115.090904	 
1.0111	 1115.056733	 
1.5161	 1115.024417	 
2.0212	 1114.993787	 
2.5262	 1114.964698	 
3.0312	 1114.937025	 
3.5363	 1114.910660	 
4.0413	 1114.885508	 
4.5464	 1114.861486	 
5.0514	 1114.838522	 
5.5564	 1114.816550	 
6.0615	 1114.795512	 
6.5665	 1114.775357	 
7.0716	 1114.756039	 
7.5766	 1114.737516	 
8.0816	 1114.719749	 
8.5867	 1114.702705	 
9.0917	 1114.686353	 
9.5968	 1114.670662	 
10.1018	 1114.655608	 
10.6068	 1114.641166	 
11.1119	 1114.627313	 
11.6169	 1114.614030	 
12.1220	 1114.601297	 
12.6270	 1114.589096	 
13.1321	 1114.577411	 
13.6371	 1114.566226	 
14.1421	 1114.555527	 
14.6472	 1114.545301	 
15.1522	 1114.535535	 
15.6573	 1114.526217	 
16.1623	 1114.517336	 
16.6673	 1114.508881	 
17.1724	 1114.500843	 
17.6774	 1114.493211	 
18.1825	 1114.485978	 
18.6875	 1114.479134	 
19.1925	 1114.472672	 
19.6976	 1114.466584	 
20.2026	 1114.460862	 
20.7077	 1114.455500	 
21.2127	 1114.450491	 
21.7177	 1114.445829	 
22.2228	

In [30]:
alphas=np.linspace(0.01,10,20)
rmse_list=[]
for a in alphas:
    lasso = Lasso(fit_intercept=True, alpha=a,max_iter=10000)

    # computing RMSE using 10-fold cross validation
    kf = KFold(len(x_train), n_folds=10)
    xval_err = 0
    for train, test in kf:
        lasso.fit(x_train.loc[train], y_train[train])
        p =lasso.predict(x_train.loc[test])
        err = p - y_train[test]
        xval_err += np.dot(err,err)
    rmse_10cv = np.sqrt(xval_err/len(x_train))
    rmse_list.extend([rmse_10cv])
    # Uncomment below to print rmse values of individual alphas
    print('{:.4f}\t {:.4f}\t '.format(a,rmse_10cv))
best_alpha=alphas[rmse_list==min(rmse_list)]
print('Alpha with min 10cv error is : ',best_alpha )

0.0100	 1115.1085	 
0.5358	 1114.3577	 
1.0616	 1113.6929	 
1.5874	 1113.2583	 
2.1132	 1113.0021	 
2.6389	 1112.8751	 
3.1647	 1112.8291	 
3.6905	 1112.8354	 
4.2163	 1112.9124	 
4.7421	 1112.9630	 
5.2679	 1113.0204	 
5.7937	 1113.0911	 
6.3195	 1113.1773	 
6.8453	 1113.2689	 
7.3711	 1113.3804	 
7.8968	 1113.5221	 
8.4226	 1113.6847	 
8.9484	 1113.8823	 
9.4742	 1114.0797	 
10.0000	 1114.2648	 
Alpha with min 10cv error is :  [3.16473684]


# DTrees

In [53]:
import time
start_time = time.time()

param_grid1 = {'max_depth':list(range(10,81,10)),'max_features':list(range(7,20,2)),
               "max_leaf_nodes":list(range(10,100,5))}
grid = RandomSearchCV(tree.DecisionTreeRegressor(criterion="mse",random_state=2),param_grid=param_grid1,cv=10)
grid.fit(x_train,y_train)


print("--- %s seconds ---" % (time.time() - start_time))

--- -3282.1573457717896 seconds ---


In [54]:
print(grid.best_estimator_)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=15,
           max_leaf_nodes=15, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')


In [61]:
dtree=tree.DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=15,
           max_leaf_nodes=45, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=2,
           splitter='best')

In [62]:
dtree.fit(x_train,y_train)
predicted=dtree.predict(x_test)
residual=predicted-y_test
rmse_dtree=np.sqrt(np.dot(residual,residual)/len(predicted))

rmse_dtree

1196.4228174258676

# Random Forest

In [48]:
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.scorer import make_scorer

In [65]:
start_time = time.time()
rg = RandomForestRegressor(n_jobs=-1,verbose=1)

        
param_dist = {"n_estimators":[10,100,500,700],
              "max_depth": [3,5, None],
              "max_features": sp_randint(5, 11),
              "min_samples_split": sp_randint(5, 11),
              "min_samples_leaf": sp_randint(5, 11),
              "bootstrap": [True, False]}

n_iter_search = 20
random_search = RandomizedSearchCV(rg, param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=10,
                                   random_state=2)
random_search.fit(x_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_j

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]:

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_job

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_j

[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    4.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks     

[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks     

[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    3.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    3.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks     

[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.8s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks     

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 500 out of 500 | elapse

--- 638.7862992286682 seconds ---


[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    6.8s finished


In [66]:
print(random_search.best_estimator_)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=700, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)


In [33]:
rg = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=700, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [34]:
rg.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:    3.3s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=700, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [35]:
predicted=rg.predict(x_test)

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.2s finished


In [36]:
residual=predicted-y_test

In [38]:
rmse=np.sqrt(np.dot(residual,residual)/len(predicted))

rmse
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,predicted)


1106.7784889225493

In [75]:
cd_test.head

<bound method NDFrame.head of       Area_dist_level_Medium  Area_dist_level_Small  Area_dist_level_Unknown  \
0                          1                      0                        0   
1                          0                      0                        1   
2                          0                      0                        1   
3                          0                      0                        1   
4                          0                      1                        0   
5                          0                      0                        1   
6                          1                      0                        0   
7                          0                      1                        0   
8                          1                      0                        0   
9                          0                      0                        1   
10                         1                      0                        0   
11        

In [22]:
prediction=rg.predict(cd_test.drop(['Medicine_ID'],1))

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.2s finished


In [25]:
submission=pd.DataFrame(list(zip(cd_test['Medicine_ID'],list(prediction))),
                       columns=['Medicine_ID','Counterfeit_Sales'])

In [26]:
submission.head(4)

Unnamed: 0,Medicine_ID,Counterfeit_Sales
0,HLZ81,2441.301114
1,ECE94,3618.850621
2,SAD14,1621.745977
3,EQV63,377.3757


In [27]:
submission.to_csv('Shikha_Agarwal_P3.csv',index=False)

# Boosting:

In [47]:
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
help(GradientBoostingRegressor)

Help on class GradientBoostingRegressor in module sklearn.ensemble.gradient_boosting:

class GradientBoostingRegressor(BaseGradientBoosting, sklearn.base.RegressorMixin)
 |  Gradient Boosting for regression.
 |  
 |  GB builds an additive model in a forward stage-wise fashion;
 |  it allows for the optimization of arbitrary differentiable loss functions.
 |  In each stage a regression tree is fit on the negative gradient of the
 |  given loss function.
 |  
 |  Read more in the :ref:`User Guide <gradient_boosting>`.
 |  
 |  Parameters
 |  ----------
 |  loss : {'ls', 'lad', 'huber', 'quantile'}, optional (default='ls')
 |      loss function to be optimized. 'ls' refers to least squares
 |      regression. 'lad' (least absolute deviation) is a highly robust
 |      loss function solely based on order information of the input
 |      variables. 'huber' is a combination of the two. 'quantile'
 |      allows quantile regression (use `alpha` to specify the quantile).
 |  
 |  learning_rate

In [48]:
gbrt=GradientBoostingRegressor(n_estimators=100,random_state=2) 
gbrt.fit(x_train, y_train) 
y_pred=gbrt.predict(x_test) 

In [49]:
residual=y_pred-y_test
rmse_gbrt=np.sqrt(np.dot(residual,residual)/len(y_test))

rmse_gbrt

1108.0055326250874

In [50]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.5722225599273598

In [51]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred)

764.3673832027923

In [52]:
param_dist = { 
              "max_depth": [2,3,4,5],
              "learning_rate":[0.005,0.01,0.05,0.1],
              "subsample":[.8],
    "n_estimators":[50,100,500,700,800,900],
    'min_samples_split': [30,40,50,60]
              }


In [54]:
n_iter=10

random_search=RandomizedSearchCV(gbrt,n_jobs=-1,verbose=20,cv=10,param_distributions=param_dist)

In [55]:
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   33.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   35.5s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   41.3s
[Paralle

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=2,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'max_depth': [2, 3, 4, 5], 'learning_rate': [0.01, 0.05, 0.1], 'subsample': [0.8], 'n_estimators': [50, 100, 500], 'min_samples_split': [30, 40, 50, 60]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=20)

In [56]:
print(random_search.best_estimator_)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=50, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=2,
             subsample=0.8, verbose=0, warm_start=False)


In [64]:
gbrt_best=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=50, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=2,
             subsample=0.8, verbose=0, warm_start=False)
gbrt_best.fit(x_train, y_train) 
y_pred=gbrt_best.predict(x_test) 

In [73]:
residual=y_pred-y_test
rmse_gbrt=np.sqrt(np.dot(residual,residual)/len(y_test))

rmse_gbrt

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred)

from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.583574048632139

In [66]:
prediction=gbrt_best.predict(cd_test.drop(['Medicine_ID'],1))

In [67]:
submission=pd.DataFrame(list(zip(cd_test['Medicine_ID'],list(prediction))),
                       columns=['Medicine_ID','Counterfeit_Sales'])

In [69]:
submission.head()

Unnamed: 0,Medicine_ID,Counterfeit_Sales
0,HLZ81,2308.914789
1,ECE94,3910.918692
2,SAD14,1492.739673
3,EQV63,490.371704
4,AIR10,483.775142


In [70]:
submission.to_csv('Shikha_Agarwal_GBM_P3.csv',index=False)

# SVM

In [12]:
from sklearn import svm

In [21]:
param_grid1 = {'C':[0.01,0.1,1.0],"kernel":['linear']}
grid = GridSearchCV(svm.SVR(),param_grid=param_grid1,cv=10)


In [22]:
grid.fit(x_train,y_train)
print(grid.best_estimator_)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [None]:
svr_rbf = svm.SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_lin = svm.SVR(kernel='linear', C=1e3)
svr_poly = svm.SVR(kernel='poly', C=1e3, degree=3)
y_rbf = svr_rbf.fit(x_train,y_train).predict(x_test)
y_lin = svr_lin.fit(x_train,y_train).predict(x_test)
y_poly = svr_poly.fit(x_train,y_train).predict(x_test)

In [14]:
from sklearn.metrics import mean_squared_error
rmse_svm=np.sqrt(mean_squared_error(y_test,sr.predict(x_test)))

rmse_svm

1726.8229080903416

# KNN

In [8]:
from sklearn.neighbors import KNeighborsRegressor

In [13]:
param_grid1 = {'n_neighbors':list(range(5,10,15)),'p':[1,2,3],
              'weights':['uniform','distance'],'algorithm' : ['auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’]}
grid = RandomSearchCV(KNeighborsRegressor(),param_grid=param_grid1,cv=10)
grid.fit(x_train,y_train)

SyntaxError: EOL while scanning string literal (<ipython-input-13-a7562588bcce>, line 2)

In [None]:
print(grid.best_estimator_)

In [9]:
knn=KNeighborsRegressor()
knn.fit(x_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [10]:
predicted=knn.predict(x_test)

In [11]:
residual=predicted-y_test

In [12]:
rmse_knn=np.sqrt(np.dot(residual,residual)/len(predicted))

rmse_knn

1220.071416389309

# Stacking code

In [50]:
clf1=LinearRegression()
clf2=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=8, min_weight_fraction_leaf=0.0,
           n_estimators=700, n_jobs=-1, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

clf3=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=50, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=2,
             subsample=0.8, verbose=0, warm_start=False)


Algos=[clf1,clf2,clf3]

In [32]:
rows=x_train.shape[0]

layer1=pd.DataFrame({'clf1':np.zeros(rows),'clf2':np.zeros(rows),'clf3':np.zeros(rows)})
layer1

Unnamed: 0,clf1,clf2,clf3
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [44]:
from sklearn.model_selection import KFold, train_test_split
kf=KFold(n_splits=10)

In [52]:
fold=1
for train,test in kf.split(x_train):
    print('fold number : ', fold)
    
    for i,clf in enumerate(Algos):
        print('Algo number :',i+1)
        
        x_train_train=x_train.loc[train]
        y_train_train=y_train[train]
        x_train_test=x_train.loc[test]
        
        clf.fit(x_train_train,y_train_train)
        p=clf.predict(x_train_test)
        
        layer1.iloc[test,i]=p
        
    fold+=1  
    

fold number :  1
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   10.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    1.0s finished


Algo number : 3
fold number :  2
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   16.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.9s finished


Algo number : 3
fold number :  3
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   12.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.4s finished


Algo number : 3
fold number :  4
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   14.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.6s finished


Algo number : 3
fold number :  5
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   11.3s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.7s finished


Algo number : 3
fold number :  6
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   13.2s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.5s finished


Algo number : 3
fold number :  7
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   15.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.4s finished


Algo number : 3
fold number :  8
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   13.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.7s finished


Algo number : 3
fold number :  9
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   12.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    1.1s finished


Algo number : 3
fold number :  10
Algo number : 1
Algo number : 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   17.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    0.6s finished


Algo number : 3


In [53]:
layer1

Unnamed: 0,clf1,clf2,clf3
0,2658.154445,2729.435396,2621.877081
1,3796.910006,3712.789090,3963.827291
2,1136.020990,595.727669,636.310112
3,2460.203973,1915.018600,1542.873365
4,3403.583384,3577.971234,3688.190322
5,2104.798541,2059.456470,1999.516940
6,2943.195622,3295.956713,3023.007744
7,2329.404479,2073.343355,2164.177080
8,2245.127713,2111.512272,2046.724656
9,2668.131969,2788.225957,2659.073457


In [56]:
rows=x_test.shape[0]
layer2_test=pd.DataFrame({'clf1':np.zeros(rows),'clf2':np.zeros(rows),'clf3':np.zeros(rows)})
for i,clf in enumerate(Algos):
    print( 'Algo number',i+1)
    clf.fit(x_train,y_train)
    p=clf.predict(x_test)
    
    layer2_test.iloc[:,i]=p


Algo number 1
Algo number 2


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed:   17.5s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 700 out of 700 | elapsed:    1.4s finished


Algo number 3


In [92]:
# second layer linear model 
gbrt_stk=GradientBoostingRegressor(n_estimators=100,random_state=2) 

In [94]:
gbrt_stk.fit=(layer1,y_train)

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.8s finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=1, warm_start=False)

In [95]:
p_test=gbrt_stk.predict(layer2_test)

residual=p_test-y_test


rmse_lm=np.sqrt(np.dot(residual,residual)/len(p_test))

rmse_lm

from sklearn.metrics import r2_score
r2_score(y_test,p_test)
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred)

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.2s finished


1186.4625764784857


822.6936222580646