# Library

In [44]:
import pandas as pd
from pandas import Series as s , DataFrame as df
import numpy as np

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
import seaborn as sb
from matplotlib import pyplot as plt, rcParams as rc

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


%matplotlib inline
rc["figure.figsize"] = 10,6


import warnings
warnings.filterwarnings("ignore") 


from sklearn.pipeline import Pipeline



from xgboost import XGBRegressor , XGBRFRegressor

import lightgbm


from catboost import CatBoostRegressor




import datetime

# Function

In [45]:
#Note : y_test = y_true and predict_x = y_pred

def mean_absolute_percentage_error(y_true, y_pred) :
    
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    return np.mean(          np.abs(  (y_true - y_pred) / 100)         ) * 100



def root_mean_sequare_error(y_true, y_pred) :
    
    mse = mean_squared_error(y_true,  y_pred)
    
    rmse = np.sqrt(mse)
    
    return rmse

# Data Import 

In [64]:
test = pd.read_csv("test_1eLl9Yf.csv")
train = pd.read_csv("train_fwYjLYX.csv")
smaple_submission = pd.read_csv("sample_submission_IIzFVsf.csv")

In [65]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
id                  180 non-null int64
application_date    180 non-null object
segment             180 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.3+ KB


In [66]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80402 entries, 0 to 80401
Data columns (total 6 columns):
application_date    80402 non-null object
segment             80402 non-null int64
branch_id           66898 non-null float64
state               80402 non-null object
zone                66898 non-null object
case_count          80402 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 3.7+ MB


In [67]:
train.isna().sum()

application_date        0
segment                 0
branch_id           13504
state                   0
zone                13504
case_count              0
dtype: int64

In [68]:
date_pattern = "%Y-%m-%d"

In [69]:
test["year"] = test.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).year)
test["month"] = test.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).month)
test["day"] = test.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).day)

In [70]:
train["year"] = train.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).year)
train["month"] = train.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).month)
train["day"] = train.application_date.apply(lambda x : datetime.datetime.strptime(x, date_pattern).day)

# Data Split

In [71]:
train.head(1)

Unnamed: 0,application_date,segment,branch_id,state,zone,case_count,year,month,day
0,2017-04-01,1,1.0,WEST BENGAL,EAST,40.0,2017,4,1


In [72]:
test.head(1)

Unnamed: 0,id,application_date,segment,year,month,day
0,1,2019-07-06,1,2019,7,6


In [90]:
x_test.shape

(180, 4)

In [91]:
test_id = test.loc[:, "id"]

test_application_date = test.loc[:, "application_date"]

In [95]:
x_test = test.loc[:,   ['segment', 'year', 'month', 'day'] ]


x_train = train.loc[:,   ['segment', 'year', 'month', 'day'] ]

y_train = train.loc[:,   ['case_count'] ]


x_train.head(1)

Unnamed: 0,segment,year,month,day
0,1,2017,4,1


In [96]:
x_test.head(1)

Unnamed: 0,segment,year,month,day
0,1,2019,7,6


In [97]:

y_train.head(1)

Unnamed: 0,case_count
0,40.0


In [98]:
x_train.shape , x_test.shape , y_train.shape

((80402, 4), (180, 4), (80402, 1))

# Feature Engineer

In [59]:

# x = train.loc[:,   ['segment', 'year', 'month', 'day'] ]
# y = train.loc[:   , "case_count"]

# x_train, x_test , y_train , y_test = train_test_split(x ,y , test_size = 0.25 , random_state = 10)


In [63]:
# x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [61]:
x_train.head(1)

Unnamed: 0,segment,year,month,day
0,1,2017,4,1


In [28]:
y_train.head(1)

51573    7.0
Name: case_count, dtype: float64

# Multiple Model

In [29]:
rows=[]


def addRandomStateForAlgorithm(x,y,names,algorithms,columns_name,random_state_list):    
    for j in range(len(algorithms)):
        model = algorithms[j]
        for i in random_state_list:
            
            x_train, x_test , y_train , y_test = train_test_split(x ,y , test_size = 0.25 , random_state = i)
            
            model.fit(x_train,y_train)
            y_pred = model.predict(x_test)
            
            accuracy = r2_score(y_test, y_pred)
            accuracy = round(accuracy, 2) * 100
            
            mape = mean_absolute_percentage_error(y_test, y_pred)
            mape = round(mape, 2)
            
            mae = mean_absolute_error(y_test, y_pred)   
            mae = round(mae, 2)
            
            mse = mean_squared_error(y_test, y_pred)
            mse = round(mse, 2)
            
            rmse = root_mean_sequare_error(y_test, y_pred)
            rmse = round(rmse, 2)
            
            
            row = [names[j],   i,   accuracy,   mape,    mae,    mse,    rmse]
            
            rows.append(row)
            
    models_df = pd.DataFrame(rows) 
    
    models_df.columns = columns_name
    print(models_df)

In [30]:
# addRandomStateForAlgorithm(x, y,names_regression ,algorithms,columns_name,random_state_list_up_to_10)

## Simple Algorithm

In [31]:
names_regression_1 = ["Linear", "DT", "RF"  ]

algorithms_1 = [ LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor()]

columns_name_1 = ["Model_name",    "Random_state",   'Accuracy',     "  MAPE ",   " MAE ",   " MSE ", " RMSE "]

random_state_list_up_to_5_1 = [0]
random_state_list_up_to_10_1 = [0,1,2,3,4,5,6,7,8,9,10]
random_state_list_10_up_to_20 = [10,11,12,13,14,15,16,17,18,19,20]

In [32]:
addRandomStateForAlgorithm(x, y,names_regression_1 ,algorithms_1,columns_name_1,random_state_list_up_to_10_1)

   Model_name  Random_state  Accuracy    MAPE     MAE        MSE    RMSE 
0      Linear             0      21.0   209.07  209.07  429523.96  655.38
1      Linear             1      21.0   216.07  216.07  454422.38  674.11
2      Linear             2      21.0   212.26  212.26  450782.04  671.40
3      Linear             3      21.0   212.19  212.19  428211.19  654.38
4      Linear             4      20.0   211.17  211.17  472039.58  687.05
5      Linear             5      21.0   209.84  209.84  408609.04  639.23
6      Linear             6      22.0   211.77  211.77  436554.20  660.72
7      Linear             7      22.0   205.48  205.48  407534.85  638.38
8      Linear             8      21.0   211.10  211.10  425782.91  652.52
9      Linear             9      21.0   214.91  214.91  456000.31  675.28
10     Linear            10      21.0   211.44  211.44  459858.89  678.13
11         DT             0      23.0   179.93  179.93  420605.13  648.54
12         DT             1      22.0 

## Complex Algorithm

In [33]:
xgbRegressor = XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3,
       min_child_weight=1.5, missing=None, n_estimators=10000, nthread=-1,
       objective='reg:linear', reg_alpha=0.75, reg_lambda=0.45,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.6)



lgbm = lightgbm.LGBMRegressor(boosting_type='gbdt', objective='regression', num_leaves=1200,
                                learning_rate=0.17, num_boost_round=5000,
                                metric='rmse', bagging_fraction=0.8, feature_fraction=0.8, reg_lambda=0.9)

catBoostRegressor  = CatBoostRegressor(iterations=50, depth=3, learning_rate=0.17, loss_function='RMSE')


names_regression_2 = ["XGBRegressor", "lgbm" , "catBoostRegressor"]
algorithms_2 = [  xgbRegressor, lgbm , catBoostRegressor]

columns_name_2 = ["Model_name",    "Random_state",   'Accuracy',     "  MAPE ",   " MAE ",   " MSE ", " RMSE "]


random_state_list_up_to_5 = [0]
random_state_list_up_to_3_2 = [0,1,2,3]
random_state_list_up_to_10 = [0,1,2,3,4,5,6,7,8,9,10]
random_state_list_10_up_to_20 = [10,11,12,13,14,15,16,17,18,19,20]

In [34]:
addRandomStateForAlgorithm(x, y,names_regression_2 ,algorithms_2,columns_name_2,random_state_list_up_to_3_2)

0:	learn: 720.5372380	total: 57.7ms	remaining: 2.83s
1:	learn: 697.0527144	total: 62.6ms	remaining: 1.5s
2:	learn: 679.8992501	total: 68.1ms	remaining: 1.07s
3:	learn: 667.1394285	total: 72.4ms	remaining: 832ms
4:	learn: 658.2029571	total: 76.6ms	remaining: 689ms
5:	learn: 651.1535937	total: 81.8ms	remaining: 600ms
6:	learn: 646.5331157	total: 87.4ms	remaining: 537ms
7:	learn: 643.0101306	total: 91.7ms	remaining: 481ms
8:	learn: 639.9570121	total: 97ms	remaining: 442ms
9:	learn: 637.9856929	total: 102ms	remaining: 410ms
10:	learn: 636.5674943	total: 107ms	remaining: 378ms
11:	learn: 635.3822486	total: 115ms	remaining: 363ms
12:	learn: 634.4683884	total: 120ms	remaining: 341ms
13:	learn: 633.8240197	total: 125ms	remaining: 321ms
14:	learn: 633.0647826	total: 131ms	remaining: 305ms
15:	learn: 632.5018898	total: 136ms	remaining: 290ms
16:	learn: 632.1014078	total: 141ms	remaining: 274ms
17:	learn: 631.8459871	total: 147ms	remaining: 262ms
18:	learn: 631.4918924	total: 151ms	remaining: 247

35:	learn: 627.7664964	total: 184ms	remaining: 71.4ms
36:	learn: 627.7379439	total: 192ms	remaining: 67.3ms
37:	learn: 627.6708397	total: 197ms	remaining: 62.2ms
38:	learn: 627.6170495	total: 202ms	remaining: 56.9ms
39:	learn: 627.5871245	total: 206ms	remaining: 51.5ms
40:	learn: 627.5152622	total: 211ms	remaining: 46.3ms
41:	learn: 627.4929324	total: 216ms	remaining: 41.1ms
42:	learn: 627.4571518	total: 224ms	remaining: 36.5ms
43:	learn: 627.4359282	total: 229ms	remaining: 31.2ms
44:	learn: 627.3959791	total: 234ms	remaining: 26ms
45:	learn: 627.3703511	total: 239ms	remaining: 20.8ms
46:	learn: 627.3320747	total: 243ms	remaining: 15.5ms
47:	learn: 627.2760561	total: 248ms	remaining: 10.3ms
48:	learn: 627.2572354	total: 253ms	remaining: 5.17ms
49:	learn: 627.1770308	total: 258ms	remaining: 0us
           Model_name  Random_state  Accuracy    MAPE     MAE        MSE   \
0              Linear             0      21.0   209.07  209.07  429523.96   
1              Linear             1      

# LTFS

In [100]:
x_test.head(1)

Unnamed: 0,segment,year,month,day
0,1,2019,7,6


In [101]:
x_test.shape , x_train.shape ,  y_train.shape ,y_test.shape

((180, 4), (80402, 4), (80402, 1), (20101,))

In [102]:
x_train.head(1)

Unnamed: 0,segment,year,month,day
0,1,2017,4,1


In [103]:
y_train.head(1)

Unnamed: 0,case_count
0,40.0


In [113]:
catBoostRegressorModel  = CatBoostRegressor(iterations=50, depth=3, learning_rate=0.17, loss_function='RMSE')
catBoostRegressorModel.fit(x_train, y_train)

pred_y_catboost = catBoostRegressorModel.predict(x_test)

0:	learn: 716.9182897	total: 6.01ms	remaining: 294ms
1:	learn: 693.5711609	total: 11.8ms	remaining: 284ms
2:	learn: 676.5914005	total: 17.8ms	remaining: 279ms
3:	learn: 663.7364088	total: 23.2ms	remaining: 267ms
4:	learn: 654.7320088	total: 28.9ms	remaining: 260ms
5:	learn: 647.7021337	total: 34.7ms	remaining: 255ms
6:	learn: 643.0922649	total: 40.2ms	remaining: 247ms
7:	learn: 639.5271370	total: 45.6ms	remaining: 239ms
8:	learn: 636.4917668	total: 50.8ms	remaining: 231ms
9:	learn: 634.5357271	total: 55.8ms	remaining: 223ms
10:	learn: 633.0245639	total: 61.6ms	remaining: 218ms
11:	learn: 631.8414711	total: 66.9ms	remaining: 212ms
12:	learn: 630.9106594	total: 72.7ms	remaining: 207ms
13:	learn: 630.0838644	total: 78.1ms	remaining: 201ms
14:	learn: 629.5493876	total: 83.5ms	remaining: 195ms
15:	learn: 628.9188304	total: 89.6ms	remaining: 190ms
16:	learn: 628.5219394	total: 95.2ms	remaining: 185ms
17:	learn: 628.0488517	total: 101ms	remaining: 180ms
18:	learn: 627.6746601	total: 107ms	rem

In [109]:
xgbRegressor = XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3,
       min_child_weight=1.5, missing=None, n_estimators=10000, nthread=-1,
       objective='reg:linear', reg_alpha=0.75, reg_lambda=0.45,
       scale_pos_weight=1, seed=42, silent=True, subsample=0.6)


xgbRegressor.fit(x_train, y_train)

pred_y_xgboost = xgbRegressor.predict(x_test)

In [110]:
lgbm = lightgbm.LGBMRegressor(boosting_type='gbdt', objective='regression', num_leaves=1200,
                                learning_rate=0.17, num_boost_round=5000,
                                metric='rmse', bagging_fraction=0.8, feature_fraction=0.8, reg_lambda=0.9)


lgbm.fit(x_train, y_train)

pred_y_lgbm = xgbRegressor.predict(x_test)

In [117]:
randomForestRegressor = RandomForestRegressor()
randomForestRegressor.fit(x_train , y_train)

predict_y_rf = randomForestRegressor.predict(x_test)

In [111]:
pred_y_xgboost

array([-1.75863285e+01, -1.53770232e+00, -5.46379566e+00,  3.49218059e+00,
        1.74516926e+01,  1.05374596e+02,  1.71553543e+02,  1.92352554e+02,
        1.82951111e+02,  1.81520508e+02,  1.84977692e+02,  1.96778870e+02,
        1.86522095e+02,  1.89973236e+02,  1.99825531e+02,  1.82781418e+02,
        1.80253098e+02,  1.58837357e+02,  1.47173172e+02,  1.29895706e+02,
        1.15374146e+02,  9.79785995e+01,  6.06691818e+01,  1.91116199e+01,
       -6.84409761e+00,  2.75708752e+01, -3.09823742e+01, -3.18851662e+01,
       -1.95650158e+01, -1.05492144e+01, -3.76447916e+00, -1.68058186e+01,
       -7.57253647e-01, -4.68332624e+00,  4.27267981e+00,  1.82321301e+01,
        1.06155289e+02,  1.72334229e+02,  1.93133209e+02,  1.83731873e+02,
        1.82301178e+02,  1.85758362e+02,  1.97559616e+02,  1.87302872e+02,
        1.90753983e+02,  2.00606247e+02,  1.83562180e+02,  1.81033859e+02,
        1.59618027e+02,  1.47953903e+02,  1.30675980e+02,  1.16154854e+02,
        9.87592926e+01,  

In [112]:
pred_y_lgbm

array([-1.75863285e+01, -1.53770232e+00, -5.46379566e+00,  3.49218059e+00,
        1.74516926e+01,  1.05374596e+02,  1.71553543e+02,  1.92352554e+02,
        1.82951111e+02,  1.81520508e+02,  1.84977692e+02,  1.96778870e+02,
        1.86522095e+02,  1.89973236e+02,  1.99825531e+02,  1.82781418e+02,
        1.80253098e+02,  1.58837357e+02,  1.47173172e+02,  1.29895706e+02,
        1.15374146e+02,  9.79785995e+01,  6.06691818e+01,  1.91116199e+01,
       -6.84409761e+00,  2.75708752e+01, -3.09823742e+01, -3.18851662e+01,
       -1.95650158e+01, -1.05492144e+01, -3.76447916e+00, -1.68058186e+01,
       -7.57253647e-01, -4.68332624e+00,  4.27267981e+00,  1.82321301e+01,
        1.06155289e+02,  1.72334229e+02,  1.93133209e+02,  1.83731873e+02,
        1.82301178e+02,  1.85758362e+02,  1.97559616e+02,  1.87302872e+02,
        1.90753983e+02,  2.00606247e+02,  1.83562180e+02,  1.81033859e+02,
        1.59618027e+02,  1.47953903e+02,  1.30675980e+02,  1.16154854e+02,
        9.87592926e+01,  

In [114]:
pred_y_catboost

array([  35.79762667,   35.79762667,   35.91663561,   36.24745416,
         36.73665083,   40.55412729,   40.35531525,   40.59704794,
         40.59704794,   40.59704794,   40.59704794,   40.59704794,
         40.59704794,   40.59704794,   40.59704794,   40.59704794,
         38.56498429,   37.75313497,   36.10478349,   36.99325163,
         39.15496377,   41.34940001,   44.19092159,   52.31098121,
         61.83231629,   70.21139127,   33.27788839,   35.02028552,
         35.02028552,   38.23336313,   38.23336313,   38.23336313,
         38.23336313,   38.35237207,   38.68319062,   39.1723873 ,
         42.98986375,   42.79105171,   43.0327844 ,   43.0327844 ,
         43.0327844 ,   43.0327844 ,   43.0327844 ,   43.0327844 ,
         43.0327844 ,   43.0327844 ,   43.0327844 ,   41.00072075,
         40.18887143,   38.54051995,   39.42898809,   41.59070024,
         43.78513647,   46.62665805,   54.74671768,   64.26805275,
         72.64712773,   33.8741731 ,   35.61657022,   35.61657

In [118]:
predict_y_rf

array([4.67529674e+00, 5.81002214e+00, 9.99985212e+00, 4.47134365e+00,
       1.50563713e+01, 1.64767334e+01, 1.52294447e+01, 1.91718157e+01,
       1.70491088e+01, 1.08869971e+01, 1.08291699e+01, 1.54663914e+01,
       1.49536335e+01, 1.73822268e+01, 2.05292839e+01, 1.84161187e+01,
       1.41984633e+01, 1.16785004e+01, 2.25996151e+01, 1.77539512e+01,
       1.92089383e+01, 1.90289359e+01, 1.77397133e+01, 1.48250472e+01,
       1.35735866e+01, 4.63107888e+01, 4.64790971e+01, 3.98986241e+01,
       4.48849605e+01, 5.15009011e+01, 1.56058917e-01, 4.67529674e+00,
       5.81002214e+00, 9.99985212e+00, 4.47134365e+00, 1.49008769e+01,
       1.43084877e+01, 1.06736963e+01, 1.92833542e+01, 1.83726091e+01,
       1.36692599e+01, 1.31329092e+01, 1.56701636e+01, 1.33511346e+01,
       1.40540116e+01, 2.37606653e+01, 2.09334605e+01, 1.72974613e+01,
       1.33064198e+01, 2.38613876e+01, 1.89603922e+01, 1.58517426e+01,
       2.09587259e+01, 1.85356921e+01, 1.94150515e+01, 1.41471226e+01,
      

In [122]:
pred_y_catboost.size, pred_y_lgbm.size , pred_y_xgboost.size, predict_y_rf.size 

(180, 180, 180, 180)

In [124]:
final = pd.DataFrame()

final['id'] = test_id

final['application_date'] = test_application_date

final['segment'] = test['segment']

final['case_count'] = pred_y_catboost





final.to_csv('rakesh.csv', index=False)

In [125]:
final.shape

(180, 4)

In [126]:
final.columns

Index(['id', 'application_date', 'segment', 'case_count'], dtype='object')

In [136]:
final[0:3]

Unnamed: 0,id,application_date,segment,case_count
0,1,2019-07-06,1,35.797627
1,2,2019-07-07,1,35.797627
2,3,2019-07-08,1,35.916636


In [134]:
final1 = pd.read_csv("rakesh.csv")

In [135]:
final1.head(3)

Unnamed: 0,id,application_date,segment,case_count
0,1,2019-07-06,1,35.797627
1,2,2019-07-07,1,35.797627
2,3,2019-07-08,1,35.916636
