In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
train_file = r'H:\EDVANCER_DATA_SCIENCE_PYTHON\PROJECT_PYTHON-ML\PROJECT_3\counterfeit_train.csv'
test_file = r'H:\EDVANCER_DATA_SCIENCE_PYTHON\PROJECT_PYTHON-ML\PROJECT_3\counterfeit_test.csv'


In [3]:
data_train = pd.read_csv(train_file)
data_test = pd.read_csv(test_file)


In [4]:
data_train.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [5]:
data_test.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level
0,HLZ81,,Area027,1983,85.5328,Antibiotics,mild,0.112747,CityLimits,Tier 3,Medium
1,ECE94,13.45,Area045,2000,257.146,OralContraceptives,mild,0.144446,DownTown,Tier 2,Unknown
2,SAD14,7.1,Area045,2000,98.1172,Antipyretics,mild,0.144221,DownTown,Tier 2,Unknown
3,EQV63,18.3,Area010,1996,135.373,Tranquilizers,mild,0.100388,MidTownResidential,Tier 3,Unknown
4,AIR10,,Area019,1983,112.8016,OralContraceptives,mild,0.022585,MidTownResidential,Tier 1,Small


In [6]:
data_test['Counterfeit_Sales'] = np.nan
data_test['Data'] = 'test'
data_train['Data'] = 'train'

In [7]:
all_data = pd.concat([data_train,data_test])

In [8]:
all_data.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales,Data
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026,train
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152,train
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092,train
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713,train
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402,train


In [9]:

all_data.drop(['Medicine_ID','Area_dist_level','DistArea_ID'],axis=1 , inplace=True)
all_data.drop(['SidEffect_Level','Active_Since'],axis=1, inplace=True)

In [10]:
all_data['Counterfeit_Weight'].isnull().sum()

1463

In [11]:
all_data['Counterfeit_Weight'] = all_data.fillna(all_data.mean())

In [12]:
 all_data['Counterfeit_Weight'] = pd.to_numeric(all_data['Counterfeit_Weight'],errors='coerce')


In [13]:
cat_col=['Medicine_Type' , 'Area_Type', 'Area_City_Type']


for col in cat_col :
    k=all_data[col].value_counts(dropna=False)
    cats=k.index[k>50][:-1]
    for cat in cats:
            name=col+'_'+cat
            all_data[name]=(all_data[col]==cat).astype(int)
        
    del all_data[col]


In [14]:
all_data['Counterfeit_Weight'] = pd.to_numeric(all_data['Counterfeit_Weight'],errors='coerce')


In [15]:
all_data_train = all_data.loc[all_data['Data'] == 'train']
all_data_test = all_data.loc[all_data['Data'] == 'test']

In [16]:
all_data_train.drop(['Data'],axis=1,inplace=True)
all_data_test.drop(['Data'],axis=1,inplace=True)


In [17]:
all_data_test.drop(['Counterfeit_Sales'],axis=1,inplace=True)

In [18]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(all_data_train,test_size=0.2,random_state=1)

In [19]:
x_train = train.drop(['Counterfeit_Sales'],axis=1)
y_train = train['Counterfeit_Sales']
x_test = test.drop(['Counterfeit_Sales'],axis=1)
y_test = test['Counterfeit_Sales']

In [20]:
gbm_params={'n_estimators':[50,100,200,500,700],
'learning_rate': [0.01,.05,0.1,0.4,0.8,1],
'max_depth':[1,2,3,4,5,6],
'subsample':[0.5,0.8,1],
'max_features':[5,6,7,8],
}

In [21]:
gbm_params

{'n_estimators': [50, 100, 200, 500, 700],
 'learning_rate': [0.01, 0.05, 0.1, 0.4, 0.8, 1],
 'max_depth': [1, 2, 3, 4, 5, 6],
 'subsample': [0.5, 0.8, 1],
 'max_features': [5, 6, 7, 8]}

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error


In [23]:
model = GradientBoostingRegressor()

In [24]:
model

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
random_search = RandomizedSearchCV(model,scoring='neg_mean_absolute_error',
                                  param_distributions=gbm_params,
                                  cv=20,n_iter=20,
                                  n_jobs=-1,verbose=20)

In [26]:
random_search.fit(x_train,y_train)

Fitting 20 folds for each of 20 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:   48.3s
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   49.2s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   53.4s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   53.7s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   54.2s
[Paralle

[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:  1.7min
[Paralle

RandomizedSearchCV(cv=20, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                            

In [27]:
y_pred = random_search.predict(x_test)



In [28]:
y_pred


array([2727.96759469, 1635.35162871, 3400.27628056, ...,  829.73604853,
       1743.67126333,  777.9181996 ])

In [29]:
mean_absolute_error(y_test,y_pred)

732.0048280409824

In [42]:
1-(732.0048280409824/1660)

0.5590332361198902

In [31]:
final_pred = random_search.predict(all_data_test)

In [32]:
from xgboost.sklearn import XGBRegressor

In [33]:
xgb_params = {
    'gamma':[0,2,5,8,10],
    'max_depth':[2,3,4,5,6,7,8],
    'min_child_weight':[0.5,1,2,5,10]
}

In [34]:
xgb = XGBRegressor(n_estimators=25,subsample=0.8,colsample_bylevel=0.8,
                  colsample_bytree=0.8)

In [35]:
random_search = RandomizedSearchCV(xgb,param_distributions=xgb_params,
                                  n_iter=20,cv=20,scoring='neg_mean_absolute_error',
                                  n_jobs = -1, verbose = 20 )

In [36]:
random_search.fit(x_train,y_train)

Fitting 20 folds for each of 20 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed:   16.1s
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 149 tasks      | elapsed:   16.8s
[Paralle

[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 273 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 275 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 276 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 277 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 278 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 279 tasks      | elapsed:   28.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 281 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 282 tasks      | elapsed:   29.3s
[Paralle

RandomizedSearchCV(cv=20, error_score=nan,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=0.8,
                                          colsample_bynode=None,
                                          colsample_bytree=0.8, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_e...
                                          scale_pos_weight=None, subsample=0.8,
                                          tree_method=None,
                                          validate

In [37]:
y_pred = random_search.predict(x_test)


In [38]:
y_pred

array([2605.972 , 1486.8524, 3303.0059, ...,  866.4277, 1746.5271,
        670.1414], dtype=float32)

In [39]:
mean_absolute_error(y_test,y_pred)

732.1978753294298

In [40]:
1 - (732.1978753294298/1660)

0.5589169425726326

In [41]:
final_pred = random_search.predict(all_data_test)