# Counterfeit Medicines Sales Prediction

### Data Loading and pre processing

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
datafile_train=r"C:\Users\isaac\Documents\Python Scripts\Project\Project 3 Public Safety\counterfeit_train.csv"
datafile_test=r"C:\Users\isaac\Documents\Python Scripts\Project\Project 3 Public Safety\counterfeit_test.csv"
bd_train=pd.read_csv(datafile_train)
bd_test=pd.read_csv(datafile_test)

In [5]:
bd_train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [4]:
bd_train.shape

(6818, 12)

In [6]:
bd_test.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
dtype: object

In [7]:
bd_test.shape

(1705, 11)

In [None]:
# drop Medicine_ID
# get dummies - DistArea_ID, Medicine_Type, SidEffect_Level, Area_Type, Area_City_Type, Area_dist_level
# Counterfeit_Weight missing values replace with Median

In [None]:
bd_train['Area_dist_level'].value_counts

In [8]:
for col in [ 'Medicine_ID']:
    bd_train.drop(col,1,inplace=True)
    bd_test.drop(col,1,inplace=True)

In [9]:
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from mypipes import *

In [10]:
p1=pdPipeline([
    ('var_select',VarSelector(['Counterfeit_Weight','Active_Since','Medicine_MRP','Availability_rating'])),
    ('missing_trt',DataFrameImputer())
])
p2=pdPipeline([
    ('var_select',VarSelector(['DistArea_ID', 'Medicine_Type','SidEffect_Level','Area_Type','Area_City_Type','Area_dist_level'])),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(400))
])

In [11]:
data_pipe=FeatureUnion([
    ('num_var',p1),
    ('obj_to_dum',p2)
])

In [12]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(bd_train),
                     columns=data_pipe.get_feature_names())

In [13]:
x_train.shape

(6818, 30)

In [14]:
x_test=pd.DataFrame(data=data_pipe.transform(bd_test),
                     columns=data_pipe.get_feature_names())

In [15]:
x_test.shape

(1705, 30)

In [16]:
x_train.head()

Unnamed: 0,num_var__Counterfeit_Weight,num_var__Active_Since,num_var__Medicine_MRP,num_var__Availability_rating,obj_to_dum__DistArea_ID_Area017,obj_to_dum__DistArea_ID_Area013,obj_to_dum__DistArea_ID_Area046,obj_to_dum__DistArea_ID_Area035,obj_to_dum__DistArea_ID_Area049,obj_to_dum__DistArea_ID_Area045,...,obj_to_dum__Medicine_Type_Tranquilizers,obj_to_dum__SidEffect_Level_mild,obj_to_dum__Area_Type_DownTown,obj_to_dum__Area_Type_MidTownResidential,obj_to_dum__Area_Type_CityLimits,obj_to_dum__Area_City_Type_Tier 3,obj_to_dum__Area_City_Type_Tier 2,obj_to_dum__Area_dist_level_Medium,obj_to_dum__Area_dist_level_Unknown,obj_to_dum__Area_dist_level_Small
0,13.1,1995.0,160.2366,0.070422,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,13.8,1983.0,110.4384,0.013,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,9.025,1995.0,259.4092,0.060783,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,11.8,1995.0,99.983,0.065555,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,13.8,1983.0,56.4402,0.248859,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
x_test.head()

Unnamed: 0,num_var__Counterfeit_Weight,num_var__Active_Since,num_var__Medicine_MRP,num_var__Availability_rating,obj_to_dum__DistArea_ID_Area017,obj_to_dum__DistArea_ID_Area013,obj_to_dum__DistArea_ID_Area046,obj_to_dum__DistArea_ID_Area035,obj_to_dum__DistArea_ID_Area049,obj_to_dum__DistArea_ID_Area045,...,obj_to_dum__Medicine_Type_Tranquilizers,obj_to_dum__SidEffect_Level_mild,obj_to_dum__Area_Type_DownTown,obj_to_dum__Area_Type_MidTownResidential,obj_to_dum__Area_Type_CityLimits,obj_to_dum__Area_City_Type_Tier 3,obj_to_dum__Area_City_Type_Tier 2,obj_to_dum__Area_dist_level_Medium,obj_to_dum__Area_dist_level_Unknown,obj_to_dum__Area_dist_level_Small
0,13.8,1983.0,85.5328,0.112747,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,13.45,2000.0,257.146,0.144446,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,7.1,2000.0,98.1172,0.144221,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,18.3,1996.0,135.373,0.100388,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,13.8,1983.0,112.8016,0.022585,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
y_train=bd_train['Counterfeit_Sales']

### Splitting the training data for training and validation

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train = pd.concat([x_train, bd_train['Counterfeit_Sales']],axis =1)

In [21]:
x_train.head()

Unnamed: 0,num_var__Counterfeit_Weight,num_var__Active_Since,num_var__Medicine_MRP,num_var__Availability_rating,obj_to_dum__DistArea_ID_Area017,obj_to_dum__DistArea_ID_Area013,obj_to_dum__DistArea_ID_Area046,obj_to_dum__DistArea_ID_Area035,obj_to_dum__DistArea_ID_Area049,obj_to_dum__DistArea_ID_Area045,...,obj_to_dum__SidEffect_Level_mild,obj_to_dum__Area_Type_DownTown,obj_to_dum__Area_Type_MidTownResidential,obj_to_dum__Area_Type_CityLimits,obj_to_dum__Area_City_Type_Tier 3,obj_to_dum__Area_City_Type_Tier 2,obj_to_dum__Area_dist_level_Medium,obj_to_dum__Area_dist_level_Unknown,obj_to_dum__Area_dist_level_Small,Counterfeit_Sales
0,13.1,1995.0,160.2366,0.070422,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1775.5026
1,13.8,1983.0,110.4384,0.013,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3069.152
2,9.025,1995.0,259.4092,0.060783,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2603.092
3,11.8,1995.0,99.983,0.065555,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1101.713
4,13.8,1983.0,56.4402,0.248859,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,158.9402


In [22]:
a_train, a_test = train_test_split(x_train, test_size = 0.2, random_state=42)

In [23]:
x_train1=a_train.drop(['Counterfeit_Sales'],axis=1)
y_train1=a_train['Counterfeit_Sales']

In [24]:
x_train_test1=a_test.drop(['Counterfeit_Sales'],axis=1)

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge,Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Lasso Regerssion

In [None]:
params={'alpha':np.linspace(0.1,10,50)}

In [None]:
lasmodel=Lasso(fit_intercept=True)

In [None]:
grid_search=GridSearchCV(lasmodel,
                         cv=10,
                         param_grid=params,
                         n_jobs=-1,
                         verbose=10,
                         scoring='neg_mean_absolute_error')

In [None]:
grid_search.fit(x_train1,y_train1)

In [None]:
grid_search.best_estimator_

In [None]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [None]:
report(grid_search.cv_results_,5)

In [None]:
lasso_model=grid_search.best_estimator_

In [None]:
lasso_model.fit(x_train1,y_train1)

In [None]:
lasso_model.intercept_

In [None]:
predlas=lasso_model.predict(x_train_test1)

In [37]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
MAE = mean_absolute_error(a_test['Counterfeit_Sales'],predlas)

In [None]:
MAE

In [None]:
Score = 1-(MAE/1660)

In [None]:
Score

### Lasso regression with random search

In [None]:
params={'alpha':np.linspace(0.1,10,50)}

In [None]:
lasrdmodel=Lasso(fit_intercept=True)

In [None]:
random_search=RandomizedSearchCV(lasrdmodel,
                                 cv=10,
                                 param_distributions=params,
                                 n_jobs=-1,
                                 verbose=10,
                                 scoring='neg_mean_absolute_error')

In [None]:
random_search.fit(x_train1,y_train1)

In [None]:
random_search.best_estimator_

In [None]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [None]:
report(random_search.cv_results_,5)

In [None]:
lassors_model=random_search.best_estimator_

In [None]:
lassors_model.fit(x_train1,y_train1)

In [None]:
predlasrs=lassors_model.predict(x_train_test1)

In [None]:
MAE = mean_absolute_error(a_test['Counterfeit_Sales'],predlasrs)

In [None]:
MAE

In [None]:
Score = 1-(MAE/1660)

In [None]:
Score

In [26]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

### Decision Tree

In [27]:
params={ 
        'max_depth':[None,5,10,15,20],
            'min_samples_leaf':[1,2,5], 
            'min_samples_split':[2,5,8]
       }

In [28]:
reg=DecisionTreeRegressor()

In [29]:
random_search=RandomizedSearchCV(reg,
                                 cv=10,
                                 param_distributions=params,
                                 scoring='neg_mean_absolute_error',
                                 n_iter=10,n_jobs=-1,verbose=20
                                    )

In [30]:
random_search.fit(x_train1,y_train1)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


RandomizedSearchCV(cv=10, estimator=DecisionTreeRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [None, 5, 10, 15, 20],
                                        'min_samples_leaf': [1, 2, 5],
                                        'min_samples_split': [2, 5, 8]},
                   scoring='neg_mean_absolute_error', verbose=20)

In [31]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [32]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: -758.92778 (std: 27.72597)
Parameters: {'min_samples_split': 8, 'min_samples_leaf': 5, 'max_depth': 5}

Model with rank: 2
Mean validation score: -759.50043 (std: 28.25240)
Parameters: {'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 5}

Model with rank: 2
Mean validation score: -759.50043 (std: 28.25240)
Parameters: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 5}

Model with rank: 4
Mean validation score: -812.46445 (std: 39.57510)
Parameters: {'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 10}

Model with rank: 5
Mean validation score: -812.46586 (std: 39.57441)
Parameters: {'min_samples_split': 8, 'min_samples_leaf': 5, 'max_depth': 10}



In [33]:
dtree=DecisionTreeRegressor(**{'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5})

In [34]:
dtree.fit(x_train1,y_train1)

DecisionTreeRegressor(max_depth=5)

In [35]:
pred_dtree=dtree.predict(x_train_test1)

In [38]:
mean_absolute_error(a_test['Counterfeit_Sales'],pred_dtree)

756.6654928864681

In [39]:
MAE = mean_absolute_error(a_test['Counterfeit_Sales'],pred_dtree)

In [40]:
Score = 1-(MAE/1660)

### best score so far

In [41]:
Score

0.5441774139238145

In [42]:
x_train

Unnamed: 0,num_var__Counterfeit_Weight,num_var__Active_Since,num_var__Medicine_MRP,num_var__Availability_rating,obj_to_dum__DistArea_ID_Area017,obj_to_dum__DistArea_ID_Area013,obj_to_dum__DistArea_ID_Area046,obj_to_dum__DistArea_ID_Area035,obj_to_dum__DistArea_ID_Area049,obj_to_dum__DistArea_ID_Area045,...,obj_to_dum__SidEffect_Level_mild,obj_to_dum__Area_Type_DownTown,obj_to_dum__Area_Type_MidTownResidential,obj_to_dum__Area_Type_CityLimits,obj_to_dum__Area_City_Type_Tier 3,obj_to_dum__Area_City_Type_Tier 2,obj_to_dum__Area_dist_level_Medium,obj_to_dum__Area_dist_level_Unknown,obj_to_dum__Area_dist_level_Small,Counterfeit_Sales
0,13.100,1995.0,160.2366,0.070422,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1775.5026
1,13.800,1983.0,110.4384,0.013000,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,3069.1520
2,9.025,1995.0,259.4092,0.060783,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2603.0920
3,11.800,1995.0,99.9830,0.065555,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1101.7130
4,13.800,1983.0,56.4402,0.248859,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,158.9402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6813,8.535,1995.0,204.1452,0.112963,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2070.4520
6814,20.650,1995.0,235.1088,0.131103,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2126.3792
6815,20.000,2005.0,193.6292,0.105096,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2119.7212
6816,10.180,2000.0,162.8682,0.099957,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1485.2138


In [43]:
x_train=x_train.drop(['Counterfeit_Sales'],axis=1)

In [44]:
dtree.fit(x_train,y_train)

DecisionTreeRegressor(max_depth=5)

In [45]:
pred_dtree=dtree.predict(x_test)

In [48]:
test_pred_res = pd.DataFrame(pred_dtree)

In [46]:
data_test=r"C:\Users\isaac\Documents\Python Scripts\Project\Project 3 Public Safety\counterfeit_test.csv"
test_data=pd.read_csv(data_test)

In [49]:
final_res=pd.concat([test_data['Medicine_ID'],test_pred_res],axis=1)

In [50]:
final_res

Unnamed: 0,Medicine_ID,0
0,HLZ81,2008.029853
1,ECE94,4055.506813
2,SAD14,1589.413296
3,EQV63,498.403086
4,AIR10,394.210178
...,...,...
1700,KXW10,3314.665750
1701,CKE54,806.516933
1702,HAY13,3115.791218
1703,ZEE32,3476.129932


### Final Submission 2

In [51]:
pd.DataFrame(final_res).to_csv("mysubmissionrsdt02.csv",index=False)