# Exploratory Data Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
file_path = "./DataSets/"
training_data = pd.read_csv(file_path + 'counterfeit_train.csv')
production_data = pd.read_csv(file_path + 'counterfeit_test.csv')

In [None]:
training_data.head()

In [None]:
production_data.head()

In [None]:
training_data['Counterfeit_Weight'].mean()

In [None]:
training_data.info()

In [None]:
production_data.info()

Creating a Dummy sales variable in the production data and combining it with training data for further data manipulations

In [None]:
production_data['Counterfeit_Sales'] = np.NAN

In [None]:
training_data['Data_from'] = 'Train'
production_data['Data_from'] = 'Production'

In [None]:
full_data = pd.concat([training_data,production_data],axis=0)

In [None]:
full_data.loc[full_data['Data_from'] == 'Train',]

In [None]:
full_data['DistArea_ID'].value_counts()

In [None]:
full_data['Area_Type'].value_counts()

In [None]:
full_data.loc[full_data['Area_Type']=='DownTown','DistArea_ID'].unique()

In [None]:
full_data.loc[full_data['Area_Type']=='MidTownResidential','DistArea_ID'].unique()

In [None]:
full_data.loc[full_data['Area_Type']=='CityLimits','DistArea_ID'].unique()

In [None]:
full_data.loc[full_data['Area_Type']=='Industrial','DistArea_ID'].unique()

In [None]:
full_data.head()

In [None]:
full_data['Medicine_Type'].value_counts()

In [None]:
round(full_data.groupby('Medicine_Type')['Counterfeit_Sales'].mean(),2)

In [None]:
full_data['SidEffect_Level'].value_counts()

In [None]:
full_data['Area_dist_level'].value_counts()

#### After looking into all the categorical  variables, we cannot drop them and we create dummies for each variable for simplicity later we can ckeck for additional options to improve the performance

In [None]:
full_data.head(10)

In [None]:
d = pd.get_dummies(full_data['DistArea_ID'],prefix='DistID',drop_first=True)

In [None]:
full_data = pd.concat((full_data,d),axis='columns')

In [None]:
full_data.drop('DistArea_ID',axis='columns',inplace=True)

In [None]:
full_data

##### Medicine variable - Dummy Creations

In [None]:
d = pd.get_dummies(full_data['Medicine_Type'],prefix='MedType',drop_first=True)
full_data = pd.concat((full_data,d),axis='columns')
full_data.drop('Medicine_Type',axis='columns',inplace=True,)

In [None]:
full_data

##### SidEffect_Level - Dummy Creation

In [None]:
full_data['SidEffect_Level'].value_counts()

In [None]:
d = pd.get_dummies(full_data['SidEffect_Level'],prefix='SidLvl',drop_first=True)
full_data = pd.concat((full_data,d),axis='columns')
full_data.drop('SidEffect_Level',axis='columns',inplace = True)

In [None]:
full_data

##### Area_Type - Dummy Creation

In [None]:
d = pd.get_dummies(full_data['Area_Type'],prefix='ArTyp',drop_first=True)
full_data = pd.concat((full_data,d),axis='columns')
full_data.drop('Area_Type',axis='columns',inplace = True)

In [None]:
full_data

##### Area_dist_level - Dummy Creation

In [None]:
d = pd.get_dummies(full_data['Area_dist_level'],prefix='ArDisTyp',drop_first=True)
full_data = pd.concat((full_data,d),axis='columns')
full_data.drop('Area_dist_level',axis='columns',inplace = True)

In [None]:
full_data

##### Area_City_Type 	- Dummy Creation

In [None]:
d = pd.get_dummies(full_data['Area_City_Type'],prefix='ArCtyTyp',drop_first=True)
full_data = pd.concat((full_data,d),axis='columns')
full_data.drop('Area_City_Type',axis='columns',inplace = True)

In [None]:
full_data

##### Dropping Medicine ID

In [None]:
full_data.drop('Medicine_ID',axis='columns',inplace=True)

In [None]:
full_data

##### Imputing Counterfeit_weight missing values with mean from train dataset only

In [None]:
wgt_mean = round(full_data.loc[full_data['Data_from'] == 'Train','Counterfeit_Weight'].mean(),3)


In [None]:
wgt_mean

In [None]:
full_data['Counterfeit_Weight'].fillna(wgt_mean,inplace=True)

In [None]:
full_data.info()

##### Now Saeperating data into Training and Production

In [None]:
training_data = full_data.loc[full_data['Data_from'] == 'Train']
training_data.drop('Data_from',axis=1,inplace=True)

In [None]:
training_data

In [None]:
production_data = full_data.loc[full_data['Data_from'] == 'Production']

In [None]:
production_data.drop(['Counterfeit_Sales','Data_from'],axis=True,inplace=True)
production_data

# Splitting training data into Train and Test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train,test = train_test_split(training_data,test_size=0.2,random_state = 2,)

In [None]:
x_train = train.drop('Counterfeit_Sales',axis='columns')
y_train = train['Counterfeit_Sales']

x_test = test.drop('Counterfeit_Sales',axis='columns')
y_test = test['Counterfeit_Sales']

print('x_train : ',x_train.shape)
print('y_train : ',y_train.shape)

print('x_test : ',x_test.shape)
print('y_test : ',y_test.shape)

# Linear Regression Model

In [54]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def my_rmse(actual,predicted):
    """Function to calculate Root Mean Square Error"""
    
    return (((actual-predicted)**2).mean())**1/2

#### Decision tree with CV to get best max_depth

In [55]:
dtree = DecisionTreeRegressor(random_state=2,criterion='mae')

In [71]:
params = {'max_depth' : [2,3,4,5,6,7,8,9,10,11,12,14,15,16,17,18,19,None]}
iterations = 19

In [72]:
dtree_random = RandomizedSearchCV(dtree,param_distributions=params,n_jobs=-1,n_iter=iterations,cv=5,
                                  random_state=5,scoring='r2')

In [73]:
dtree_random.fit(x_train,y_train)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=DecisionTreeRegressor(criterion='mae',
                                                   max_depth=None,
                                                   max_features=None,
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   presort=False,
                                                   random_state=2,
                                                   splitter='best'),
                   iid='warn', n_iter=19, n_jobs=-1,
                   param_distrib

In [79]:
dtree_random.best_params_

{'max_depth': 5}

In [80]:
dtree_random.best_score_

0.5944984998612384

In [81]:
dtree_random.cv_results_

{'mean_fit_time': array([1.54804759, 1.94858141, 2.18471179, 2.33722878, 2.43084264,
        2.49856482, 2.55130434, 2.5975071 , 2.6603066 , 2.67155132,
        2.68103952, 2.73235188, 2.7475162 , 2.76424885, 2.84292932,
        2.96497035, 2.77335796, 2.16831226]),
 'std_fit_time': array([0.08482219, 0.08364797, 0.07906525, 0.08290506, 0.10522364,
        0.08188334, 0.08185963, 0.08616665, 0.08960411, 0.08208941,
        0.09846132, 0.06781942, 0.08163431, 0.08836984, 0.10294583,
        0.09051225, 0.29647052, 0.14548796]),
 'mean_score_time': array([0.00148177, 0.00148759, 0.00148911, 0.00150423, 0.001511  ,
        0.00151343, 0.00151143, 0.00154176, 0.00154262, 0.00157242,
        0.00157733, 0.00168409, 0.00160875, 0.00166197, 0.00171785,
        0.00176802, 0.00155592, 0.00141835]),
 'std_score_time': array([3.41579861e-05, 4.44463903e-05, 2.16724456e-05, 3.03773340e-05,
        3.21561529e-05, 1.55440137e-05, 2.66327960e-05, 1.80969596e-05,
        1.30346732e-05, 3.05984588e-

In [82]:
dtree_random.best_estimator_

DecisionTreeRegressor(criterion='mae', max_depth=5, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=2, splitter='best')

In [83]:
dtree = dtree_random.best_estimator_
dtree.fit(x_train,y_train)
pred = dtree.predict(x_test)

In [84]:
print('Mean Absolute Error = ', metrics.median_absolute_error(y_test,pred))
print('Root Mean Square Error = ',my_rmse(y_test,pred))
print("Our model submission performance : ",1-(metrics.median_absolute_error(y_test,pred)/1600))

Mean Absolute Error =  488.0314000000002
Root Mean Square Error =  625993.4594418913
Our model submission performance :  0.6949803749999999


In [87]:
print("performance on train data")
pred_train = dtree.predict(x_train)
print('Mean Absolute Error = ', metrics.median_absolute_error(y_train,pred_train))
print('Root Mean Square Error = ',my_rmse(y_train,pred_train))
print("Our model submission performance : ",1-(metrics.median_absolute_error(y_train,pred_train)/1600))

performance on train data
Mean Absolute Error =  493.3578
Root Mean Square Error =  561513.4545339242
Our model submission performance :  0.691651375


#### Decision trees with max depth and min features CV

In [89]:
params = {
    'max_depth' : [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,None],
    'max_features' : list(range(3,38))
}

In [90]:
dtree = DecisionTreeRegressor(criterion='mae',random_state=2)

In [91]:
from sklearn.model_selection import GridSearchCV

In [92]:
dtree_random = GridSearchCV(dtree,scoring='r2',n_jobs=-1,cv=5,param_grid=params)

In [93]:
dtree_random.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mae', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=2,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17, 18, 19, None],
                         'max_features': [3, 4, 5, 6, 7, 8, 9, 10, 1

In [94]:
dtree_random.best_estimator_

DecisionTreeRegressor(criterion='mae', max_depth=5, max_features=36,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=2, splitter='best')

In [95]:
dtree_random.best_score_

0.5945041075157999

In [96]:
dtree_random.best_params_

{'max_depth': 5, 'max_features': 36}

In [97]:
dtree = dtree_random.best_estimator_
dtree.fit(x_train,y_train)
pred = dtree.predict(x_test)
print('Mean Absolute Error = ', metrics.median_absolute_error(y_test,pred))
print('Root Mean Square Error = ',my_rmse(y_test,pred))
print("Our model submission performance : ",1-(metrics.median_absolute_error(y_test,pred)/1600))

Mean Absolute Error =  490.6946000000001
Root Mean Square Error =  625577.2205853199
Our model submission performance :  0.6933158749999999


Even we tried for max_features and max_Depth combination MAE is not reduced its better that we go with the Random forest

##### Building model on full training data and submission

In [None]:
x_training = training_data.drop('Counterfeit_Sales',axis='columns')
y_training = training_data['Counterfeit_Sales']

In [None]:
lr.fit(x_training,y_training)
pred = lr.predict(production_data)

In [None]:
production_medID = pd.read_csv(file_path + 'counterfeit_test.csv')

In [None]:
production_medID = production_medID['Medicine_ID']

In [None]:
submission_df = pd.DataFrame({"Medicine_ID": production_medID,
                 'Counterfeit_Sales': pred})

In [None]:
submission_df.to_csv('part2 project3.csv',index=False)

# Conclusion

### Linear Regression

Mean Absolute Error =  615.9795920660683

Root Mean Square Error =  641656.7346209903

Our model submission performance :  0.6150127549587072

### Decision Tree

Mean Absolute Error =  488.0314000000002

Root Mean Square Error =  625993.4594418913

Our model submission performance :  0.6949803749999999
