# Counterfeit Medicines Sales Prediction
 

## Task
- to build predictive model for predicting sales figures given other information
  related to counterfeit medicine selling operations.

In [58]:
# Importing Libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model

##   Read the .csv and load into DataFrame

In [6]:
df = pd.read_csv('counterfeit_train.csv')
df.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [7]:
df.shape

(6818, 12)

## Calculating the null values.

In [8]:
df.isna().sum()

Medicine_ID               0
Counterfeit_Weight     1166
DistArea_ID               0
Active_Since              0
Medicine_MRP              0
Medicine_Type             0
SidEffect_Level           0
Availability_rating       0
Area_Type                 0
Area_City_Type            0
Area_dist_level           0
Counterfeit_Sales         0
dtype: int64

## Getting the total values

In [9]:
df.isna().sum().sum()

1166

##  Getting the null values in percentage

In [67]:
df_missing=pd.DataFrame((round(100*(df.isnull().sum()/len(df.index)), 2)), columns=['missing']) # It ill tell null values in percentage.
df_missing.sort_values(by=['missing'], ascending=False).head(20)

Unnamed: 0,missing
Medicine_ID,0.0
Counterfeit_Weight,0.0
DistArea_ID,0.0
Active_Since,0.0
Medicine_MRP,0.0
Medicine_Type,0.0
SidEffect_Level,0.0
Availability_rating,0.0
Area_Type,0.0
Area_City_Type,0.0


##  Filing the Missing Values

In [13]:
df['Counterfeit_Weight'] = df['Counterfeit_Weight'].fillna('None')

In [14]:
df.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [15]:
df.isna().sum().sum()

0

In [19]:
df.nunique()

Medicine_ID            1557
Counterfeit_Weight      415
DistArea_ID              10
Active_Since              9
Medicine_MRP           5097
Medicine_Type            16
SidEffect_Level           2
Availability_rating    6317
Area_Type                 4
Area_City_Type            3
Area_dist_level           4
Counterfeit_Sales      3142
dtype: int64

In [21]:
df.head()


Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [22]:
df['Medicine_Type'].value_counts()

Hreplacements         976
Antibiotics           970
Antiseptics           724
OralContraceptives    694
Antipyretics          536
Cardiac               522
Mstablizers           514
Tranquilizers         420
Analgesics            356
Antimalarial          339
Antacids              206
Statins               167
MuscleRelaxants       139
Antifungal            111
Stimulants             95
Antiviral              49
Name: Medicine_Type, dtype: int64

In [23]:
df.describe()

Unnamed: 0,Active_Since,Medicine_MRP,Availability_rating,Counterfeit_Sales
count,6818.0,6818.0,6818.0,6818.0
mean,1995.836316,151.401518,0.079174,2280.58348
std,8.368979,62.203961,0.051481,1693.354404
min,1983.0,41.79,0.013,146.29
25%,1985.0,104.5094,0.040058,933.2656
50%,1997.0,153.1957,0.066955,1902.6704
75%,2002.0,196.14835,0.107697,3207.6384
max,2007.0,277.1884,0.341391,13199.9648


In [26]:
min(df['Counterfeit_Sales'])

146.29

In [27]:
df.columns

Index(['Medicine_ID', 'Counterfeit_Weight', 'DistArea_ID', 'Active_Since',
       'Medicine_MRP', 'Medicine_Type', 'SidEffect_Level',
       'Availability_rating', 'Area_Type', 'Area_City_Type', 'Area_dist_level',
       'Counterfeit_Sales'],
      dtype='object')

# Taking the Significant Data Columns 

In [29]:
sig_df=df[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]
sig_df.head()

Unnamed: 0,Active_Since,Medicine_MRP,Availability_rating,Counterfeit_Sales
0,1995,160.2366,0.070422,1775.5026
1,1983,110.4384,0.013,3069.152
2,1995,259.4092,0.060783,2603.092
3,1995,99.983,0.065555,1101.713
4,1983,56.4402,0.248859,158.9402


## Get scatter plot between dependent and independent variables 

In [60]:

for i in sig_df[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]:
    plt.scatter(sig_df[i], sig_df['Counterfeit_Sales'], color='blue')
    plt.xlabel(i)
    plt.ylabel("Sales")
    

## Plotting Histogram 

In [61]:
viz = sig_df[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]
viz.hist()
plt.show()

##  Create Train and Test dataset

In [None]:
mask=np.random.rand(len(df)) < 0.80                  
train=sig_df[mask]
test=sig_df[~mask]

##  Checking Training data pattern

In [51]:
for i in train[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]:
    plt.scatter(train[i], train['Counterfeit_Sales'], color='blue')
    plt.xlabel(i)
    plt.ylabel("Sales")
    plt.show()

# Applying Linear Regression 

In [53]:
coefficient=[]
intercept=[]
regress_model={}
for i in train[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]:
    regr=linear_model.LinearRegression()
    train_x = np.asanyarray(train[[i]])
    train_y=  np.asanyarray(train[['Counterfeit_Sales']])
    regr.fit(train_x, train_y)
    regress_model[i]=regr
    print("Relation between {} & {}".format(i,"'Counterfeit_Sales'"))
    print ('Coefficients: ', regr.coef_)
    print ('Intercept: ',regr.intercept_)
    coefficient.append(regr.coef_)
    intercept.append(regr.intercept_)
    
    print('\n')

Relation between Active_Since & 'Counterfeit_Sales'
Coefficients:  [[-8.54941435]]
Intercept:  [19335.89051924]


Relation between Medicine_MRP & 'Counterfeit_Sales'
Coefficients:  [[15.44985306]]
Intercept:  [-66.31028873]


Relation between Availability_rating & 'Counterfeit_Sales'
Coefficients:  [[-4004.75294917]]
Intercept:  [2590.07589754]


Relation between Counterfeit_Sales & 'Counterfeit_Sales'
Coefficients:  [[1.]]
Intercept:  [2.72848411e-12]




### Use regression parameter to model linear equation 

In [57]:
j=0
for i in train[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]:
    plt.scatter(train[i], train['Counterfeit_Sales'],  color='blue')
    x=train[i].values
    print(x.shape)
    y=coefficient[j][0]*x + intercept[j]
    print(y.shape)
    l=len(y)
    y=np.reshape(y,(l,))
    plt.plot(x, y, '-r')
    plt.xlabel(i)
    plt.ylabel("Sales")
    plt.show()
    j=j+1;

(5457,)
(5457,)
(5457,)
(5457,)
(5457,)
(5457,)
(5457,)
(5457,)


##  Evaluate model performance on test data 

In [65]:
from sklearn.metrics import r2_score

for i in train[['Active_Since','Medicine_MRP','Availability_rating','Counterfeit_Sales']]:
    test_x = np.asanyarray(test[[i]])
    test_y = np.asanyarray(test[['Counterfeit_Sales']])
    test_y_ = regress_model[i].predict(test_x)
    print("Fitting Error between {} & {}".format(i,"'Sales'"))
    print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
    print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y) ** 2))
    print("R2-score: %.2f" % r2_score(test_y_ , test_y) )
    print('\n')

Fitting Error between Active_Since & 'Sales'
Mean absolute error: 1347.37
Residual sum of squares (MSE): 2958976.05
R2-score: -565.76


Fitting Error between Medicine_MRP & 'Sales'
Mean absolute error: 1034.21
Residual sum of squares (MSE): 1949366.22
R2-score: -1.14


Fitting Error between Availability_rating & 'Sales'
Mean absolute error: 1331.87
Residual sum of squares (MSE): 2909832.88
R2-score: -66.60


Fitting Error between Counterfeit_Sales & 'Sales'
Mean absolute error: 0.00
Residual sum of squares (MSE): 0.00
R2-score: 1.00


