## A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.

## Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.

### importing libraries 

In [1]:
import numpy as np 
import pandas as pd
import category_encoders as ce


### Loading data into df and exploring data 

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
df.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  object 
 3   Age                         550068 non-null  object 
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  object 
 6   Stay_In_Current_City_Years  550068 non-null  object 
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          376430 non-null  float64
 10  Product_Category_3          166821 non-null  float64
 11  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 50.4+ MB


In [6]:
for c in df.columns:
    print(c)
    print(df[c].describe())
    print()

User_ID
count    5.500680e+05
mean     1.003029e+06
std      1.727592e+03
min      1.000001e+06
25%      1.001516e+06
50%      1.003077e+06
75%      1.004478e+06
max      1.006040e+06
Name: User_ID, dtype: float64

Product_ID
count        550068
unique         3631
top       P00265242
freq           1880
Name: Product_ID, dtype: object

Gender
count     550068
unique         2
top            M
freq      414259
Name: Gender, dtype: object

Age
count     550068
unique         7
top        26-35
freq      219587
Name: Age, dtype: object

Occupation
count    550068.000000
mean          8.076707
std           6.522660
min           0.000000
25%           2.000000
50%           7.000000
75%          14.000000
max          20.000000
Name: Occupation, dtype: float64

City_Category
count     550068
unique         3
top            B
freq      231173
Name: City_Category, dtype: object

Stay_In_Current_City_Years
count     550068
unique         5
top            1
freq      193821
Name: Stay_In_Cur

In [7]:
df.columns

Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

### converting columns data type into category

In [8]:
df.Gender = df.Gender.astype('category')
df.Occupation = df.Occupation.astype('category')
df.City_Category = df.City_Category.astype('category')
df.Stay_In_Current_City_Years = df.Stay_In_Current_City_Years.astype('category')
df.Marital_Status = df.Marital_Status.astype('category')
df.Age = df.Age.astype('category')


### using base n ecoder to encode the occupation and product_category_1

In [9]:
encoder1 = ce.BaseNEncoder(cols=['Occupation','Product_Category_1']
                          ,return_df=True,base=5)
df = encoder1.fit_transform(df)

In [10]:
df

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation_0,Occupation_1,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1_0,Product_Category_1_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,0,1,A,2,0,0,1,,,8370
1,1000001,P00248942,F,0-17,0,1,A,2,0,0,2,6.0,14.0,15200
2,1000001,P00087842,F,0-17,0,1,A,2,0,0,3,,,1422
3,1000001,P00085442,F,0-17,0,1,A,2,0,0,3,14.0,,1057
4,1000002,P00285442,M,55+,0,2,C,4+,0,0,4,,,7969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,1006033,P00372445,M,51-55,4,0,B,1,1,3,4,,,368
550064,1006035,P00375436,F,26-35,1,2,C,3,0,3,4,,,371
550065,1006036,P00375436,F,26-35,0,3,B,4+,1,3,4,,,137
550066,1006038,P00375436,F,55+,1,2,C,2,0,3,4,,,365


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 14 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   User_ID                     550068 non-null  int64   
 1   Product_ID                  550068 non-null  object  
 2   Gender                      550068 non-null  category
 3   Age                         550068 non-null  category
 4   Occupation_0                550068 non-null  int64   
 5   Occupation_1                550068 non-null  int64   
 6   City_Category               550068 non-null  category
 7   Stay_In_Current_City_Years  550068 non-null  category
 8   Marital_Status              550068 non-null  category
 9   Product_Category_1_0        550068 non-null  int64   
 10  Product_Category_1_1        550068 non-null  int64   
 11  Product_Category_2          376430 non-null  float64 
 12  Product_Category_3          166821 non-null  float64 
 13 

### creating dummy columns for all the categorical columns 

In [12]:
df = pd.get_dummies(df, columns=['Gender','Marital_Status','Age','City_Category','Stay_In_Current_City_Years'], prefix_sep='_',drop_first=True)

In [13]:
df.columns

Index(['User_ID', 'Product_ID', 'Occupation_0', 'Occupation_1',
       'Product_Category_1_0', 'Product_Category_1_1', 'Product_Category_2',
       'Product_Category_3', 'Purchase', 'Gender_M', 'Marital_Status_1',
       'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55',
       'Age_55+', 'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+'],
      dtype='object')

In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neural_network import MLPClassifier, MLPRegressor 
from sklearn.preprocessing import StandardScaler

from mord import LogisticIT

from dmba import classificationSummary, regressionSummary

In [15]:
predictors = ['Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55',
       'Age_55+', 'Occupation_0',
       'Occupation_1', 'Marital_Status_1', 'Product_Category_1_0',
       'Product_Category_1_1', 
        'City_Category_B', 'City_Category_C',
       'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2',
       'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+']
outcome = 'Purchase'


X = df[predictors]
y = df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, 
                            test_size=0.4, random_state=1)



In [16]:
train_X

Unnamed: 0,Gender_M,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_0,Occupation_1,Marital_Status_1,Product_Category_1_0,Product_Category_1_1,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
77070,1,0,1,0,0,0,0,2,2,0,0,2,1,0,1,0,0,0
522751,1,0,1,0,0,0,0,0,4,1,0,2,0,0,0,1,0,0
63929,1,0,0,0,0,1,0,0,4,0,0,2,0,1,1,0,0,0
428429,1,1,0,0,0,0,0,3,3,0,0,4,1,0,1,0,0,0
309394,0,0,1,0,0,0,0,0,4,0,2,4,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371403,0,0,1,0,0,0,0,0,4,0,0,4,1,0,0,0,1,0
491263,0,0,1,0,0,0,0,1,2,0,0,2,0,1,1,0,0,0
470924,0,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,1
491755,1,0,1,0,0,0,0,2,0,0,1,0,0,1,1,0,0,0


### creating neural network model

In [17]:
black_reg = MLPRegressor(hidden_layer_sizes=(9), 
                solver='lbfgs', max_iter=10000, random_state=1)
black_reg.fit(train_X, train_y)

In [18]:
price_pred = np.round(black_reg.predict(valid_X), decimals=2)

# Create data frame to display prediction results for
# validation set. 
price_pred_result = pd.DataFrame({'Actual': valid_y, 
                'Prediction': price_pred, 'Residual': valid_y-price_pred})

print('Predictions for Toyota Price for Validation Partition')
print(price_pred_result.head(10))

Predictions for Toyota Price for Validation Partition
        Actual  Prediction  Residual
470060    8013    11960.53  -3947.53
395974    2769     8444.87  -5675.87
14234     5952    10045.57  -4093.57
72183     9914    11439.60  -1525.60
272536   15472     9173.19   6298.81
16702     6943     7590.66   -647.66
452591    5215     7510.62  -2295.62
183076   11734     9767.57   1966.43
385861   15551    10171.94   5379.06
215795   13406    10099.68   3306.32


### for test data


In [20]:
test =  pd.read_csv('test.csv')
test

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0
...,...,...,...,...,...,...,...,...,...,...,...
233594,1006036,P00118942,F,26-35,15,B,4+,1,8,,
233595,1006036,P00254642,F,26-35,15,B,4+,1,5,8.0,
233596,1006036,P00031842,F,26-35,15,B,4+,1,1,5.0,12.0
233597,1006037,P00124742,F,46-50,1,C,4+,0,10,16.0,


In [21]:
test.Gender = test.Gender.astype('category')
test.Occupation = test.Occupation.astype('category')
test.City_Category = test.City_Category.astype('category')
test.Stay_In_Current_City_Years = test.Stay_In_Current_City_Years.astype('category')
test.Marital_Status = test.Marital_Status.astype('category')
test.Age = test.Age.astype('category')

In [22]:
test = encoder1.fit_transform(test)

In [23]:
test = pd.get_dummies(test, columns=['Gender','Marital_Status','Age','City_Category','Stay_In_Current_City_Years'], prefix_sep='_',drop_first=True)

In [24]:
test.columns

Index(['User_ID', 'Product_ID', 'Occupation_0', 'Occupation_1',
       'Product_Category_1_0', 'Product_Category_1_1', 'Product_Category_2',
       'Product_Category_3', 'Gender_M', 'Marital_Status_1', 'Age_18-25',
       'Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+',
       'City_Category_B', 'City_Category_C', 'Stay_In_Current_City_Years_1',
       'Stay_In_Current_City_Years_2', 'Stay_In_Current_City_Years_3',
       'Stay_In_Current_City_Years_4+'],
      dtype='object')

In [25]:
out =  black_reg.predict(test[predictors])


In [26]:
out

array([ 9062.55686395, 10519.80992817,  9954.81366606, ...,
        8781.70721056,  8372.72412478, 10261.61003819])

In [27]:
output = pd.DataFrame()
output['Purchase'] = out
output['User_ID'] = test['User_ID']
output['Product_ID'] = test['Product_ID']

In [28]:
output.reset_index ()
output.to_csv("output1.csv")

In [29]:
output

Unnamed: 0,Purchase,User_ID,Product_ID
0,9062.556864,1000004,P00128942
1,10519.809928,1000009,P00113442
2,9954.813666,1000010,P00288442
3,10598.880556,1000010,P00145342
4,11271.195085,1000011,P00053842
...,...,...,...
233594,9918.156718,1006036,P00118942
233595,10069.840990,1006036,P00254642
233596,8781.707211,1006036,P00031842
233597,8372.724125,1006037,P00124742
