In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from dmba import regressionSummary

from sklearn.neural_network import MLPRegressor 

no display found. Using non-interactive Agg backend


### Training

In [2]:
training_df = pd.read_csv('train.csv')
training_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [3]:
for c in training_df.columns:
    print(c)
    print(training_df[c].describe())
    print()

User_ID
count    5.500680e+05
mean     1.003029e+06
std      1.727592e+03
min      1.000001e+06
25%      1.001516e+06
50%      1.003077e+06
75%      1.004478e+06
max      1.006040e+06
Name: User_ID, dtype: float64

Product_ID
count        550068
unique         3631
top       P00265242
freq           1880
Name: Product_ID, dtype: object

Gender
count     550068
unique         2
top            M
freq      414259
Name: Gender, dtype: object

Age
count     550068
unique         7
top        26-35
freq      219587
Name: Age, dtype: object

Occupation
count    550068.000000
mean          8.076707
std           6.522660
min           0.000000
25%           2.000000
50%           7.000000
75%          14.000000
max          20.000000
Name: Occupation, dtype: float64

City_Category
count     550068
unique         3
top            B
freq      231173
Name: City_Category, dtype: object

Stay_In_Current_City_Years
count     550068
unique         5
top            1
freq      193821
Name: Stay_In_Cur

In [4]:
training_df['Product_ID'] = training_df['Product_ID'].astype('category')
training_df['Gender'] = training_df['Gender'].astype('category')
training_df['Age'] = training_df['Age'].astype('category')
training_df['City_Category'] = training_df['City_Category'].astype('category')
training_df['Stay_In_Current_City_Years'] = training_df['Stay_In_Current_City_Years'].astype('category')

In [5]:
base_encoder = ce.BaseNEncoder(cols=['Product_ID','Occupation','Product_Category_1',
                                           'Product_Category_2','Product_Category_3']
                                     ,return_df=True,base=16)
training_df = base_encoder.fit_transform(training_df)

training_df = pd.get_dummies(training_df, columns=['Gender','Age','City_Category','Stay_In_Current_City_Years'],
                            prefix_sep='_', drop_first=True)

In [6]:
predicotrs = list(training_df.columns)
predicotrs.remove('Purchase')
predicotrs.remove('User_ID')
print('Predictors:')
print(predicotrs)
print()
print('Outcome:')
print('Purchase')

Predictors:
['Product_ID_0', 'Product_ID_1', 'Product_ID_2', 'Occupation_0', 'Occupation_1', 'Marital_Status', 'Product_Category_1_0', 'Product_Category_1_1', 'Product_Category_2_0', 'Product_Category_2_1', 'Product_Category_3_0', 'Product_Category_3_1', 'Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+', 'City_Category_B', 'City_Category_C', 'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2', 'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+']

Outcome:
Purchase


In [7]:
y = training_df['Purchase']
x = training_df[predicotrs]

In [8]:
x

Unnamed: 0,Product_ID_0,Product_ID_1,Product_ID_2,Occupation_0,Occupation_1,Marital_Status,Product_Category_1_0,Product_Category_1_1,Product_Category_2_0,Product_Category_2_1,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,0,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0,0,2,0,1,0,0,2,0,2,...,0,0,0,0,0,0,0,1,0,0
2,0,0,3,0,1,0,0,3,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,0,4,0,1,0,0,3,0,3,...,0,0,0,0,0,0,0,1,0,0
4,0,0,5,0,2,0,0,4,0,1,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550063,14,2,12,1,4,1,1,3,0,1,...,0,0,1,0,1,0,1,0,0,0
550064,14,2,11,0,7,0,1,3,0,1,...,0,0,0,0,0,1,0,0,1,0
550065,14,2,11,0,3,1,1,3,0,1,...,0,0,0,0,1,0,0,0,0,1
550066,14,2,11,0,7,0,1,3,0,1,...,0,0,0,1,0,1,0,1,0,0


In [9]:
y

0          8370
1         15200
2          1422
3          1057
4          7969
          ...  
550063      368
550064      371
550065      137
550066      365
550067      490
Name: Purchase, Length: 550068, dtype: int64

In [10]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.4, random_state=1)

#### Linear Regression

In [11]:
lr = LinearRegression()
lr.fit(x_train,y_train)

print('Linear Regression Model Parameters')
print()
print('Intercept: ',np.round(lr.intercept_, 2))
print()
print(
    pd.DataFrame({
        'Predictor': x.columns,
        'Coefficient': np.round(lr.coef_, 2)
    })
)

Linear Regression Model Parameters

Intercept:  8994.81

                        Predictor  Coefficient
0                    Product_ID_0      -341.68
1                    Product_ID_1        -6.37
2                    Product_ID_2       -42.26
3                    Occupation_0        18.29
4                    Occupation_1        -1.76
5                  Marital_Status       -60.21
6            Product_Category_1_0      3439.83
7            Product_Category_1_1      -148.59
8            Product_Category_2_0       234.87
9            Product_Category_2_1        87.31
10           Product_Category_3_0      4783.87
11           Product_Category_3_1       311.32
12                       Gender_M       539.41
13                      Age_18-25       320.15
14                      Age_26-35       499.44
15                      Age_36-45       565.81
16                      Age_46-50       539.61
17                      Age_51-55       840.75
18                        Age_55+       592.98
19 

In [12]:
lr_pred = lr.predict(x_valid)

print(round(pd.DataFrame({
    'Actual': y_valid,
    'Predicted': lr_pred,
    'Residual': y_valid-lr_pred
})))

        Actual  Predicted  Residual
470060    8013     9901.0   -1888.0
395974    2769    12112.0   -9343.0
14234     5952     8535.0   -2583.0
72183     9914     8575.0    1339.0
272536   15472    11364.0    4108.0
...        ...        ...       ...
100868    1042     6173.0   -5131.0
527216    4041     9045.0   -5004.0
223234    9925     8074.0    1851.0
178866   15682    12251.0    3431.0
13423     7100     8498.0   -1398.0

[220028 rows x 3 columns]


In [13]:
lr_pred_train = lr.predict(x_train)

print('Accuracy for Training Data')
print()
regressionSummary(y_train,lr_pred_train)

print('Accuracy for Testing Data')
print()
regressionSummary(y_valid,lr_pred)

Accuracy for Training Data


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 4659.4065
            Mean Absolute Error (MAE) : 3624.8713
          Mean Percentage Error (MPE) : -144.4536
Mean Absolute Percentage Error (MAPE) : 167.6113
Accuracy for Testing Data


Regression statistics

                      Mean Error (ME) : 2.7776
       Root Mean Squared Error (RMSE) : 4663.9709
            Mean Absolute Error (MAE) : 3626.6151
          Mean Percentage Error (MPE) : -143.0260
Mean Absolute Percentage Error (MAPE) : 166.2080


#### Neural Network

In [14]:
nn = MLPRegressor(hidden_layer_sizes=(10), 
                solver='lbfgs', max_iter=10000, random_state=1)
nn.fit(x_train,y_train)

print('Model Intercept: ', nn.intercepts_)
print()
print('Model Coeficients: ', nn.coefs_)

Model Intercept:  [array([ 1.41657012e+01,  1.20815516e+02,  2.32667041e-01,  2.59952011e+02,
       -2.41999882e+01, -4.24281531e-01,  5.62441104e+01, -2.32258344e+01,
       -2.87885377e+01, -1.25198467e-01]), array([13.89344582])]

Model Coeficients:  [array([[-2.36743252e+01,  1.23451281e+02, -7.55831024e-01,
        -2.20756352e+00,  1.18275332e+02, -1.71322639e+00,
        -2.00337734e+01,  6.15917145e+00,  1.32651960e+01,
        -5.94266838e-01],
       [ 2.31070656e+01,  2.29903994e+01, -1.92070295e+00,
        -5.87064295e-01,  2.42744190e+01, -5.20341480e+00,
         1.56483905e+01, -6.85325433e+01, -7.94350944e+01,
        -1.28696917e+00],
       [-7.02999060e+00,  4.36573238e+01, -4.59627155e-01,
         1.15998258e+00,  1.89390197e+01, -6.16707555e+00,
         1.85346803e+01,  1.59293343e+00,  3.33315813e+00,
        -1.69191455e+00],
       [ 4.13884505e+00,  1.65507425e+01,  5.72099568e-01,
         6.23467505e-02,  5.17046041e-02, -2.81954329e-01,
         1.872569

In [15]:
nn_pred = nn.predict(x_valid)

print(round(pd.DataFrame({
    'Actual': y_valid,
    'Predicted': nn_pred,
    'Residual': y_valid-nn_pred
})))

        Actual  Predicted  Residual
470060    8013     8654.0    -641.0
395974    2769     8481.0   -5712.0
14234     5952     7174.0   -1222.0
72183     9914     7887.0    2027.0
272536   15472    11579.0    3893.0
...        ...        ...       ...
100868    1042     9736.0   -8694.0
527216    4041     8205.0   -4164.0
223234    9925     7669.0    2256.0
178866   15682    16065.0    -383.0
13423     7100     6437.0     663.0

[220028 rows x 3 columns]


In [16]:
nn_pred_train = nn.predict(x_train)

print('Accuracy for Training Data')
print()
regressionSummary(y_train,nn_pred_train)

print('Accuracy for Testing Data')
print()
regressionSummary(y_valid,nn_pred)

Accuracy for Training Data


Regression statistics

                      Mean Error (ME) : 0.1410
       Root Mean Squared Error (RMSE) : 3902.8443
            Mean Absolute Error (MAE) : 2962.6778
          Mean Percentage Error (MPE) : -31.0287
Mean Absolute Percentage Error (MAPE) : 79.5250
Accuracy for Testing Data


Regression statistics

                      Mean Error (ME) : 5.6510
       Root Mean Squared Error (RMSE) : 3901.5870
            Mean Absolute Error (MAE) : 2958.2272
          Mean Percentage Error (MPE) : -31.2614
Mean Absolute Percentage Error (MAPE) : 79.3342


### Testing 

In [17]:
testing_df = pd.read_csv('test.csv')
testing_df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [18]:
for c in testing_df.columns:
    print(c)
    print(testing_df[c].describe())
    print()

User_ID
count    2.335990e+05
mean     1.003029e+06
std      1.726505e+03
min      1.000001e+06
25%      1.001527e+06
50%      1.003070e+06
75%      1.004477e+06
max      1.006040e+06
Name: User_ID, dtype: float64

Product_ID
count        233599
unique         3491
top       P00265242
freq            829
Name: Product_ID, dtype: object

Gender
count     233599
unique         2
top            M
freq      175772
Name: Gender, dtype: object

Age
count     233599
unique         7
top        26-35
freq       93428
Name: Age, dtype: object

Occupation
count    233599.000000
mean          8.085407
std           6.521146
min           0.000000
25%           2.000000
50%           7.000000
75%          14.000000
max          20.000000
Name: Occupation, dtype: float64

City_Category
count     233599
unique         3
top            B
freq       98566
Name: City_Category, dtype: object

Stay_In_Current_City_Years
count     233599
unique         5
top            1
freq       82604
Name: Stay_In_Cur

In [19]:
testing_df['Product_ID'] = testing_df['Product_ID'].astype('category')
testing_df['Gender'] = testing_df['Gender'].astype('category')
testing_df['Age'] = testing_df['Age'].astype('category')
testing_df['City_Category'] = testing_df['City_Category'].astype('category')
testing_df['Stay_In_Current_City_Years'] = testing_df['Stay_In_Current_City_Years'].astype('category')

In [20]:
base_encoder_test = ce.BaseNEncoder(cols=['Product_ID','Occupation','Product_Category_1',
                                           'Product_Category_2','Product_Category_3']
                                     ,return_df=True,base=16)
testing_df_encoded = base_encoder.fit_transform(testing_df)

testing_df_encoded = pd.get_dummies(testing_df_encoded, columns=['Gender','Age','City_Category','Stay_In_Current_City_Years'],
                            prefix_sep='_', drop_first=True)

In [21]:
test_predictors = list(testing_df_encoded.columns)
test_predictors.remove('User_ID')
test_x = testing_df_encoded[test_predictors]
test_x

Unnamed: 0,Product_ID_0,Product_ID_1,Product_ID_2,Occupation_0,Occupation_1,Marital_Status,Product_Category_1_0,Product_Category_1_1,Product_Category_2_0,Product_Category_2_1,...,Age_36-45,Age_46-50,Age_51-55,Age_55+,City_Category_B,City_Category_C,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,0,0,1,0,1,1,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
1,0,0,2,0,2,0,0,2,0,2,...,0,0,0,0,0,1,0,0,0,0
2,0,0,3,0,3,1,0,3,0,3,...,1,0,0,0,1,0,0,0,0,1
3,0,0,4,0,3,1,0,4,0,4,...,1,0,0,0,1,0,0,0,0,1
4,0,0,5,0,3,0,0,4,0,2,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233594,9,4,11,0,4,1,0,9,0,9,...,0,0,0,0,1,0,0,0,0,1
233595,5,12,14,0,4,1,0,3,0,10,...,0,0,0,0,1,0,0,0,0,1
233596,2,5,11,0,4,1,0,1,0,2,...,0,0,0,0,1,0,0,0,0,1
233597,1,4,14,0,3,0,0,6,0,14,...,0,1,0,0,0,1,0,0,0,1


#### Linear Regression

In [22]:
lr_purchase_prediction = lr.predict(test_x)

output_df = pd.DataFrame({
    'Purchase': lr_purchase_prediction,
    'User_ID': testing_df['User_ID'],
    'Product_ID': testing_df['Product_ID']
})

output_df

Unnamed: 0,Purchase,User_ID,Product_ID
0,10385.446164,1000004,P00128942
1,10624.184124,1000009,P00113442
2,9662.691230,1000010,P00288442
3,9559.145955,1000010,P00145342
4,10015.903399,1000011,P00053842
...,...,...,...
233594,5788.203382,1006036,P00118942
233595,7956.006075,1006036,P00254642
233596,9062.447991,1006036,P00031842
233597,9758.900753,1006037,P00124742


#### Neural Network

In [23]:
nn_purchase_prediction = nn.predict(test_x)

output_df = pd.DataFrame({
    'Purchase': nn_purchase_prediction,
    'User_ID': testing_df['User_ID'],
    'Product_ID': testing_df['Product_ID']
})

output_df

Unnamed: 0,Purchase,User_ID,Product_ID
0,15164.324703,1000004,P00128942
1,13801.254204,1000009,P00113442
2,9133.143964,1000010,P00288442
3,6176.948531,1000010,P00145342
4,8414.809033,1000011,P00053842
...,...,...,...
233594,7933.403551,1006036,P00118942
233595,7095.612506,1006036,P00254642
233596,15548.969250,1006036,P00031842
233597,8864.418408,1006037,P00124742


In [24]:
output_df.to_csv('purchase_value_prediction_nn.csv',encoding='utf-8')