In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Work from the folder that holds train.csv, test.csv and sample_submission.csv so the
# relative read_csv/to_csv paths in the later cells resolve. Set the PREPROCESSOR_DIR
# environment variable to point somewhere else; when it is unset, the original
# machine-specific location is used, so existing behaviour is unchanged.
os.chdir(os.environ.get('PREPROCESSOR_DIR', r'C:\Users\91938\Desktop\Preprocessor'))

In [3]:
# Load the training rows from the working directory (relative path).
train = pd.read_csv('train.csv')

In [4]:
# (rows, columns) of the training data, before any preprocessing.
train.shape

(550068, 12)

In [5]:
# Load the rows that need a predicted Purchase (relative path, same folder as train.csv).
test = pd.read_csv('test.csv')

In [6]:
# (rows, columns) of the test data; compare the column count with train.shape above.
test.shape

(233599, 11)

In [7]:
# Load the sample submission to see which columns a result file is expected to carry.
sample = pd.read_csv('sample_submission.csv')

In [8]:
# Preview the submission layout: Purchase, User_ID and Product_ID columns.
sample.head()

Unnamed: 0,Purchase,User_ID,Product_ID
0,100,1000004,P00128942
1,100,1000009,P00113442
2,100,1000010,P00288442
3,100,1000010,P00145342
4,100,1000011,P00053842


In [9]:
# Summary statistics for every column (include='all' covers non-numeric columns too),
# transposed so that each original column becomes one row of the report.
train.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
User_ID,550068.0,,,,1003028.842401,1727.591586,1000001.0,1001516.0,1003077.0,1004478.0,1006040.0
Product_ID,550068.0,3631.0,P00265242,1880.0,,,,,,,
Gender,550068.0,2.0,M,414259.0,,,,,,,
Age,550068.0,7.0,26-35,219587.0,,,,,,,
Occupation,550068.0,,,,8.076707,6.52266,0.0,2.0,7.0,14.0,20.0
City_Category,550068.0,3.0,B,231173.0,,,,,,,
Stay_In_Current_City_Years,550068.0,5.0,1,193821.0,,,,,,,
Marital_Status,550068.0,,,,0.409653,0.49177,0.0,0.0,0.0,1.0,1.0
Product_Category_1,550068.0,,,,5.40427,3.936211,1.0,1.0,5.0,8.0,20.0
Product_Category_2,376430.0,,,,9.842329,5.08659,2.0,5.0,9.0,15.0,18.0


In [10]:
# Tag every row with the frame it came from, so the two sets can be separated
# again after they have been preprocessed together.
for frame, origin in ((train, 'train'), (test, 'test')):
    frame['source'] = origin

In [11]:
# Stack train on top of test so both receive identical encodings. The original row
# labels are kept (not renumbered), so index values repeat across the two sources;
# use the 'source' column, not the index, to tell the rows apart.
frames_to_stack = [train, test]
data = pd.concat(frames_to_stack)

In [12]:
# First ten rows of the combined frame, to check the stacked columns and the source tag.
data.head(10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,source
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370.0,train
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200.0,train
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422.0,train
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057.0,train
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969.0,train
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227.0,train
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215.0,train
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854.0,train
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686.0,train
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871.0,train


In [13]:
# Dtypes and non-null counts per column of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     783667 non-null  int64  
 1   Product_ID                  783667 non-null  object 
 2   Gender                      783667 non-null  object 
 3   Age                         783667 non-null  object 
 4   Occupation                  783667 non-null  int64  
 5   City_Category               783667 non-null  object 
 6   Stay_In_Current_City_Years  783667 non-null  object 
 7   Marital_Status              783667 non-null  int64  
 8   Product_Category_1          783667 non-null  int64  
 9   Product_Category_2          537685 non-null  float64
 10  Product_Category_3          237858 non-null  float64
 11  Purchase                    550068 non-null  float64
 12  source                      783667 non-null  object 
dtypes: float64(3),

In [14]:
# Missing-value count per column, to see which columns need null handling.
data.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            245982
Product_Category_3            545809
Purchase                      233599
source                             0
dtype: int64

In [15]:
# Preprocessing plan per column (as applied in the cells that follow):
# Gender = label-encode (LabelEncoder)
# Age = one-hot
# Occupation = one-hot
# City_Category = one-hot
# Stay_In_Current_City_Years = one-hot
# Marital_Status = one-hot
# Product_Category_1 = one-hot
# Product_Category_2 = has null values; one-hot (rows with a null get all-zero indicators)
# Product_Category_3 = has null values; one-hot (rows with a null get all-zero indicators)
# Purchase = target; null only for the rows that came from test.csv


In [16]:
# Label encoder instance; it is fitted on the Gender column in the next cell.
Encoder = LabelEncoder()

In [17]:
# Replace the Gender strings with integer codes, fitted on train and test rows together.
# The previews before and after this cell show F -> 0 and M -> 1.
data['Gender'] = Encoder.fit_transform(data['Gender'])

In [18]:
# Confirm that Gender now holds integer codes instead of 'F'/'M'.
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,source
0,1000001,P00069042,0,0-17,10,A,2,0,3,,,8370.0,train
1,1000001,P00248942,0,0-17,10,A,2,0,1,6.0,14.0,15200.0,train
2,1000001,P00087842,0,0-17,10,A,2,0,12,,,1422.0,train
3,1000001,P00085442,0,0-17,10,A,2,0,12,14.0,,1057.0,train
4,1000002,P00285442,1,55+,16,C,4+,0,8,,,7969.0,train


In [19]:
# One-hot encode every remaining categorical/discrete feature in a single pass.
# get_dummies is called with its default NaN handling, so a row with a missing
# Product_Category_2/3 gets zeros in all of that column's indicator columns
# (no separate "missing" indicator is created).
one_hot_columns = [
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
data_one_hot = pd.get_dummies(data, columns=one_hot_columns)

In [20]:
# Preview the encoded frame: the listed columns are replaced by indicator columns.
data_one_hot.head()

Unnamed: 0,User_ID,Product_ID,Gender,Purchase,source,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,P00069042,0,8370.0,train,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,P00248942,0,15200.0,train,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,P00087842,0,1422.0,train,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,P00085442,0,1057.0,train,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,P00285442,1,7969.0,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Row count must be unchanged by the encoding; the column count shows how many indicators were added.
data_one_hot.shape

(783667, 95)

In [22]:
# Dtypes and non-null counts after encoding, to check the new indicator columns.
data_one_hot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 783667 entries, 0 to 233598
Data columns (total 95 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   User_ID                        783667 non-null  int64  
 1   Product_ID                     783667 non-null  object 
 2   Gender                         783667 non-null  int32  
 3   Purchase                       550068 non-null  float64
 4   source                         783667 non-null  object 
 5   Age_0-17                       783667 non-null  uint8  
 6   Age_18-25                      783667 non-null  uint8  
 7   Age_26-35                      783667 non-null  uint8  
 8   Age_36-45                      783667 non-null  uint8  
 9   Age_46-50                      783667 non-null  uint8  
 10  Age_51-55                      783667 non-null  uint8  
 11  Age_55+                        783667 non-null  uint8  
 12  Occupation_0                  

In [23]:
# Undo the stacking: route each encoded row back to its original set using the
# 'source' tag, then discard the tag because it is not a model feature.
source_labels = data_one_hot['source']
train_preprocessed = data_one_hot.loc[source_labels == 'train'].drop(columns=['source'])
test_preprocessed = data_one_hot.loc[source_labels == 'test'].drop(columns=['source'])

In [24]:
# Preview the encoded training rows (the source tag is gone, Purchase is still present).
train_preprocessed.head()

Unnamed: 0,User_ID,Product_ID,Gender,Purchase,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,P00069042,0,8370.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,P00248942,0,15200.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,P00087842,0,1422.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,P00085442,0,1057.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,P00285442,1,7969.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# The test rows carry only an empty Purchase column inherited from the concat, so
# remove it. errors='ignore' keeps this cell safe to re-run: because the cell
# reassigns test_preprocessed from itself, a second run would otherwise raise a
# KeyError once the column is already gone.
test_preprocessed = test_preprocessed.drop(columns=['Purchase'], errors='ignore')

In [26]:
# Preview the encoded test rows after the Purchase column has been removed.
test_preprocessed.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000004,P00128942,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000009,P00113442,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1000010,P00288442,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000010,P00145342,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000011,P00053842,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [27]:
# Feature matrix: everything except the target (Purchase) and the raw product
# identifier string. NOTE(review): User_ID stays in as a plain numeric feature;
# confirm that treating an identifier as a continuous input is intended.
non_feature_columns = ['Product_ID', 'Purchase']
x = train_preprocessed.drop(columns=non_feature_columns)

In [28]:
# Preview the feature matrix that will be passed to the model.
x.head()

Unnamed: 0,User_ID,Gender,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,Age_55+,Occupation_0,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Regression target: the Purchase value of each training row.
y = train_preprocessed['Purchase']

In [30]:
# Preview the target values.
y.head()

0     8370.0
1    15200.0
2     1422.0
3     1057.0
4     7969.0
Name: Purchase, dtype: float64

In [31]:
# Hold out 20% of the labelled rows for evaluation; the fixed random_state makes the
# split repeatable. Note that test_x/test_y are this hold-out split of the training
# data, not the separate `test` frame loaded from test.csv.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.2, random_state = 32)

In [32]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

In [33]:
# Fit on the 80% training partition only; the hold-out partition is kept for evaluation.
model.fit(train_x, train_y)

LinearRegression()

In [34]:
# Fitted coefficients, in the same order as the columns of train_x.
model.coef_

array([ 9.24970060e-03, -5.10644819e+01, -3.93632362e+01, -1.91997638e+02,
       -1.18746812e+02, -1.05522625e+01,  9.18450957e+00,  2.27976852e+02,
        1.23498587e+02, -6.63954890e+01, -1.26443220e+02, -3.40480676e+01,
        1.82107885e+02,  7.76391431e+01, -1.11389151e+01,  1.41995124e+02,
        3.72060368e+01, -4.17099182e+02,  5.02417811e+01, -8.53029594e+01,
        5.75672713e+01,  2.08374156e+02,  1.56180272e+01,  1.17055855e+02,
        2.78330563e+02,  4.46856240e+01,  1.20356767e+02, -7.49403677e+01,
       -3.53161933e+02, -1.62648098e+02, -2.26567345e+02, -9.36454924e+01,
        3.20212837e+02, -1.50492407e+01, -1.78527275e+01,  2.80459322e+01,
       -8.41379357e+00,  1.32698296e+01,  2.18885194e+01, -2.18885194e+01,
        4.34082083e+03,  2.59207179e+03,  2.14066817e+03, -6.58588559e+03,
       -2.93415149e+03,  6.48489461e+03,  7.31935922e+03, -1.67316874e+03,
        6.49789105e+03,  1.05739801e+04, -4.38354806e+03, -7.81554109e+03,
       -8.42656617e+03,  

In [35]:
# Fitted intercept term of the linear model.
model.intercept_

-77.99615181593254

### Predictions on the train and hold-out test splits

In [36]:
# Score both partitions with the fitted model, then convert each mean squared error
# to an RMSE so the error is expressed in the same units as Purchase.
train_x_pred, test_x_pred = model.predict(train_x), model.predict(test_x)

train_x_RMSE, test_x_RMSE = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((train_y, train_x_pred), (test_y, test_x_pred))
)

In [37]:
# Report the two RMSE values side by side; a large gap between them would point to overfitting.
for label, rmse_value in (("Train RMSE : ", train_x_RMSE), ("Test RMSE : ", test_x_RMSE)):
    print(label, rmse_value)

Train RMSE :  2977.9148461493673
Test RMSE :  2991.5717478867036


In [38]:
# R^2 on each partition (train first, then hold-out), printed one value per line.
r2_score_train, r2_score_test = (
    r2_score(actual, predicted)
    for actual, predicted in ((train_y, train_x_pred), (test_y, test_x_pred))
)
print(r2_score_train, r2_score_test, sep="\n")

0.6490301083091778
0.6432603340287755


In [70]:
# Score the rows from test.csv. The model inputs are selected by the training
# feature columns (x.columns) rather than by dropping Product_ID, so they are
# guaranteed to be the same set, in the same order, as the columns the model was
# fitted on. A missing training column raises a KeyError instead of silently
# feeding misaligned features to the model.
test_pred = model.predict(test_preprocessed[x.columns])

In [71]:
# Floor negative predictions at zero (presumably because a purchase amount cannot be
# negative). Work on a copy so the raw predictions in test_pred stay available.
negative_prediction = test_pred < 0
test_pred_modified = test_pred.copy()
test_pred_modified[negative_prediction] = 0

In [72]:
# Attach the predictions to the original test frame. The values are assigned by
# position, which relies on test_preprocessed having kept the row order of test.
test['Purchase'] = test_pred_modified

In [77]:
# Keep only the columns that go into the result file.
# NOTE(review): the sample submission preview lists Purchase before User_ID and
# Product_ID; confirm whether the column order matters to whoever consumes the file.
submission_columns = ["User_ID", "Product_ID", "Purchase"]
Output = test[submission_columns]

In [78]:
# Write the result file into the current working directory; index=False keeps the
# row index out of the CSV so that only the three selected columns are written.
Output.to_csv("Result_Blackfridaysales.csv", index = False)

In [75]:
# Check the first few rows of the result that was written.
Output.head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,13198.616897
1,1000009,P00113442,10975.19233
2,1000010,P00288442,5861.717364
3,1000010,P00145342,2201.454565
4,1000011,P00053842,2374.317555
