## 1. Initial data preparation: read, clean, and create the train/test sets

### Importing required modules

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the imported
# libraries; consider narrowing the filter to specific warning categories.
warnings.filterwarnings('ignore')

### Reading Dataset

In [52]:
# Load the raw sales data; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('BlackFridaySales.csv')
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


### Cleaning Dataset

### Encoding the categorical variables

In [53]:
from sklearn.preprocessing import LabelEncoder
# A single encoder instance is reused for every categorical column below.
# Each fit_transform call refits it, so `lr` only remembers the classes of the
# column it was fitted on most recently.
lr = LabelEncoder()

In [54]:
df['Gender'] = lr.fit_transform(df['Gender'])
# Encode the 'Gender' column as integers: F becomes 0 and M becomes 1.

In [55]:
df['Age'] = lr.fit_transform(df['Age'])
# Encode the 'Age' ranges as integer codes. LabelEncoder numbers the labels in
# sorted order, so '0-17' becomes 0 and '55+' becomes 6 (compare the df.head()
# outputs before and after encoding).

In [56]:
df['City_Category'] = lr.fit_transform(df['City_Category'])
# Encode 'City_Category' as integer codes in sorted label order
# (e.g. 'A' becomes 0 and 'C' becomes 2 in the df.head() outputs).

In [57]:
df['Stay_In_Current_City_Years'] = lr.fit_transform(df['Stay_In_Current_City_Years'])
# Encode 'Stay_In_Current_City_Years' as integer codes. The column holds strings
# such as '2' and '4+'; in the df.head() outputs '2' becomes 2 and '4+' becomes 4.

In [58]:
# Preview the frame again to confirm the four encoded columns now hold integers.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,0,0,10,0,2,0,3,,,8370
1,1000001,P00248942,0,0,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,0,0,10,0,2,0,12,,,1422
3,1000001,P00085442,0,0,10,0,2,0,12,14.0,,1057
4,1000002,P00285442,1,6,16,2,4,0,8,,,7969


### Replacing NULL values

In [59]:
# Replace the missing entries in the two optional product-category columns with
# the sentinel -2.0 and store the result as float32.
# NOTE(review): the sentinel is treated by a model as an ordinary numeric value,
# so it can influence the results; confirm this is acceptable for the chosen model.
for col in ('Product_Category_2', 'Product_Category_3'):
    df[col] = df[col].fillna(-2.0).astype("float32")

In [60]:
df.isnull().sum()
# Count the remaining NULL values per column; after the fill step above every
# count is expected to be 0.

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [61]:
# Show dtypes, non-null counts and memory usage after encoding and filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550068 entries, 0 to 550067
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   User_ID                     550068 non-null  int64  
 1   Product_ID                  550068 non-null  object 
 2   Gender                      550068 non-null  int32  
 3   Age                         550068 non-null  int32  
 4   Occupation                  550068 non-null  int64  
 5   City_Category               550068 non-null  int32  
 6   Stay_In_Current_City_Years  550068 non-null  int32  
 7   Marital_Status              550068 non-null  int64  
 8   Product_Category_1          550068 non-null  int64  
 9   Product_Category_2          550068 non-null  float32
 10  Product_Category_3          550068 non-null  float32
 11  Purchase                    550068 non-null  int64  
dtypes: float32(2), int32(4), int64(5), object(1)
memory usage: 37.8+ MB


In [62]:
# Count the distinct values in each column to see which ones are high-cardinality.
df.nunique()

User_ID                        5891
Product_ID                     3631
Gender                            2
Age                               7
Occupation                       21
City_Category                     3
Stay_In_Current_City_Years        5
Marital_Status                    2
Product_Category_1               20
Product_Category_2               18
Product_Category_3               16
Purchase                      18105
dtype: int64

### Dropping the irrelevant columns

In [63]:
# Drop the identifier columns 'User_ID' and 'Product_ID': each has thousands of
# distinct values (see the df.nunique() output), and 'Product_ID' is still a
# string column, so neither is used as a model feature here.
# errors="ignore" keeps this cell safe to re-run: `df` is overwritten, so without
# it a second execution would raise KeyError because the columns are already gone.
df = df.drop(columns=["User_ID", "Product_ID"], errors="ignore")

### Splitting data into independent and dependent variables

In [64]:
# Independent variables: every remaining column except the target 'Purchase'.
X = df.drop(columns=["Purchase"])

In [65]:
# Dependent variable: the 'Purchase' column.
y=df['Purchase']

### Creating training and testing datasets

In [66]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Report the shape of each partition as a quick sanity check.
for label, part in (
    ("x_train shape:", x_train),
    ("x_test shape:", x_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

x_train shape: (412551, 9)
x_test shape: (137517, 9)
y_train shape: (412551,)
y_test shape: (137517,)
