## 1. Initial data prep section. Read, clean and create the train/test sets.

### Importing required modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline

# Suppress every Python warning from this point on. This keeps cell output tidy,
# but it also hides deprecation notices raised by pandas / scikit-learn.
warnings.filterwarnings('ignore')

### Reading Dataset

In [7]:
# Load 'BlackFridaySalesResized.csv' (resolved relative to the notebook's working
# directory) into `data`, then display its first five rows.
data = pd.read_csv(filepath_or_buffer='BlackFridaySalesResized.csv')
data.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


### Statistical info

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# A `count` below the row total marks a column that contains missing values.
data.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,159999.0,159999.0,159999.0,159999.0,110328.0,48940.0,159999.0
mean,1002950.0,8.076107,0.409646,5.292614,9.852232,12.665529,9313.634785
std,1760.768,6.528858,0.49177,3.740822,5.085515,4.120158,4981.016028
min,1000001.0,0.0,0.0,1.0,2.0,3.0,185.0
25%,1001384.0,2.0,0.0,1.0,5.0,9.0,5852.0
50%,1002994.0,7.0,0.0,5.0,9.0,14.0,8056.0
75%,1004439.0,14.0,1.0,8.0,15.0,16.0,12064.0
max,1006040.0,20.0,1.0,18.0,18.0,18.0,23961.0


### Finding unique values

In [9]:
# Number of distinct values per column; NaN is counted as its own value.
data.nunique(dropna=False)

User_ID                        5883
Product_ID                     3429
Gender                            2
Age                               7
Occupation                       21
City_Category                     3
Stay_In_Current_City_Years        5
Marital_Status                    2
Product_Category_1               18
Product_Category_2               18
Product_Category_3               16
Purchase                      15228
dtype: int64

### Checking the NULL values in the data

In [10]:
# Missing-value count for every column of the raw frame.
data.isna().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2             49671
Product_Category_3            111059
Purchase                           0
dtype: int64

### Making a copy of the dataframe

In [11]:
# Independent (deep) copy, so that transformations applied to `df` never touch
# the raw `data` frame.
df = data.copy(deep=True)

### Converting categorical variable into dummy variables

In [14]:
# One-hot encode the length-of-stay bucket into one indicator column per value
# (Stay_In_Current_City_Years_0 ... Stay_In_Current_City_Years_4+).
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version; pandas >= 2.0 would otherwise return boolean columns.
# NOTE: this cell consumes the source column, so re-running it without first
# re-creating `df` raises a KeyError.
df = pd.get_dummies(df, columns=['Stay_In_Current_City_Years'], dtype=np.uint8)

### Encoding the categorical variables

In [15]:
from sklearn.preprocessing import LabelEncoder
# `lr` is a LabelEncoder (not a regression model). The same instance is refit for
# each column it encodes, so only the mapping of the most recently encoded column
# remains available on it afterwards.
lr = LabelEncoder()

In [16]:
# Replace the Gender labels with integer codes assigned in sorted label order
# ('F' -> 0, 'M' -> 1).
df['Gender'] = lr.fit_transform(df['Gender'])

In [17]:
# Replace the seven age-bracket labels with integer codes 0-6, assigned in sorted
# string order (e.g. '0-17' -> 0, '55+' -> 6).
df['Age'] = lr.fit_transform(df['Age'])

In [18]:
# Replace the three city-category labels with integer codes in sorted order
# (e.g. 'A' -> 0, 'C' -> 2).
df['City_Category'] = lr.fit_transform(df['City_Category'])

In [19]:
# Preview the frame after dummy and label encoding.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+
0,1000001,P00069042,0,0,10,0,0,3,,,8370,0,0,1,0,0
1,1000001,P00248942,0,0,10,0,0,1,6.0,14.0,15200,0,0,1,0,0
2,1000001,P00087842,0,0,10,0,0,12,,,1422,0,0,1,0,0
3,1000001,P00085442,0,0,10,0,0,12,14.0,,1057,0,0,1,0,0
4,1000002,P00285442,1,6,16,2,0,8,,,7969,0,0,0,0,1


In [20]:
# Missing secondary/tertiary product categories become the sentinel 0, after which
# the columns can be stored as plain integers.
for _col in ('Product_Category_2', 'Product_Category_3'):
    df[_col] = df[_col].fillna(0).astype('int64')

In [23]:
# Confirm that no column of the prepared frame still contains missing values.
df.isna().sum()

User_ID                          0
Product_ID                       0
Gender                           0
Age                              0
Occupation                       0
City_Category                    0
Marital_Status                   0
Product_Category_1               0
Product_Category_2               0
Product_Category_3               0
Purchase                         0
Stay_In_Current_City_Years_0     0
Stay_In_Current_City_Years_1     0
Stay_In_Current_City_Years_2     0
Stay_In_Current_City_Years_3     0
Stay_In_Current_City_Years_4+    0
dtype: int64

In [22]:
# Print column names, non-null counts and dtypes of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159999 entries, 0 to 159998
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype 
---  ------                         --------------   ----- 
 0   User_ID                        159999 non-null  int64 
 1   Product_ID                     159999 non-null  object
 2   Gender                         159999 non-null  int32 
 3   Age                            159999 non-null  int32 
 4   Occupation                     159999 non-null  int64 
 5   City_Category                  159999 non-null  int32 
 6   Marital_Status                 159999 non-null  int64 
 7   Product_Category_1             159999 non-null  int64 
 8   Product_Category_2             159999 non-null  int64 
 9   Product_Category_3             159999 non-null  int64 
 10  Purchase                       159999 non-null  int64 
 11  Stay_In_Current_City_Years_0   159999 non-null  uint8 
 12  Stay_In_Current_City_Years_1   159999 non-nu

### Dropping the irrelevant columns

In [24]:
# Remove the identifier columns; they are not used as model features.
# NOTE: re-running this cell without re-creating `df` raises a KeyError.
df = df.drop(columns=["User_ID", "Product_ID"])

### Splitting data into independent and dependent variables

In [26]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns="Purchase")

In [27]:
# Target vector: the purchase amount to be predicted.
y=df['Purchase']

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Report the shape of each partition.
for _label, _part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_label, _part.shape)

X_train shape: (111999, 13)
X_test shape: (48000, 13)
y_train shape: (111999,)
y_test shape: (48000,)


## 2. Explaining the choices of features for X and the target feature y

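Features: all columns except the target, after dropping the identifier columns `User_ID` and `Product_ID`. <br>
Target: `Purchase` (the amount spent). The model estimates how, and to what extent, `Purchase` depends on the features.<br><br>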

Based on the features, we are going to predict how much the customers will spend during Black Friday, using various features such as age, gender, marital status.<br>
The dataset is split into training data and testing data in the ratio 70:30 using the train_test_split() command.