In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.utils import resample

# importing ploting libraries
import matplotlib.pyplot as plt 

#importing seaborn for statistical plots
import seaborn as sns
%matplotlib inline
# Suppress Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
customer_data = pd.read_csv('BlackFriday.csv')

In [3]:
customer_data = customer_data.drop(columns=['Product_Category_2', 'Product_Category_3'], inplace=False,axis=1)

In [36]:
del customer

In [37]:
customer = customer_data.copy()

In [38]:
customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 10 columns):
User_ID                       537577 non-null int64
Product_ID                    537577 non-null object
Gender                        537577 non-null object
Age                           537577 non-null object
Occupation                    537577 non-null int64
City_Category                 537577 non-null object
Stay_In_Current_City_Years    537577 non-null object
Marital_Status                537577 non-null int64
Product_Category_1            537577 non-null int64
Purchase                      537577 non-null int64
dtypes: int64(5), object(5)
memory usage: 41.0+ MB


## Feature Engineering

In [39]:
customer['User_ID'] = customer['User_ID'].astype('str', copy=False)
customer['Occupation'] = customer['Occupation'].astype('str', copy=False)
customer['Marital_Status'] =  customer['Marital_Status'].map({0 : 'UnMarried', 1: 'Married'})
customer['Product_Category_1'] = customer['Product_Category_1'].astype('str', copy=False)

# Setting all the categorical columns to type category
for col in set(customer.columns) - set(customer.describe().columns):
    customer[col] = customer[col].astype('category')
    

print(customer.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 10 columns):
User_ID                       537577 non-null category
Product_ID                    537577 non-null category
Gender                        537577 non-null category
Age                           537577 non-null category
Occupation                    537577 non-null category
City_Category                 537577 non-null category
Stay_In_Current_City_Years    537577 non-null category
Marital_Status                537577 non-null category
Product_Category_1            537577 non-null category
Purchase                      537577 non-null int64
dtypes: category(9), int64(1)
memory usage: 10.1 MB
None


## Ordinal Transformation of **AGE** and **Stay_In_Current_City_Years**

In [40]:
# Giving Age Numerical values
age_dict = {'0-17':1, '18-25':2, '26-35':3, '36-45':4, '46-50':5, '51-55':6, '55+':7}
customer["Age"] = customer["Age"].apply(lambda line: age_dict[line])

customer["Age"].value_counts()

3    214690
4    107499
2     97634
5     44526
6     37618
7     20903
1     14707
Name: Age, dtype: int64

In [41]:
stay_dict = {'0': 0, '1':1, '2':2, '3':3, '4+':5 }

customer['Stay_In_Current_City_Years'] = customer['Stay_In_Current_City_Years'].apply(lambda line: stay_dict[line])

In [42]:
customer["Age"] = customer["Age"].astype('int32', copy=False)

customer['Stay_In_Current_City_Years'] = customer['Stay_In_Current_City_Years'].astype('int32', copy=False)

# Frequency Count for each Category

In [43]:
# feature representing the count of each user
def getCountVar(compute_df, count_df, var_name):
    grouped_df = count_df.groupby(var_name)
    count_dict = {}
    for name, group in grouped_df:
        count_dict[name] = group.shape[0]

    count_list = []
    for index, row in compute_df.iterrows():
        name = row[var_name]
        count_list.append(count_dict.get(name, 0))
    return count_list



In [44]:
customer['ID_Counts'] = getCountVar(customer, customer, 'User_ID')
customer['Product_ID_Counts'] = getCountVar(customer, customer, 'Product_ID')
customer['Age_Counts'] = getCountVar(customer, customer, 'Age')
customer['Gender_Counts'] = getCountVar(customer, customer, 'Gender')
customer['Occupation_Counts'] = getCountVar(customer, customer, 'Occupation')
customer['City_Category_Counts'] = getCountVar(customer, customer, 'City_Category')
customer['Stay_In_Current_City_Years_Counts'] = getCountVar(customer, customer, 'Stay_In_Current_City_Years')
customer['Marital_Status_Years_Counts'] = getCountVar(customer, customer, 'Marital_Status')
customer['Product_Category_1_Counts'] = getCountVar(customer, customer, 'Product_Category_1')

## Polychotomization

In [45]:
# Setting all the categorical columns to type category
for col in set(customer.columns) - set(customer.describe().columns):
    customer[col] = customer[col].astype('category')
    

print(customer.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537577 entries, 0 to 537576
Data columns (total 19 columns):
User_ID                              537577 non-null category
Product_ID                           537577 non-null category
Gender                               537577 non-null category
Age                                  537577 non-null int32
Occupation                           537577 non-null category
City_Category                        537577 non-null category
Stay_In_Current_City_Years           537577 non-null int32
Marital_Status                       537577 non-null category
Product_Category_1                   537577 non-null category
Purchase                             537577 non-null int64
ID_Counts                            537577 non-null int64
Product_ID_Counts                    537577 non-null int64
Age_Counts                           537577 non-null int64
Gender_Counts                        537577 non-null int64
Occupation_Counts                    53757

In [47]:
features = list(customer.columns)
features.remove('Purchase')
features

['Gender',
 'Age',
 'Occupation',
 'City_Category',
 'Stay_In_Current_City_Years',
 'Marital_Status',
 'Product_Category_1',
 'ID_Counts',
 'Product_ID_Counts',
 'Age_Counts',
 'Gender_Counts',
 'Occupation_Counts',
 'City_Category_Counts',
 'Stay_In_Current_City_Years_Counts',
 'Marital_Status_Years_Counts',
 'Product_Category_1_Counts']

## Dropping User ID and Product ID

In [46]:
customer = customer.drop(columns=['User_ID','Product_ID'], inplace=False, axis=1)

In [28]:
customer.index = np.arange(len(customer))

In [48]:
customer_polychot = pd.get_dummies(data=customer[features], drop_first=True)

## Modeling

In [49]:
x = customer_polychot.copy()
y = customer_data['Purchase']

In [50]:
## Adjusted R^2
def AdjRsquare(model, x, y):
    Rsquare = model.score(x, y)
    NoData = len(y)
    p = x.shape[1]
    tempRsquare = 1 - (1-Rsquare)*(NoData - 1
                                  )/(NoData - p - 1)
    return tempRsquare

In [51]:
## Combine all the steps to test the model performance
def linRegcheckModelPerformance(x, y):
    model = LinearRegression()
    # Covert data into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state = 666, shuffle= True)
    # Build model with train data set
    model.fit(x_train, y_train)
    # Train accuracies
    trainR2 = model.score(x_train, y_train)
    predictedPurchaseTrain= model.predict(x_train)
    TrainMse = metrics.mean_squared_error(y_train, predictedPurchaseTrain)
    trainRmse = np.sqrt(TrainMse)
    trainRmsePct = trainRmse/np.mean(np.mean(np.array(y_train)))*100
    trainAdjR2 = AdjRsquare(model, x_train, y_train)
    trainAccuracies = [trainRmse, trainRmsePct, trainR2, trainAdjR2]
    # Test accuracies
    testR2 = model.score(x_test, y_test)
    predictedPurchaseTest = model.predict(x_test)
    TestMse = metrics.mean_squared_error( y_test, predictedPurchaseTest)
    testRmse = np.sqrt(TestMse)
    testRmsePct = testRmse/np.mean(np.mean(np.array(y_test)))*100
    testAdjR2 = AdjRsquare(model, x_test, y_test)
    testAccuracies = [testRmse, testRmsePct, testR2, testAdjR2]
    # Create dataframe for results
    resultsDf = pd.DataFrame(index = ["rmse", "rmsePct", "r2", "adjR2"])
    resultsDf['trainAccuracy'] = trainAccuracies
    resultsDf['testAccuracy'] = testAccuracies
    return ( round(resultsDf, 2))

In [52]:
linRegcheckModelPerformance(x,y)

Unnamed: 0,trainAccuracy,testAccuracy
rmse,2903.84,2906.96
rmsePct,31.12,31.09
r2,0.66,0.66
adjR2,0.66,0.66


# Regularization

In [53]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state = 666, shuffle= True)

In [67]:
# Test options and evaluation metric
num_folds = 10
seed = 666
scoring = 'neg_mean_squared_error'

# Algorithms to be compared
models = []
models.append(('Ridge', Ridge()))
models.append(('LASSO', Lasso()))
models.append(('Elastic Net', ElasticNet()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    rmseResults = np.sqrt(-cv_results)
    priceMean = np.float64(y_train.mean())
    rmsePercent = rmseResults/priceMean*100
    results.append(rmsePercent)
    names.append(name)
    msg = "%s: %f (%f)" % (name, rmseResults.mean(), rmseResults.std())
    print(msg)

Ridge: 2904.220361 (9.699934)
LASSO: 2904.763615 (9.815218)
Elastic Net: 4411.038771 (16.609825)


## Model Performance and Hypertuning

In [79]:
finalmodel = Ridge(random_state=666)

In [85]:
params = dict({'alpha':[0.1,0.25,0.35,0.45,0.5,0.75,0.8,0.85,1,10],'tol':[0.000001,0.00001,0.0001,0.001,0.01]})

In [86]:
GridRidge = GridSearchCV(finalmodel,param_grid=params, cv=5, scoring=scoring, n_jobs=-1)

In [87]:
GridRidge.fit(x_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=666, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'alpha': [0.1, 0.25, 0.35, 0.45, 0.5, 0.75, 0.8, 0.85, 1, 10], 'tol': [1e-06, 1e-05, 0.0001, 0.001, 0.01]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [88]:
GridRidge.best_params_

{'alpha': 0.75, 'tol': 1e-06}

In [89]:
## Choosing the final model

In [92]:
## Combine all the steps to test the model performance
def FinalModelPerformance(x, y):
    model = Ridge(alpha=0.75, tol=1e-06, random_state=666)
    # Covert data into train and test
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state = 666, shuffle= True)
    # Build model with train data set
    model.fit(x_train, y_train)
    # Train accuracies
    trainR2 = model.score(x_train, y_train)
    predictedPurchaseTrain= model.predict(x_train)
    TrainMse = metrics.mean_squared_error(y_train, predictedPurchaseTrain)
    trainRmse = np.sqrt(TrainMse)
    trainRmsePct = trainRmse/np.mean(np.mean(np.array(y_train)))*100
    trainAdjR2 = AdjRsquare(model, x_train, y_train)
    trainAccuracies = [trainRmse, trainRmsePct, trainR2, trainAdjR2]
    # Test accuracies
    testR2 = model.score(x_test, y_test)
    predictedPurchaseTest = model.predict(x_test)
    TestMse = metrics.mean_squared_error( y_test, predictedPurchaseTest)
    testRmse = np.sqrt(TestMse)
    testRmsePct = testRmse/np.mean(np.mean(np.array(y_test)))*100
    testAdjR2 = AdjRsquare(model, x_test, y_test)
    testAccuracies = [testRmse, testRmsePct, testR2, testAdjR2]
    # Create dataframe for results
    resultsDf = pd.DataFrame(index = ["rmse", "rmsePct", "r2", "adjR2"])
    resultsDf['trainAccuracy'] = trainAccuracies
    resultsDf['testAccuracy'] = testAccuracies
    return ( round(resultsDf, 2))

In [93]:
FinalModelPerformance(x, y)

Unnamed: 0,trainAccuracy,testAccuracy
rmse,2903.84,2906.96
rmsePct,31.12,31.09
r2,0.66,0.66
adjR2,0.66,0.66
