In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings("ignore")

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
df = pd.read_csv("./data/black_friday_dataset_cleaned.csv")
df

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,Female,0-17,10,A,2,Unmarried,3,17,15,8370
1,Female,0-17,10,A,2,Unmarried,1,5,11,15200
2,Female,0-17,10,A,2,Unmarried,12,17,2,1422
3,Female,0-17,10,A,2,Unmarried,12,13,7,1057
4,Male,55+,16,C,3+,Unmarried,8,6,8,7969
...,...,...,...,...,...,...,...,...,...,...
547512,Male,51-55,13,B,1,Married,20,17,1,368
547513,Female,26-35,1,C,3,Unmarried,20,9,1,371
547514,Female,26-35,15,B,3+,Married,20,8,7,137
547515,Female,55+,1,C,2,Unmarried,20,11,2,365


In [4]:
get_unique_values = lambda col: df[col].unique()

In [5]:
for column_name in df.columns[:-1]:
    print(f"Column Name: {column_name}")
    print(f"Length: {len(get_unique_values(column_name))}")
    print(f"Unique Values: {sorted(get_unique_values(column_name))}")
    print()

Column Name: Gender
Length: 2
Unique Values: ['Female', 'Male']

Column Name: Age
Length: 7
Unique Values: ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']

Column Name: Occupation
Length: 21
Unique Values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

Column Name: City_Category
Length: 3
Unique Values: ['A', 'B', 'C']

Column Name: Stay_In_Current_City_Years
Length: 5
Unique Values: ['0', '1', '2', '3', '3+']

Column Name: Marital_Status
Length: 2
Unique Values: ['Married', 'Unmarried']

Column Name: Product_Category_1
Length: 20
Unique Values: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

Column Name: Product_Category_2
Length: 17
Unique Values: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

Column Name: Product_Category_3
Length: 15
Unique Values: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]



In [6]:
def custom_label_encoder(col, dictionary): df[col] = df[col].map(dictionary)

In [7]:
custom_label_encoder('Occupation', {0: 1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9, 9:10, 10:11, 11:12, 12:13, 13:14, 14:15, 15:16, 16:17, 17:18, 18:19, 19:20, 20:21})

In [8]:
# Making a copy of the original dataframe

In [9]:
# new_df = df.copy(deep=True)
# new_df

In [10]:
df.columns

Index(['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

In [11]:
subset_for_one_hot_encoding = df[['Gender', 'Occupation', 'Marital_Status', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']]

In [12]:
def one_hot_encode_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Create a copy of the DataFrame to avoid modifying the original
    df_encoded = df.copy()

    # Instantiate the OneHotEncoder
    encoder = OneHotEncoder(sparse=False)

    for column in df.columns:
        # Fit and transform the current column
        encoded_data = encoder.fit_transform(df[[column]])

        # Get the feature names for the one-hot encoded columns
        feature_names = encoder.get_feature_names_out([column])

        # Create a DataFrame from the encoded data with the appropriate column names
        df_encoded = pd.concat([df_encoded, pd.DataFrame(encoded_data, columns=feature_names)], axis=1)

        # Drop the original column
        df_encoded.drop([column], axis=1, inplace=True)

    return df_encoded

In [13]:
result_of_one_hot_encoding = one_hot_encode_columns(subset_for_one_hot_encoding)

In [14]:
result_of_one_hot_encoding.shape

(547517, 77)

In [15]:
subset_for_label_encoding = df[['Age', 'City_Category', 'Stay_In_Current_City_Years']]

In [16]:
def label_encode_columns(df):
    """
    Label encodes all categorical columns in a DataFrame and replaces the original columns.

    Parameters:
    - df (pd.DataFrame): Input DataFrame.

    Returns:
    - pd.DataFrame: DataFrame with categorical columns replaced by their label encoded representations.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_encoded = df.copy()

    # Instantiate the LabelEncoder
    encoder = LabelEncoder()

    for column in df.columns:
        # Fit and transform the current column
        encoded_data = encoder.fit_transform(df[column])

        # Create a new column with the label encoded data
        df_encoded[column + '_encoded'] = encoded_data

        # Drop the original column
        df_encoded.drop([column], axis=1, inplace=True)

    return df_encoded

In [17]:
result_of_label_encoding = label_encode_columns(subset_for_label_encoding)

In [18]:
result_of_label_encoding.shape

(547517, 3)

In [19]:
result_of_one_hot_encoding.dtypes

Gender_Female            float64
Gender_Male              float64
Occupation_1             float64
Occupation_2             float64
Occupation_3             float64
                          ...   
Product_Category_3_11    float64
Product_Category_3_12    float64
Product_Category_3_13    float64
Product_Category_3_14    float64
Product_Category_3_15    float64
Length: 77, dtype: object

In [20]:
dataset = pd.concat([result_of_one_hot_encoding, result_of_label_encoding, df['Purchase']], axis=1)

In [21]:
dataset.to_csv('./data/black_friday_dataset_encoded.csv', index=False)

In [22]:
# ---------------------------
# Implement Linear Regression

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [24]:
ml_dataset = pd.read_csv('./data/black_friday_dataset_encoded.csv')

In [25]:
ml_dataset.head()

Unnamed: 0,Gender_Female,Gender_Male,Occupation_1,Occupation_2,Occupation_3,Occupation_4,Occupation_5,Occupation_6,Occupation_7,Occupation_8,...,Product_Category_3_10,Product_Category_3_11,Product_Category_3_12,Product_Category_3_13,Product_Category_3_14,Product_Category_3_15,Age_encoded,City_Category_encoded,Stay_In_Current_City_Years_encoded,Purchase
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0,0,2,8370
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0,0,2,15200
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,1422
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,2,1057
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6,2,4,7969


In [26]:
# ============================================
# LINEAR REGRESSION WITHOUT LOG TRANSFORMATION

# Separate features (X) and target variable (y)
X = ml_dataset.drop('Purchase', axis=1)
y = ml_dataset['Purchase']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Instantiate the Linear Regression model
model = LinearRegression()

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Root Mean Squared Error (RMSE): 2971.157090973691
R-squared (R2): 0.6388079560357095


In [27]:
# =========================================
# LINEAR REGRESSION WITH LOG TRANSFORMATION

# Apply log transformation to 'Purchase'
ml_dataset['Purchase_log'] = np.log1p(ml_dataset['Purchase'])

# Separate features (X) and target variable (y)
X = ml_dataset.drop(['Purchase', 'Purchase_log'], axis=1)
y = ml_dataset['Purchase_log']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Instantiate the Linear Regression model
model = LinearRegression()

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Root Mean Squared Error (RMSE): 0.3776450885289551
R-squared (R2): 0.7389139509070972


In [28]:
ml_dataset = pd.read_csv('./data/black_friday_dataset_encoded.csv')

In [29]:
# =============================================
# RANDOMFORESTREGRESSOR WITHOUT FEATURE SCALING

# Separate features (X) and target variable (y)
X = ml_dataset.drop('Purchase', axis=1)
y = ml_dataset['Purchase']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Instantiate the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# Now, you can use the trained model to make predictions on new data
# For example, if you have a new_data DataFrame with the same columns as X (excluding 'Purchase'):
# new_predictions = model.predict(new_data)

Root Mean Squared Error (RMSE): 3095.5742863149735
R-squared (R2): 0.7389139509070972


In [30]:
# ==========================================
# RANDOMFORESTREGRESSOR WITH FEATURE SCALING

# Apply log transformation to 'Purchase'
ml_dataset['Purchase_log'] = np.log1p(ml_dataset['Purchase'])

# Separate features (X) and target variable (y)
X = ml_dataset.drop(['Purchase', 'Purchase_log'], axis=1)
y = ml_dataset['Purchase_log']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Instantiate the RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# Now, you can use the trained model to make predictions on new data
# For example, if you have a new_data DataFrame with the same columns as X (excluding 'Purchase'):
# new_predictions = model.predict(new_data)

Root Mean Squared Error (RMSE): 0.3959826145659063
R-squared (R2): 0.7389139509070972
