# Import Libs

In [31]:
import random
import pandas as pd
import numpy as np


# 0. Preparation (Setting the Random State)

In [32]:
rs = min(289456, 274211)
np.random.seed(rs)

# Exercise 1 (Loading and Preparing the Data)

In [33]:
# 1.1 Load the CSV file into a pandas DataFrame
df_tot = pd.read_csv('cla4lsp_customers.csv', sep='\t')

# 1.2 Create a sub-DF workdf with 2/3 of the original dataframe's rows (randomly sampled)
workdf = df_tot.sample(frac=2 / 3, random_state=rs)
# 1.3 hard-code labels and feature
labels = ['NumDealsPurchases', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
          'AcceptedCmp5', 'Recency', 'Complain', 'Recency']
features = ['Education', 'Marital_Status', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'MntWines',
            'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases',
            'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']

# 1.4 Remove one feature column randomly from spending or purchasing habits
spending_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                    'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases',
                    'NumCatalogPurchases', 'NumStorePurchases']
column_to_remove = np.random.choice(spending_columns)
print("Column to be dropped: ", column_to_remove)
features.remove(column_to_remove)
workdf.drop([column_to_remove], axis=1, inplace=True)

Column to be dropped:  MntSweetProducts


In [34]:
workdf.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 17
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

Since we have missing values in the income column we drop the missing values

In [35]:
# 1.5 Clean the dataset from missing values in the feature columns
workdf.dropna(inplace=True)
workdf.isnull().sum()


ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64

# Exercise 2 (Encoding of Categorical Data)

In [36]:
categorical_columns = list(workdf.select_dtypes(include=['object']).columns)
categorical_columns

['Education', 'Marital_Status', 'Dt_Customer']

In [37]:
for col in categorical_columns:
    unique_values = workdf[col].unique()
    print(f"Number of Unique values in '{col}':\n{len(unique_values)}\n")


Number of Unique values in 'Education':
5

Number of Unique values in 'Marital_Status':
8

Number of Unique values in 'Dt_Customer':
609


1. Education:
Suggested Encoding Method: Ordinal Encoding or One-Hot Encoding
Reasoning:
If there is an inherent order in the education levels (e.g., "High School" < "Bachelor" < "Master"), you might consider using Ordinal Encoding.
If there is no specific order, and each education level is independent, One-Hot Encoding is often preferred.
2. Marital_Status:
Suggested Encoding Method: One-Hot Encoding
Reasoning:
Marital status typically doesn't have a natural order, making One-Hot Encoding a suitable choice.
3. Dt_Customer:
Suggested Encoding Method: Ordinal Encoding (if there's an order) or Feature Engineering
Since this column represents dates, encoding methods are a bit different compared to traditional categorical variables. Instead of directly encoding the date, we'll extract useful features from it. Common features to extract include the year, month, day, and potentially others like day of the week.


## Encoding for Dt_Customer:

In [38]:
workdf['Dt_Customer'] = pd.to_datetime(workdf['Dt_Customer'],format="%d-%m-%Y")
workdf['Dt_Customer_Year'] = workdf['Dt_Customer'].dt.year
workdf['Dt_Customer_Month'] = workdf['Dt_Customer'].dt.month
workdf['Dt_Customer_Day'] = workdf['Dt_Customer'].dt.day
workdf['Dt_Customer_DayOfWeek'] = workdf['Dt_Customer'].dt.dayofweek

# now that we're done we can drop Dt_Customer
workdf.drop("Dt_Customer",axis=1)
workdf.columns

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,...,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Dt_Customer_Year,Dt_Customer_Month,Dt_Customer_Day,Dt_Customer_DayOfWeek
335,10151,1960,Graduation,Divorced,62204.0,0,2,38,317,46,...,0,0,0,3,11,0,2012,9,12,2
798,3749,1973,Graduation,Together,73926.0,0,0,54,627,91,...,0,0,0,3,11,0,2013,3,2,5
1707,1045,1965,Graduation,Together,52117.0,0,1,55,112,10,...,0,0,0,3,11,0,2012,8,16,3
999,9097,1956,Graduation,Divorced,46086.0,0,1,34,244,8,...,0,0,0,3,11,0,2013,11,3,6
1947,2495,1974,Master,Married,83891.0,0,1,24,217,38,...,0,0,0,3,11,0,2014,6,20,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,3571,1961,PhD,Together,63342.0,0,1,48,918,21,...,0,0,0,3,11,1,2012,10,16,1
134,5290,1964,PhD,Married,41551.0,1,1,51,220,0,...,0,0,0,3,11,0,2013,8,14,2
513,6036,1959,Master,Together,89120.0,0,0,78,1168,92,...,0,0,0,3,11,0,2014,3,17,0
2107,340,1970,Graduation,Divorced,72967.0,0,1,1,158,35,...,0,0,0,3,11,1,2012,12,15,5


In [39]:
categorical_cols = ['Education', 'Marital_Status']

# Apply one-hot encoding
workdf = pd.get_dummies(workdf, columns=categorical_cols, drop_first=True)

In [40]:
workdf.head()

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,...,Education_Graduation,Education_Master,Education_PhD,Marital_Status_Alone,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Marital_Status_YOLO
335,10151,1960,62204.0,0,2,2012-09-12,38,317,46,247,...,True,False,False,False,True,False,False,False,False,False
798,3749,1973,73926.0,0,0,2013-03-02,54,627,91,597,...,True,False,False,False,False,False,False,True,False,False
1707,1045,1965,52117.0,0,1,2012-08-16,55,112,10,107,...,True,False,False,False,False,False,False,True,False,False
999,9097,1956,46086.0,0,1,2013-11-03,34,244,8,32,...,True,False,False,False,True,False,False,False,False,False
1947,2495,1974,83891.0,0,1,2014-06-20,24,217,38,350,...,False,True,False,False,False,True,False,False,False,False
