# Import Libs

In [4]:
import random
import pandas as pd
import numpy as np


# 0. Preparation (Setting the Random State)

In [5]:
rs = min(289456, 274211)
np.random.seed(rs)

# Exercise 1 (Loading and Preparing the Data)

In [66]:
# 1.1 Load the CSV file into a pandas DataFrame
df_tot = pd.read_csv('cla4lsp_customers.csv', sep='\t')

# 1.2 Create a sub-DF workdf with 2/3 of the original dataframe's rows (randomly sampled)
workdf = df_tot.sample(frac=2 / 3, random_state=rs)
# 1.3 hard-code labels and feature
labels = ['NumDealsPurchases', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4',
          'AcceptedCmp5', 'Recency', 'Complain', 'Recency']
features = ['Education', 'Marital_Status', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'MntWines',
            'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases',
            'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']

# 1.4 Remove one feature column randomly from spending or purchasing habits
spending_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                    'MntSweetProducts', 'MntGoldProds', 'NumWebPurchases',
                    'NumCatalogPurchases', 'NumStorePurchases']
column_to_remove = np.random.choice(spending_columns)
print("Column to be dropped: ", column_to_remove)
features.remove(column_to_remove)
workdf.drop([column_to_remove],axis=1,inplace=True)

Column to be dropped:  MntFruits


In [67]:
workdf.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 17
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

Since we have missing values in the income column we drop the missing values

In [68]:
# 1.5 Clean the dataset from missing values in the feature columns
workdf.dropna(inplace=True)
workdf.isnull().sum()


ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64

# Exercise 2 (Encoding of Categorical Data)

In [64]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1476 entries, 335 to 1224
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Education          1476 non-null   object 
 1   Marital_Status     1476 non-null   object 
 2   Year_Birth         1476 non-null   int64  
 3   Income             1476 non-null   float64
 4   Kidhome            1476 non-null   int64  
 5   Teenhome           1476 non-null   int64  
 6   Dt_Customer        1476 non-null   object 
 7   MntWines           1476 non-null   int64  
 8   MntFruits          1476 non-null   int64  
 9   MntMeatProducts    1476 non-null   int64  
 10  MntFishProducts    1476 non-null   int64  
 11  MntSweetProducts   1476 non-null   int64  
 12  MntGoldProds       1476 non-null   int64  
 13  NumWebPurchases    1476 non-null   int64  
 14  NumStorePurchases  1476 non-null   int64  
 15  NumWebVisitsMonth  1476 non-null   int64  
dtypes: float64(1), int64(12), o

In [65]:
workdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1493 entries, 335 to 1224
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1493 non-null   int64  
 1   Year_Birth           1493 non-null   int64  
 2   Education            1493 non-null   object 
 3   Marital_Status       1493 non-null   object 
 4   Income               1476 non-null   float64
 5   Kidhome              1493 non-null   int64  
 6   Teenhome             1493 non-null   int64  
 7   Dt_Customer          1493 non-null   object 
 8   Recency              1493 non-null   int64  
 9   MntWines             1493 non-null   int64  
 10  MntFruits            1493 non-null   int64  
 11  MntMeatProducts      1493 non-null   int64  
 12  MntFishProducts      1493 non-null   int64  
 13  MntSweetProducts     1493 non-null   int64  
 14  MntGoldProds         1493 non-null   int64  
 15  NumDealsPurchases    1493 non-null   int6