# Automobile customer segmentation classification
### Context
An automobile company has plans to enter new markets with their existing products (P1, P2, P3, P4, and P5). After intensive market research, they’ve deduced that the behavior of the new market is similar to their existing market.

In their existing market, the sales team has classified all customers into 4 segments (A, B, C, D). Then, they performed segmented outreach and communication tailored to each customer segment. This strategy has worked exceptionally well for them. They plan to use the same strategy for the new markets and have identified 2627 new potential customers.

You are required to help the manager to predict the right group of the new customers.

### import data library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)

### data exploration

In [None]:
# Kaggle input directory that holds Train.csv and Test.csv.
# No trailing separator here: the f-strings below add the '/' themselves, so
# the previous value produced paths of the form '...//Train.csv'.
data_location = r'/kaggle/input/customer-segmentation'
df_raw = pd.read_csv(f'{data_location}/Train.csv')
test_raw = pd.read_csv(f'{data_location}/Test.csv')
df_raw.shape

In [None]:
# Preview the first rows of the raw training data.
df_raw.head()

In [None]:
# create functions for data exploration
def plot_numeric(df, col):
    """Draw a histogram, a KDE curve and a box plot of ``df[col]`` in one row.

    Each panel is titled with the column name plus a suffix
    (``_hist``, ``_kde``, ``_box``) and the figure is shown immediately.
    """
    _, axes = plt.subplots(1, 3, figsize=(18, 6))
    # (plot function, keyword that carries the column, title suffix)
    panels = (
        (sns.histplot, {'x': col}, '_hist'),
        (sns.kdeplot, {'x': col}, '_kde'),
        (sns.boxplot, {'y': col}, '_box'),
    )
    for axis, (draw, column_kw, suffix) in zip(axes, panels):
        draw(data=df, ax=axis, **column_kw).set_title(col + suffix)
    plt.show()
    
def plot_category(df, col):
    """Show a count plot of ``df[col]`` with bars ordered by frequency."""
    _, axis = plt.subplots(figsize=(18, 12))
    # value_counts() sorts descending, so its index gives most-common-first order.
    by_frequency = df[col].value_counts().index
    chart = sns.countplot(data=df, x=col, ax=axis, order=by_frequency)
    chart.set_title(col + '_count')
    plt.show()
    
def plot_corr(df):
    """Show an annotated heatmap of the correlations between numeric columns."""
    _, axis = plt.subplots(figsize=(18, 12))
    numeric_columns = df.select_dtypes(include='number').columns.values
    corr_matrix = df[numeric_columns].corr()
    # Fix the colour scale to the full [-1, 1] correlation range.
    sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, square=True, ax=axis)
    plt.title('Correlation Matrix')
    plt.show()

def get_corr_list(df):
    """Print the correlation of every numeric column pair, strongest first.

    Each unordered pair of distinct columns is listed exactly once;
    self-correlations and undefined (NaN) correlations are omitted.
    Non-numeric columns are ignored. Nothing is returned.
    """
    corr = df.select_dtypes(include='number').corr()
    # Keep only the strict upper triangle of the symmetric matrix. The earlier
    # drop_duplicates() approach de-duplicated by *value*, which also removed
    # unrelated pairs that happened to share the same correlation.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    pairs = corr.where(upper).unstack().dropna().sort_values(ascending=False)
    print(pairs)

def get_null(df):
    """Print the number of missing values in each column of ``df``."""
    missing_per_column = df.isna().sum()
    print(missing_per_column)

def get_describe(df):
    """Print summary statistics for every column, numeric or not."""
    summary = df.describe(include='all')
    print(summary)

In [None]:
# Run every exploration step on both splits, training first, so the two
# distributions can be compared directly.
divider = '-' * 100
splits = (('Training dataset', df_raw), ('Testing dataset', test_raw))

# Missing values per column.
for title, frame in splits:
    print(title)
    get_null(frame)
print(divider, end='\n\n')

# Summary statistics.
for title, frame in splits:
    print(title)
    get_describe(frame)

# Distribution plots for each numeric column (columns taken from the training frame).
print(divider, end='\n\n')
for col in df_raw.select_dtypes(include='number'):
    print(col)
    for title, frame in splits:
        print(title)
        plot_numeric(frame, col)

# Count plots for each text column (columns taken from the training frame).
print(divider, end='\n\n')
for col in df_raw.select_dtypes(include='object'):
    print(col)
    for title, frame in splits:
        print(title)
        plot_category(frame, col)

# Correlation heatmaps.
print(divider, end='\n\n')
for title, frame in splits:
    print(title)
    plot_corr(frame)

### Summary of data exploration
The exploration above shows a large difference in behavior between the training and testing samples. The sampling does not appear to have been done properly, so I believe the training set may not represent the population.
1. Gender: males outnumber females by 10%~20% in the training set, while females outnumber males by about 10% in the testing set.
2. Ever_Married: similar to Gender; married customers make up over 60% of the training set, while single customers dominate the testing set.
3. Graduated: similar to Gender and Ever_Married; graduates dominate the training set, while non-graduates dominate the testing set.

#### Points to take note for data cleansing
1. ID should be dropped as it is meaningless
2. Outlier on Age, Work_Experience and Family_Size
3. Data Cleansing on Gender, Ever_Married and Graduate should be careful. 

#### Points to take note for data modeling
1. Gender, Ever_married and Graduated could/might be dropped if the validation accuracy is higher but not the testing accuracy.
2. Age could use bin (Further checking is required)

### data cleansing
1. fill empty Ever_Married with No
2. fill empty Graduated with No
3. fill empty Profession with Artist (The most common one)
4. fill empty Var_1 with Cat_6 (The most common one)
5. fill empty Work_Experience with the median, then cap outliers at the 1.5 × interquartile-range (IQR) fences
6. fill empty Family_Size with the median, then cap outliers at the 1.5 × interquartile-range (IQR) fences
7. Drop ID
8. Map segment into numeric

In [None]:
def data_transform(df):
    """Return a cleaned copy of a raw customer frame.

    The same routine is used for the training and the testing data. Steps:

    1. Missing ``Ever_Married`` / ``Graduated`` become ``'No'``.
    2. Missing ``Profession`` / ``Var_1`` become the column's most common value.
    3. Missing ``Work_Experience`` / ``Family_Size`` become the column median,
       then values are capped at the 1.5 * IQR fences.
    4. ``ID`` is dropped when present.
    5. ``Segmentation`` (when present) is mapped A..D -> 1..4.
    6. Surrounding whitespace is stripped from the remaining non-numeric columns.

    All fill values and fences are computed from ``df`` itself. The input
    frame is not modified; a new frame is returned.
    """
    def remove_outlier(df, col):
        """Cap ``df[col]`` at the 1.5 * IQR fences and return the new Series."""
        lowq, highq = df[col].quantile(.25), df[col].quantile(.75)
        margin = 1.5 * (highq - lowq)
        # Vectorised equivalent of capping each element individually.
        return df[col].clip(lower=lowq - margin, upper=highq + margin)

    # Work on a copy so the caller's frame is never altered.
    df = df.copy()

    df['Ever_Married'] = df['Ever_Married'].fillna('No')
    df['Graduated'] = df['Graduated'].fillna('No')
    for col in ['Profession', 'Var_1']:
        modes = df[col].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            df[col] = df[col].fillna(modes.iloc[0])
    for col in ['Work_Experience', 'Family_Size']:
        df[col] = df[col].fillna(df[col].median())
        df[col] = remove_outlier(df, col)

    # Tolerate frames that carry no identifier or no label column.
    df = df.drop(columns=['ID'], errors='ignore')
    if 'Segmentation' in df.columns:
        segment_map = {'A':1, 'B':2, 'C':3, 'D':4}
        df['Segmentation'] = df['Segmentation'].map(segment_map)

    for col in df.select_dtypes(exclude='number'):
        # na_action='ignore' keeps missing values missing instead of turning
        # them into the literal string 'nan'.
        df[col] = df[col].map(lambda x: str(x).strip(), na_action='ignore')
    return df

In [None]:
# Clean a copy of the raw training data so df_raw itself stays as loaded.
cleanset = data_transform(df_raw.copy())
cleanset.shape

In [None]:
# Clean a copy of the raw test data with the same routine.
# NOTE(review): data_transform computes its fill values and outlier fences from
# the frame it receives, so here they come from the test data, not the training data.
test_cleanset = data_transform(test_raw.copy())
test_cleanset.shape

In [None]:
# Repeat the exploration on the cleaned training data to review the
# distributions after imputation and outlier capping.
divider = '-' * 100

get_null(cleanset)
print(divider, end='\n\n')

get_describe(cleanset)

print(divider, end='\n\n')
numeric_columns = cleanset.select_dtypes(include='number')
for col in numeric_columns:
    print(col)
    plot_numeric(cleanset, col)

print(divider, end='\n\n')
text_columns = cleanset.select_dtypes(include='object')
for col in text_columns:
    print(col)
    plot_category(cleanset, col)

print(divider, end='\n\n')
plot_corr(cleanset)

### data modeling

In [None]:
# RandomForest, Logistic Regression, XGBoost, CatBoost will be validated to figure out the best algorithm for this part.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import classification_report, plot_confusion_matrix

In [None]:
def data_model_cleansing(df):
    """Split a cleaned frame into one-hot encoded features and the target.

    Parameters
    ----------
    df : DataFrame produced by ``data_transform``. It is not modified.

    Returns
    -------
    (features, y)
        ``features`` holds one dummy column per category of every non-numeric
        column, followed by the numeric columns in their original order.
        ``y`` is the ``Segmentation`` column, or ``None`` when ``df`` has no
        such column (unlabeled data).
    """
    # Copy first: previously 'Segmentation' was dropped from the caller's frame.
    features = df.copy()
    y = features.pop('Segmentation') if 'Segmentation' in features.columns else None

    # Binning Age into decade-wide groups was checked separately and lowered
    # model accuracy, so Age is kept as a plain numeric column.

    cat_cols = features.select_dtypes(exclude='number').columns.tolist()
    # Dummy blocks go in front of the numeric columns, last categorical column
    # first: the same column order the earlier prepend-in-a-loop produced.
    dummy_blocks = [pd.get_dummies(features[col], prefix=col)
                    for col in reversed(cat_cols)]
    features = pd.concat(dummy_blocks + [features.drop(columns=cat_cols)], axis=1)
    return features, y

In [None]:
# One-hot encode a copy of the cleaned training frame; y is its Segmentation column.
trainset, y = data_model_cleansing(cleanset.copy())
trainset.info()

In [None]:
X_test, y_test = data_model_cleansing(test_cleanset.copy())
# Dummy columns are derived from each frame separately, so a category seen in
# only one split would leave the two feature matrices with different columns.
# Align the test features to the training layout; categories missing from the
# test data become all-zero columns and test-only categories are discarded.
X_test = X_test.reindex(columns=trainset.columns, fill_value=0)
X_test.info()

In [None]:
# Candidate models, each with a pinned seed so reruns are comparable.
# NOTE(review): recent xgboost releases expect class labels to start at 0,
# while data_transform maps the segments to 1..4 — confirm against the
# installed xgboost version.
tree = RandomForestClassifier(random_state=0)
logit = LogisticRegression(random_state=0)
xgb = XGBClassifier(random_state=0)
cat = CatBoostClassifier(random_state=0, verbose=0)

algos = [tree, logit, xgb, cat]

def ML(algo, x, y, res_dict=None):
    """Fit ``algo`` on a stratified 80/20 split and report validation metrics.

    Parameters
    ----------
    algo : estimator with scikit-learn style ``fit`` / ``predict`` / ``score``.
    x, y : feature matrix and target; ``y`` is also used to stratify the split.
    res_dict : dict, optional
        Dictionary that receives the metrics under ``<ClassName>_<metric>``
        keys. A fresh dict is created when omitted; pass the same dict on
        every call to accumulate results across several models.

    Returns
    -------
    (model, res_dict)
        The fitted estimator and the updated metrics dictionary.
    """
    # A dict() default would be created once and shared by every call.
    if res_dict is None:
        res_dict = {}

    X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=0, stratify=y)
    print(f'Training set size: {X_train.shape}, Validation set size: {X_valid.shape}')

    model = algo.fit(X_train, y_train)
    name = model.__class__.__name__
    y_pred = model.predict(X_valid)
    train_acc = model.score(X_train, y_train)
    valid_acc = model.score(X_valid, y_valid)
    # NOTE(review): with average='micro' on single-label multiclass targets,
    # precision and recall both equal accuracy.
    valid_prec = precision_score(y_true=y_valid, y_pred=y_pred, average='micro')
    valid_rec = recall_score(y_true=y_valid, y_pred=y_pred, average='micro')
    # NOTE(review): this 3-fold CV runs on the validation rows only;
    # cross_val_score refits clones of the estimator, leaving `model` untouched.
    cross_val = cross_val_score(model, X_valid, y_valid, cv=3).mean()

    print(f'Algorithm name: {name}')
    print('-'*100)
    print(f'Training accuracy: {train_acc:.4f}, Validation accuracy: {valid_acc:.4f}')
    print(f'Validation precision score: {valid_prec:.4f}, Validation recall score: {valid_rec:.4f}')
    print(f'Cross validation score: {cross_val:.4f}')
    print('-'*100)
    print(classification_report(y_true=y_valid, y_pred=y_pred))
    print('-'*100)

    # Row-normalised confusion matrix on the validation split.
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
    # (ConfusionMatrixDisplay.from_estimator replaces it) — check the pinned version.
    plot_confusion_matrix(model, X_valid, y_valid, display_labels=model.classes_,
                         normalize='true', cmap=plt.cm.Blues, )
    plt.title('Normalized Confusion Matrix')
    plt.show()
    print('-'*100)

    res_dict[name+'_train_acc'] = train_acc
    res_dict[name+'_valid_acc'] = valid_acc
    res_dict[name+'_valid_prec'] = valid_prec
    res_dict[name+'_valid_rec'] = valid_rec
    res_dict[name+'_cross_val'] = cross_val
    return model, res_dict

def test_ML(model, X_test, y_test, res_dict):
    """Score an already fitted ``model`` on the test data and record the metrics.

    Prints accuracy, micro-averaged precision/recall, a 3-fold
    cross-validation score computed on the test data, the classification
    report and a row-normalised confusion matrix. The metrics are stored in
    ``res_dict`` under ``<ClassName>_test_*`` keys and the dict is returned.
    """
    name = model.__class__.__name__
    rule = '-'*100

    predictions = model.predict(X_test)
    test_acc = model.score(X_test, y_test)
    test_prec = precision_score(y_true=y_test, y_pred=predictions, average='micro')
    test_rec = recall_score(y_true=y_test, y_pred=predictions, average='micro')
    # NOTE(review): cross_val_score refits clones of the estimator on folds of
    # the test data; the fitted `model` passed in is left unchanged.
    cross_val = cross_val_score(model, X_test, y_test, cv=3).mean()

    print(f'Testing Algorithm name: {name}')
    print(rule)
    print(f'Testing accuracy: {test_acc:.4f}, Testing precision: {test_prec:.4f}, Testing recall: {test_rec:.4f}')
    print(f'Cross validation score: {cross_val:.4f}')
    print(rule)
    print(classification_report(y_true=y_test, y_pred=predictions))
    print(rule)

    # Confusion matrix with each row scaled to sum to 1.
    plot_confusion_matrix(model, X_test, y_test, display_labels=model.classes_,
                         normalize='true', cmap=plt.cm.Blues, )
    plt.title('Normalized Confusion Matrix')
    plt.show()
    print(rule)

    res_dict.update({
        name+'_test_acc': test_acc,
        name+'_test_prec': test_prec,
        name+'_test_rec': test_rec,
        name+'_test_cross_val': cross_val,
    })
    return res_dict

In [None]:
# Train and test every candidate model, collecting all metrics in one dict.
res = dict()
for algo in algos:
    # Pass `res` explicitly so accumulation across models does not depend on
    # ML's default argument.
    model, res = ML(algo, trainset, y, res)
    res = test_ML(model, X_test, y_test, res)

In [None]:
#show the top 10 important features on the catboost model.
# Reference the CatBoost estimator directly instead of whatever `model` the
# training loop left bound last; nlargest already returns the values sorted,
# so no separate descending sort is needed before it.
feature_list = pd.Series(cat.feature_importances_, index=trainset.columns.values).nlargest(10)
# Ascending order puts the most important feature at the top of the horizontal bar chart.
feature_list.sort_values().plot(kind='barh')
plt.title('Feature list')
plt.show()

In [None]:
# List only the collected metrics whose key starts with "Cat" (the CatBoost entries).
[(key, value) for key, value in res.items() if key.startswith("Cat")]
#res

### Summary of data modeling
All models perform poorly overall. Listing the feature importances of CatBoost (or of any other model) shows that (1) Work_Experience, (2) Profession, (3) Graduated and (4) Ever_Married contribute a significant part of the model. However, a large proportion of the values in these columns was filled in by ourselves.
Referring to the accuracy list above, CatBoost seems to be the best approach. However, I would suggest sampling the data again more carefully to allow better modeling.

### (Extra) Quick step by using pipeline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Start again from the raw training data: the pipeline below does its own cleaning.
# NOTE: this cell rebinds trainset, X_test and y_test, which earlier cells also use.
trainset = df_raw.copy()
# Feature columns = every column except the target.
x_col = trainset.columns.values.tolist()
x_col.remove('Segmentation')
# Encode the segment letters as integer class labels.
le = LabelEncoder()
trainset['Segmentation'] = le.fit_transform(trainset['Segmentation'])
# Hold out 20% of the training file as an internal test split (not stratified).
X_train, X_test, y_train, y_test = train_test_split(trainset[x_col],
                                                   trainset['Segmentation'],
                                                   test_size=.2,
                                                   random_state=0)
print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')

In [None]:
class DataframeFunctionTransformer():
    """Pipeline step that applies a plain function to its input.

    ``func`` receives whatever ``transform`` is given and its return value is
    passed on. ``fit`` learns nothing. ``get_params`` / ``set_params`` follow
    the scikit-learn estimator protocol so that a pipeline containing this
    step can be cloned (as cross-validation and parameter searches do).
    """

    def __init__(self, func):
        self.func = func

    def transform(self, input_df, **transform_params):
        """Return ``func(input_df)``; extra keyword arguments are ignored."""
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        """Stateless: nothing is learned, the transformer itself is returned."""
        return self

    def get_params(self, deep=True):
        """Return the constructor arguments as a dict."""
        return {'func': self.func}

    def set_params(self, **params):
        """Update constructor arguments and return ``self``.

        Raises
        ------
        ValueError
            If a parameter other than ``func`` is given.
        """
        for key, value in params.items():
            if key != 'func':
                raise ValueError(
                    f'Invalid parameter {key!r} for {type(self).__name__}; '
                    "the only valid parameter is 'func'."
                )
            self.func = value
        return self

def remove_id(input_df):
    """Return ``input_df`` without its ``ID`` column.

    The input frame is left unchanged (a new frame is returned), so frames
    passed to a pipeline's ``fit`` / ``score`` keep their ``ID`` column.
    Frames that have no ``ID`` column pass through with all columns intact.
    """
    return input_df.drop(columns='ID', errors='ignore')

In [None]:
# Build the preprocessing + CatBoost pipeline. It mirrors the manual cleaning
# steps listed under "data cleansing", but encodes categories ordinally
# instead of one-hot and scales the numeric columns to [0, 1].

# Yes/No columns: missing values become 'No', then ordinal-encode.
no_pipeline = Pipeline([
    ('fillno', SimpleImputer(strategy='constant', fill_value='No')),
    ('encode', OrdinalEncoder()),
])

# Categorical columns: missing values become the most frequent value, then ordinal-encode.
common_pipeline = Pipeline([
    ('fillcommon', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder()),
])

# Numeric columns: missing values become the median, then min-max scale.
# (The first step is named 'encode' although it is the imputer.)
num_pipeline = Pipeline([
    ('encode', SimpleImputer(strategy='median')),
    ('minmax', MinMaxScaler()),
])

# Drop the ID column before the column-wise preprocessing.
rmid_pipeline = Pipeline([
    ('remove_id', DataframeFunctionTransformer(remove_id)),
])

# Route each group of columns to its pipeline. Columns not listed here are not
# forwarded — assuming ColumnTransformer's default remainder='drop'; confirm.
preprocessing_pipeline = ColumnTransformer([
    ('step1,2', no_pipeline, ['Ever_Married', 'Graduated']),
    ('step3,4', common_pipeline, ['Profession', 'Var_1']),
    ('step5,6', num_pipeline, ['Work_Experience', 'Family_Size']),
    # Age has no missing values, so only the min-max scaling has an effect.
    ('extra1', num_pipeline, ['Age']),
    # Gender and Spending_Score have no missing values, so only the ordinal encoding has an effect.
    ('extra2', common_pipeline, ['Gender', 'Spending_Score']),
])

# Full model: drop ID -> preprocess columns -> CatBoost classifier.
cat_pipeline = Pipeline([
    ('remove_id', rmid_pipeline),
    ('preprocessing', preprocessing_pipeline),
    ('catboost', CatBoostClassifier(random_state=0, verbose=0))
])

In [None]:
# Fit the preprocessing steps and the CatBoost model on the training split.
cat_pipeline.fit(X_train, y_train)

In [None]:
# Score on the 20% split held out from the training file.
cat_pipeline.score(X_test, y_test)

In [None]:
# Score the fitted pipeline on the separate test file.
testset = test_raw.copy()
# Reuse the encoder fitted on the training labels. Fitting a new encoder on
# the test labels would give a different letter-to-integer mapping whenever
# the test data does not contain exactly the same set of segments.
testset['Segmentation'] = le.transform(testset['Segmentation'])
X_test0, y_test0 = testset[x_col], testset['Segmentation']
cat_pipeline.score(X_test0, y_test0)