# Problem : 
### Given some data about the customers of a mall, containing the following:
1. CustomerID, which is the customer's ID number.
2. Male / Female
3. Age, which is the age of the customer.
4. Annual Income, which is the customer's annual salary.
5. Class. The mall classifies customers into three categories according to their spending in the mall, in order to be able to adequately market each category. Tier 1 is the least spender, Tier 2 is the average spender, and Tier 3 spends the most.
### We need to create a model that predicts each customer's category.
# What we will cover :
## 1. Exploratory Data analysis
## 2. Data cleaning and preprocessing
## 3. Model building, evaluation and predicting the target of the test Data.


In [None]:
# importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, truncnorm, randint
from sklearn.model_selection import RandomizedSearchCV

# Ignore warning messages: with no category or module filter given, every
# warning raised from this point on is hidden, library deprecation warnings included.
import warnings
warnings.filterwarnings('ignore')

import os

# Print the full path of every file found anywhere under the input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


In [None]:
# Read the training and test CSV files into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fcdscompetition/data.csv",
        "/kaggle/input/fcdscompetition/test_data (1).csv",
    )
)

# EDA and Data cleaning

In [None]:
# Preview the first five rows of the training data
df_train.head(n=5)

In [None]:
# Summary of the training frame: columns, dtypes and non-null counts
df_train.info()

Great! We have no nulls. We need to drop the (Unnamed: 0) column and rename two columns (Genre, Annual Income (k$)).

In [None]:
# Both frames get the same clean-up, so handle them together.
both_frames = (df_train, df_test)

# Remove the "Unnamed: 0" column.
for frame in both_frames:
    frame.drop("Unnamed: 0", axis=1, inplace=True)

# Give the gender and income columns clearer names.
new_names = {'Genre': 'Gender', 'Annual Income (k$)': 'Annual_Income_k$'}
for frame in both_frames:
    frame.rename(columns=new_names, inplace=True)

In [None]:
# Target class distribution: number of customers in each class.
# The column is passed as `x=` because seaborn 0.12+ accepts only `data` as a
# positional argument, so a positional column is no longer read as `x`.
sns.countplot(x=df_train["Class"], palette="Set2");

Most customers in the data are from the 2nd class; the 1st and 3rd classes are roughly equal.

In [None]:
# Gender distribution: number of customers of each gender.
# The column is passed as `x=` because seaborn 0.12+ accepts only `data` as a
# positional argument, so a positional column is no longer read as `x`.
sns.countplot(x=df_train["Gender"], palette="husl");

Females are about 60% of the data, while males are about 40%.

In [None]:
# Gender distribution within each class.
# `x` must be given by keyword: on seaborn 0.12+ a positional "Gender" binds to
# `data`, which clashes with the explicit `data=` and raises a TypeError.
sns.countplot(x="Gender", hue="Class", data=df_train, palette="Paired");

In [None]:
# Histogram of customer ages
sns.histplot(df_train["Age"]);

Most customers are aged from 20 to 50.

In [None]:
# Age boxplot.
# The column is passed as `x=` because seaborn 0.12+ accepts only `data` as a
# positional argument, so a positional column is no longer read as `x`.
sns.boxplot(x=df_train["Age"])

In [None]:
# Histogram of ages for male customers only
male_ages = df_train.loc[df_train["Gender"] == "Male", "Age"]
sns.histplot(male_ages);

In [None]:
# Histogram of ages for female customers only
female_ages = df_train.loc[df_train["Gender"] == "Female", "Age"]
sns.histplot(female_ages);

More males are aged about 20->25, while more females are aged about 30->35.

In [None]:
# Histogram of annual income
sns.histplot(data=df_train["Annual_Income_k$"]);

Most customers make 50k->80k annually.

In [None]:
# Income boxplot.
# The column is passed as `x=` because seaborn 0.12+ accepts only `data` as a
# positional argument, so a positional column is no longer read as `x`.
sns.boxplot(x=df_train["Annual_Income_k$"]);

In [None]:
# Mean annual income per gender: print the values, then draw them as a bar chart.
mean_income_by_gender = df_train.groupby("Gender")["Annual_Income_k$"].mean()
print(mean_income_by_gender)
mean_income_by_gender.plot(kind="bar");

The average incomes of both genders are equal.

In [None]:
# Age and income relation (scatter with a fitted regression line).
# `x` and `y` are passed by keyword: seaborn 0.12+ accepts only `data` as a
# positional argument, so two positional vectors raise a TypeError.
sns.regplot(x=df_train["Age"], y=df_train["Annual_Income_k$"]);

There is no clear direct relation, but our data shows that middle-aged customers have a higher annual income.

In [None]:
# Class and income.
# `x` and `y` are passed by keyword: seaborn 0.12+ accepts only `data` as a
# positional argument, so two positional vectors raise a TypeError.
sns.scatterplot(x=df_train["Class"], y=df_train["Annual_Income_k$"])

Customers whose income is about 60k (average income) tend to be in the 2nd class (average spender).

# Data Preprocessing 

In this part I'll do the following:
1. encode categorical features (Gender)
2. drop customerID as it is unique for each one
3. normalize Income and age columns

In [None]:
# function to preprocess our data
def preprocess(data):
    """Encode, trim and scale a customer DataFrame in place.

    Steps, all applied directly to ``data``:

    1. ``Gender`` is encoded as ``"Male" -> 0`` and ``"Female" -> 1``.
    2. The ``CustomerID`` column is dropped.
    3. ``Annual_Income_k$`` and ``Age`` are each min-max scaled.

    Args:
        data: DataFrame with the ``Gender``, ``CustomerID``,
            ``Annual_Income_k$`` and ``Age`` columns. It is modified in place.

    Returns:
        None.

    Note:
        A new ``MinMaxScaler`` is fitted on every call, so each frame passed
        in is scaled using its own minimum and maximum.
    """
    # Encode gender. The result is assigned back to the frame instead of
    # calling ``replace(..., inplace=True)`` on ``data.Gender``: that form
    # updates an intermediate Series, which pandas warns about and which
    # leaves ``data`` unchanged when Copy-on-Write is enabled.
    data["Gender"] = data["Gender"].replace({"Male": 0, "Female": 1})

    # The customer id is dropped so it is not used as a feature.
    data.drop("CustomerID", axis=1, inplace=True)

    # Normalize annual income and age, each with its own scaler.
    for column in ("Annual_Income_k$", "Age"):
        scaler = MinMaxScaler()
        # The scaler expects a 2-D array, hence the reshape to one column;
        # ravel() flattens the result back to one value per row.
        scaled = scaler.fit_transform(data[column].to_numpy().reshape(-1, 1))
        data[column] = scaled.ravel()

In [None]:
# Preprocess the training frame (preprocess modifies it in place),
# then preview the first rows of the result.
preprocess(df_train)
df_train.head()

# Model building

In [None]:
# Separate the target column from the feature columns.
y = df_train["Class"]
X = df_train.drop(columns="Class")

# Candidate classifiers, keyed by a short label; each is created with its default settings.
models = {
    "KNN": KNeighborsClassifier(),
    "RFC": RandomForestClassifier(),
    "LR": LogisticRegression(),
    "GBC": GradientBoostingClassifier(),
}


# function for training and evaluating given models
def train_and_evaluate(models, X, y):
    """Cross-validate every model on (X, y) and print the per-fold scores.

    ``models`` maps a label to an estimator. Each estimator is scored with
    5-fold ``cross_val_score``; the scores are printed as a table with one
    column per model. Nothing is returned.
    """
    fold_scores = {
        label: cross_val_score(estimator, X, y, cv=5)
        for label, estimator in models.items()
    }
    print(pd.DataFrame(fold_scores))

In [None]:
# Cross-validate every candidate model on the training data and print the scores
train_and_evaluate(models, X, y);

Random forest classifier has the best results.

In [None]:
# Tuning hyperparameters of the random forest with a randomized search
from pprint import pprint

# Distributions the search samples from:
#   n_estimators      - integers from 4 up to (not including) 200
#   max_features      - fraction of features, truncated normal on [0.25, 0.35]
#   min_samples_split - fraction of samples, uniform on [0.01, 0.209]
model_params = {
    'n_estimators': randint(4, 200),
    'max_features': truncnorm(a=0, b=1, loc=0.25, scale=0.1),
    'min_samples_split': uniform(0.01, 0.199),
}

rfc = RandomForestClassifier()

# set up random search: 100 sampled candidates, each scored with 5-fold CV
clf = RandomizedSearchCV(rfc, model_params, n_iter=100, cv=5, random_state=1)

# train the random search to find the best model
model = clf.fit(X, y)

# Report the mean cross-validated score of the winning candidate. Scoring the
# refit model on (X, y) measures accuracy on the data it was trained on, which
# overstates how well it generalises, so that number is labelled as such.
print("Best cross-validated score:", model.best_score_)
print("Training-set score:", model.score(X, y))

# print winning set of hyperparameters
pprint(model.best_estimator_.get_params())

In [None]:
# Start the submission frame from the test customer ids
sub = pd.DataFrame({"CustomerID": df_test["CustomerID"]})

In [None]:
# Preprocess the test frame in place. preprocess drops CustomerID, which is
# why the ids were copied into `sub` before this call.
preprocess(df_test)

In [None]:
# Predict the class of every test customer with the fitted search object `model`
predictions = model.predict(df_test)

In [None]:
# Attach the predicted classes to the customer ids and display the submission frame
sub["Class"] = predictions
sub

In [None]:
# Write the submission to sub.csv without the index column
sub.to_csv("sub.csv", index=False)