In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# in the order os.walk yields them, and print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the training data

In [None]:
# Read the training CSV from the attached Kaggle dataset and preview the first rows.
train_csv_path = "/kaggle/input/training-data/train_data.csv"
training_data = pd.read_csv(filepath_or_buffer=train_csv_path)
training_data.head()

# Removing unnecessary columns

In [None]:
# Remove the "Unnamed: 0" and "CustomerID" columns, then preview what is left.
training_data = training_data.drop(columns=["Unnamed: 0", "CustomerID"])
training_data.head()

# Checking for missing values

In [None]:
# Number of missing entries in each column.
training_data.isnull().sum()

# No missing values found

In [None]:
# Dimensions of the training frame as (rows, columns).
training_data.shape

In [None]:
# Column names, non-null counts and dtypes of the training frame.
training_data.info()

In [None]:
# Summary statistics of the training frame's columns.
training_data.describe()

In [None]:
# Number of rows in each value of the "Class" label.
training_data.groupby(by="Class").size()

# 1. Exploratory Data Analysis

## 1.1. Gender

In [None]:
import seaborn as sns

# Bar chart of row counts per class, with one bar per "Genre" value inside each class.
sns.countplot(data=training_data, x="Class", hue="Genre")

**This shows that females are more dominant in all three classes indicating that they go shopping more than males do**

## 1.2. Annual Income

Seeing that the customers are divided into **three classes**,<br>
the data will be divided into three equal sections based on annual income to see if a certain class is more prevalent in a certain range of annual income.<br>
To divide the data into 3 roughly equal sections based on income, the 33rd and 66th percentiles are calculated and used as approximate tercile boundaries. <br>


In [None]:
# Print the 33rd and 66th percentiles of annual income, one per line.
annual_income = training_data["Annual Income (k$)"]
for fraction in (.33, .66):
    print(annual_income.quantile(fraction))

This indicates that:
1. 1/3 of the customers have annual income less than or equal to 46k
2. 1/3 of the customers have annual income between 46k and 71k
3. 1/3 of the customers have annual income greater than 71k

In [None]:
# Lowest income band: annual income of at most 46 (k$).
low_income_mask = training_data["Annual Income (k$)"].le(46)
third1 = training_data.loc[low_income_mask]
sns.countplot(data=third1, x="Class")

In [None]:
# Middle income band: strictly above 46 and up to (and including) 71 (k$).
income = training_data["Annual Income (k$)"]
mid_income_mask = income.gt(46) & income.le(71)
third2 = training_data.loc[mid_income_mask]
sns.countplot(data=third2, x="Class")

In [None]:
# Highest income band: annual income strictly above 71 (k$).
high_income_mask = training_data["Annual Income (k$)"].gt(71)
third3 = training_data.loc[high_income_mask]
sns.countplot(data=third3, x="Class")

**Based on the above three graphs we can see that:**
* There is no prevalent class among customers whose annual incomes are at most 46k
* Class 2 is prevalent among middle-income customers whose annual incomes are between 46k and 71k
* Class 2 is barely present among higher-income customers whose annual incomes are greater than 71k

## **1.3. Age**

### **We can take the same approach with Age as with Annual Income**

In [None]:
# Print the 33rd and 66th percentiles of age, one per line.
ages = training_data["Age"]
for fraction in (.33, .66):
    print(ages.quantile(fraction))

In [None]:
# Youngest band: customers aged 31 or less.
young_mask = training_data["Age"].le(31)
third1 = training_data.loc[young_mask]
sns.countplot(data=third1, x="Class")

In [None]:
# Middle age band: strictly older than 31, up to and including 45.
# The upper bound is inclusive on purpose. The oldest band is selected with
# Age > 45, so a strict "< 45" here would leave customers aged exactly 45 out
# of all three plots. This also matches how the income bands are cut
# (exclusive lower bound, inclusive upper bound).
age = training_data["Age"]
third2 = training_data.loc[(age > 31) & (age <= 45)]
sns.countplot(x="Class", data=third2)

In [None]:
# Oldest band: customers strictly older than 45.
older_mask = training_data["Age"].gt(45)
third3 = training_data.loc[older_mask]
sns.countplot(data=third3, x="Class")

**Based on the above three graphs we can see that:**
* Class 1 has a weaker presence among younger customers, indicating that they tend to spend more than others.
* All classes are present among middle-aged customers
* Class 3 is not present among older customers, indicating that they never spend a lot of money on shopping

## 1.4. Age and Annual Income

In [None]:
# Pairwise correlation matrix of the two numeric features.
numeric_features = ['Annual Income (k$)', 'Age']
cor_mat = training_data[numeric_features].corr()

# Diverging colour map running between hues 0 and 200.
cmap = sns.diverging_palette(h_neg=0, h_pos=200, as_cmap=True)

# Annotated, square-celled heatmap centred on zero.
heatmap_options = dict(
    vmax=.3,
    center=0,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={'shrink': .5},
)
sns.heatmap(cor_mat, cmap=cmap, **heatmap_options)

We can see from the correlation matrix that there is no correlation between age and annual income **(Notice that Gender is not present because it is a categorical feature)**

# 2. Preparing the data for making the model

**Replacing Male/Female in the gender column with 0/1 because scikit-learn's random forest / decision tree implementations require numerical features and do not accept string categories**

In [None]:
# Encode the gender labels as integers (Male -> 0, Female -> 1).
# The encoded column is assigned back to the frame instead of calling
# replace(..., inplace=True) on training_data["Genre"]. That form mutates an
# intermediate Series, which pandas deprecates and which no longer updates the
# DataFrame once copy-on-write is in effect.
# Values that are not in the mapping are passed through unchanged, so
# re-running this cell on already-encoded data is a no-op.
gender_codes = {"Male": 0, "Female": 1}
training_data["Genre"] = training_data["Genre"].map(lambda label: gender_codes.get(label, label))

In [None]:
# Preview the first rows after the gender encoding step.
training_data.head()

# **Separating the features (X) from the class labels (Y)**

In [None]:
# Features are every column except the target; labels are the "Class" column.
X = training_data.drop(columns=["Class"])
Y = training_data.loc[:, "Class"]

In [None]:
# Display the feature matrix (training_data without the "Class" column).
X

In [None]:
# Show the label Series as a one-column DataFrame.
Y.to_frame()

# **Splitting the data (80/20 ratio)**

A ratio of 80/20 is used for data splitting such that 80% goes to the training subset and 20% to the testing subset.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and seeded, so the same partition is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=7,
)

**Examine the data dimension**

In [None]:
# Shapes of the training features and training labels produced by the split.
X_train.shape, Y_train.shape

In [None]:
# Shapes of the held-out features and held-out labels produced by the split.
X_test.shape, Y_test.shape

# **3. Building a machine learning model using Random Forest Classifier**

In [None]:
from sklearn import ensemble
from sklearn.metrics import accuracy_score

# Baseline random forest with default hyperparameters.
# random_state is pinned (same seed as the train/test split) because a random
# forest draws bootstrap samples and feature subsets at random. Without a seed
# the baseline accuracy changes on every run and cannot be compared
# meaningfully with the tuned model.
model = ensemble.RandomForestClassifier(random_state=7)


In [None]:
# Fit the baseline forest on the training portion of the split.
model.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out features and show how many predictions were made.
Y_pred = model.predict(X_test)
Y_pred.shape

In [None]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

# **Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search space: max_features in {1, 2, 3} and n_estimators in 1..49.
# NOTE(review): the max_features range presumably matches the number of
# feature columns in X — confirm if the feature set changes.
max_features_range = np.arange(1,4,1)
n_estimators_range = np.arange(1,50)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

# Seed the estimator so each grid point is scored on the same randomness.
# Otherwise best_params_ and the scores differ from run to run, and
# differences between neighbouring grid points are mostly noise.
model = ensemble.RandomForestClassifier(random_state=7)

# Exhaustive search over param_grid using 3-fold cross-validation.
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)

In [None]:
# Run the cross-validated grid search on the training portion of the split.
grid.fit(X_train, Y_train)

In [None]:
# Report the winning hyperparameter combination and its cross-validated score.
best_params, best_score = grid.best_params_, grid.best_score_
summary = "The best parameters are %s with a score of %0.2f" % (best_params, best_score)
print(summary)

In [None]:
# Score the best estimator found by the grid search on the held-out split.
# NOTE(review): the original comment here claimed an accuracy increase over
# the baseline; confirm against the two displayed scores.
Y_pred = grid.best_estimator_.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_pred)

**Dataframe of Grid search parameters and their Accuracy scores**

In [None]:
import pandas as pd

# One row per hyperparameter combination, with its mean cross-validated test
# score in an "Accuracy" column.
cv_results = grid.cv_results_
grid_results = pd.DataFrame(cv_results["params"]).assign(Accuracy=cv_results["mean_test_score"])
grid_results.head()

**Pivoting the data**

In [None]:
# Reshape the results so rows are max_features values and columns are
# n_estimators values.
# index/columns are passed by keyword: DataFrame.pivot accepts keyword
# arguments only from pandas 2.0 on, so the positional form raises TypeError.
# `values` is deliberately left unset so the result keeps two-level columns
# (value column, n_estimators).
grid_pivot = grid_results.pivot(index='max_features', columns='n_estimators')
grid_pivot

**Preparing X, Y and Z for the contour plot**

In [None]:
# x: values of the second column level (n_estimators); y: row index
# (max_features); z: the pivoted score grid as a 2-D array.
x = grid_pivot.columns.levels[1].to_numpy()
y = grid_pivot.index.to_numpy()
z = grid_pivot.to_numpy()

**2D contour plot**

In [None]:
import plotly.graph_objects as go

# Axis titles for the 2-D plot.
layout = go.Layout(
    xaxis=dict(title=dict(text='n_estimators')),
    yaxis=dict(title=dict(text='max_features')),
)

# Contour of the score grid over the two hyperparameters.
contour_trace = go.Contour(x=x, y=y, z=z)
fig = go.Figure(data=[contour_trace], layout=layout)

fig.update_layout(
    title='Hyperparameter tuning',
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
)

fig.show()

**3D surface plot**

In [None]:
import plotly.graph_objects as go

# Surface of the score grid over the two hyperparameters. The figure is built
# with the `layout` object created for the 2-D plot, and the 3-D axis titles
# are then set through the scene.
surface_trace = go.Surface(x=x, y=y, z=z)
fig = go.Figure(data=[surface_trace], layout=layout)

scene_titles = dict(
    xaxis_title='n_estimators',
    yaxis_title='max_features',
    zaxis_title='Accuracy',
)
fig.update_layout(
    title='Hyperparameter tuning',
    scene=scene_titles,
    autosize=False,
    width=800,
    height=800,
    margin=dict(l=65, r=50, b=65, t=90),
)
fig.show()

# **Loading and preparing test data**

In [None]:
# Load the external test file and apply the same preparation as the training
# data: drop the two identifier columns and encode gender as Male -> 0,
# Female -> 1.
# NOTE: this rebinds X_test. From here on it is the external test file, not
# the hold-out split created by train_test_split, and Y_test no longer lines
# up with it.
X_test = pd.read_csv("/kaggle/input/test-data/test_data.csv")
X_test = X_test.drop(["Unnamed: 0","CustomerID"], axis=1)
# Assign the encoded column back instead of replace(..., inplace=True) on
# X_test["Genre"]. The in-place form mutates an intermediate Series, which
# pandas deprecates and which does not update the frame under copy-on-write.
# Values not in the mapping pass through unchanged, so a re-run is harmless.
gender_codes = {"Male": 0, "Female": 1}
X_test["Genre"] = X_test["Genre"].map(lambda label: gender_codes.get(label, label))
X_test.shape

In [None]:
# Predict labels for X_test as currently bound (the prepared test file from
# the preceding cell) using the best estimator found by the grid search.
Y_pred = grid.best_estimator_.predict(X_test)