## Step 1 - Project Problem statement

## Step 2 - Data Gathering

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.svm import SVC,SVR
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv(r"D:\python\Project\ML 8th Project\Social_Network_Ads.csv")
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0
...,...,...,...,...,...
395,15691863,Female,46.0,41000.0,1
396,15706071,Male,51.0,23000.0,1
397,15654296,Female,50.0,20000.0,1
398,15755018,Male,36.0,33000.0,0


In [3]:
df.shape # shape of dataframe

(400, 5)

## Step 3 - Data Cleaning

In [4]:
df.duplicated(keep = "first").value_counts() # check out the duplicate value in dataframe

False    400
dtype: int64

In [5]:
# total number of rows = 400 and duplicate value count is also 400
# so we can say,there is no duplicate value available in data set.

In [6]:
# let's check the values present in the purchased 

df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [7]:
# 0 = 257 & 1 = 143 it shows imbalance structure of data.

## Step 4 - EDA

In [8]:
# df.describe()
# Check out the discription of dataset,
# it shows the count, mean and standard deviation of all features separately with 
# minimum and maximum value available in that feature

In [9]:
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [10]:
df.isna().sum() # check null value count of every feature

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [11]:
# it clearly indicates that there is no null value in dataset.

### SMOTE (Data Balancing)

In [12]:
# df.info()
# It gives the information of dataset
# It shows the non null count and datatypes of every feature

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   User ID          400 non-null    int64  
 1   Gender           400 non-null    object 
 2   Age              400 non-null    float64
 3   EstimatedSalary  400 non-null    float64
 4   Purchased        400 non-null    int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 15.8+ KB


In [13]:
# User ID is irrevent column

df.drop("User ID", axis=1, inplace=True)

In [14]:
df["Gender"].value_counts()

Female    204
Male      196
Name: Gender, dtype: int64

In [15]:
df['Gender'].replace({'Female':0,'Male':1}, inplace=True)

In [16]:
from imblearn.over_sampling import SMOTE

In [17]:
x = df.drop("Purchased", axis = 1)
y = df["Purchased"]

In [18]:
smt = SMOTE()
x_sample, y_sample = smt.fit_resample(x,y)
y_sample.value_counts()

0    257
1    257
Name: Purchased, dtype: int64

In [19]:
x_sample # independent feature

Unnamed: 0,Gender,Age,EstimatedSalary
0,1,19.000000,19000.000000
1,1,35.000000,20000.000000
2,0,26.000000,43000.000000
3,0,27.000000,57000.000000
4,1,19.000000,76000.000000
...,...,...,...
509,0,46.844083,89307.988172
510,0,52.897159,95443.725345
511,0,53.769783,41554.984535
512,0,28.960439,136215.824541


In [20]:
y_sample # dependent feature

0      0
1      0
2      0
3      0
4      0
      ..
509    1
510    1
511    1
512    1
513    1
Name: Purchased, Length: 514, dtype: int64

## Data Splitting

In [21]:
# Split the dependent as well as independent feature as a train and test data

x_train,x_test,y_train,y_test = train_test_split(x_sample,y_sample,test_size=0.25,random_state=1)

In [22]:
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

(385, 3) (129, 3) (385,) (129,)


In [23]:
x_train

Unnamed: 0,Gender,Age,EstimatedSalary
173,0,34.0,43000.0
80,1,30.0,80000.0
46,1,25.0,79000.0
246,0,35.0,50000.0
93,0,29.0,28000.0
...,...,...,...
255,0,52.0,90000.0
72,0,20.0,23000.0
396,1,51.0,23000.0
235,1,46.0,79000.0


In [24]:
x_test

Unnamed: 0,Gender,Age,EstimatedSalary
47,0,27.000000,54000.000000
346,1,53.000000,72000.000000
284,1,48.000000,141000.000000
221,1,35.000000,91000.000000
505,1,36.078179,144000.000000
...,...,...,...
201,1,49.000000,74000.000000
291,1,49.000000,89000.000000
326,1,41.000000,72000.000000
295,0,36.000000,63000.000000


## x_sample

In [25]:
df.columns # Checking columns/features in the dataset

Index(['Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

## Step 5 - Feature Engineering

## 1.Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
SS = StandardScaler()
x_train = SS.fit_transform(x_train)
x_test = SS.transform(x_test)

### 2. Outlier Handling

In [28]:
import warnings
warnings.filterwarnings('ignore')

## Step 6 - Feature Engineering

## Step 7 - Model Training

In [29]:
def train_model(model_name, x_train,x_test,y_train,y_test):
    """This function is for model trainingn"""    
    model_name.fit(x_train,y_train)   ### Model Training
    
    
    ############### model evaluation 
    
    ########### Test Data Evaluation 
    print('#'*50)
    print(f"TESTING DATA EVALUATION")
    y_pred_test = model_name.predict(x_test)
    acc_score = accuracy_score(y_test,y_pred_test)
    cnf_matrix = confusion_matrix(y_test,y_pred_test)
    clf_report = classification_report(y_test,y_pred_test)
    
    print(f"Accuracy_Score = {acc_score}")
    print(f"Confusion Matrix = \n{cnf_matrix}")
    print(f"Classification Report = \n{clf_report}")
    
    print('#'*50)
    print(f"TRAINING DATA EVALUATION")
    print()
    print()
    ########### training Data Evaluation 
    y_pred_train = model_name.predict(x_train)
    acc_score = accuracy_score(y_train,y_pred_train)
    cnf_matrix = confusion_matrix(y_train,y_pred_train)
    clf_report = classification_report(y_train,y_pred_train)
    
    print(f"Accuracy_Score = {acc_score}")
    print(f"Confusion Matrix = \n{cnf_matrix}")
    print(f"Classification Report = \n{clf_report}")
    
    return "Success"

## Step 8 - Model Training

### 1. Logestic Regression

In [30]:
lgr_model = LogisticRegression(max_iter=500)

train_model(lgr_model,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.8604651162790697
Confusion Matrix = 
[[49  6]
 [12 62]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.80      0.89      0.84        55
           1       0.91      0.84      0.87        74

    accuracy                           0.86       129
   macro avg       0.86      0.86      0.86       129
weighted avg       0.87      0.86      0.86       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.8363636363636363
Confusion Matrix = 
[[172  30]
 [ 33 150]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       202
           1       0.83      0.82      0.83       183

    accuracy                           0.84       385
   macro avg       0.84      0.84      0.84       385
weighted avg       0.84      0.84      0.84 

'Success'

## 2. K nearest Neighbour

In [31]:
knn_model = KNeighborsClassifier()
train_model(knn_model,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9069767441860465
Confusion Matrix = 
[[47  8]
 [ 4 70]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.92      0.85      0.89        55
           1       0.90      0.95      0.92        74

    accuracy                           0.91       129
   macro avg       0.91      0.90      0.90       129
weighted avg       0.91      0.91      0.91       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9298701298701298
Confusion Matrix = 
[[185  17]
 [ 10 173]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       202
           1       0.91      0.95      0.93       183

    accuracy                           0.93       385
   macro avg       0.93      0.93      0.93       385
weighted avg       0.93      0.93      0.93 

'Success'

## 2.1 Hyper Parameter - KNN

In [32]:
hyp= {'n_neighbors':np.arange(3,20),'p':[1,2]}

rscv_knn = RandomizedSearchCV(knn_model,hyp, cv=7)
rscv_knn.fit(x_train,y_train)

In [33]:
knn_model_hyp = rscv_knn.best_estimator_
knn_model_hyp.fit(x_train,y_train)

In [34]:
train_model(knn_model_hyp,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.8992248062015504
Confusion Matrix = 
[[48  7]
 [ 6 68]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        55
           1       0.91      0.92      0.91        74

    accuracy                           0.90       129
   macro avg       0.90      0.90      0.90       129
weighted avg       0.90      0.90      0.90       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9246753246753247
Confusion Matrix = 
[[184  18]
 [ 11 172]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       202
           1       0.91      0.94      0.92       183

    accuracy                           0.92       385
   macro avg       0.92      0.93      0.92       385
weighted avg       0.93      0.92      0.92 

'Success'

## 3. Decision Tree

In [35]:
dt_model = DecisionTreeClassifier(random_state=10)
train_model(dt_model,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.8914728682170543
Confusion Matrix = 
[[48  7]
 [ 7 67]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.87      0.87      0.87        55
           1       0.91      0.91      0.91        74

    accuracy                           0.89       129
   macro avg       0.89      0.89      0.89       129
weighted avg       0.89      0.89      0.89       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9974025974025974
Confusion Matrix = 
[[202   0]
 [  1 182]]
Classification Report = 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       202
           1       1.00      0.99      1.00       183

    accuracy                           1.00       385
   macro avg       1.00      1.00      1.00       385
weighted avg       1.00      1.00      1.00 

'Success'

### 3.1 Hyper Parameter - Decision Tree 

In [36]:
hyp= {'criterion':["gini","entropy"],
     'max_depth':np.arange(3,20),
     'min_samples_split':np.arange(2,20),
     'min_samples_leaf':np.arange(2,10)
     }

rscv = RandomizedSearchCV(dt_model,hyp, cv= 5)
rscv.fit(x_train,y_train)

In [37]:
dt_model_hyp = rscv.best_estimator_
dt_model_hyp.fit(x_train,y_train)

In [38]:
train_model(dt_model_hyp,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9224806201550387
Confusion Matrix = 
[[49  6]
 [ 4 70]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.92      0.89      0.91        55
           1       0.92      0.95      0.93        74

    accuracy                           0.92       129
   macro avg       0.92      0.92      0.92       129
weighted avg       0.92      0.92      0.92       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9272727272727272
Confusion Matrix = 
[[187  15]
 [ 13 170]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       202
           1       0.92      0.93      0.92       183

    accuracy                           0.93       385
   macro avg       0.93      0.93      0.93       385
weighted avg       0.93      0.93      0.93 

'Success'

## 4. Random Forest

In [39]:
rf_model = RandomForestClassifier(random_state=2)
train_model(rf_model,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9302325581395349
Confusion Matrix = 
[[50  5]
 [ 4 70]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92        55
           1       0.93      0.95      0.94        74

    accuracy                           0.93       129
   macro avg       0.93      0.93      0.93       129
weighted avg       0.93      0.93      0.93       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9974025974025974
Confusion Matrix = 
[[201   1]
 [  0 183]]
Classification Report = 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       202
           1       0.99      1.00      1.00       183

    accuracy                           1.00       385
   macro avg       1.00      1.00      1.00       385
weighted avg       1.00      1.00      1.00 

'Success'

### 4.1 Hyper Parameter - Random Forest

In [40]:
hyp= {'criterion':["gini","entropy"],
      "n_estimators": np.arange(10,400,10),
     'max_depth':np.arange(2,10),
     'min_samples_split':np.arange(2,20),
     'min_samples_leaf':np.arange(2,10)
     }

rscv_rf = RandomizedSearchCV(rf_model,hyp, cv= 5)
rscv_rf.fit(x_train,y_train)

In [41]:
rf_model_hyp = rscv_rf.best_estimator_
rf_model_hyp.fit(x_train,y_train)

In [42]:
train_model(rf_model_hyp,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.937984496124031
Confusion Matrix = 
[[49  6]
 [ 2 72]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.96      0.89      0.92        55
           1       0.92      0.97      0.95        74

    accuracy                           0.94       129
   macro avg       0.94      0.93      0.94       129
weighted avg       0.94      0.94      0.94       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9272727272727272
Confusion Matrix = 
[[183  19]
 [  9 174]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       202
           1       0.90      0.95      0.93       183

    accuracy                           0.93       385
   macro avg       0.93      0.93      0.93       385
weighted avg       0.93      0.93      0.93  

'Success'

## 5. Support Vector Machine

In [43]:
svc_model = SVC(random_state=1)
train_model(svc_model,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.9224806201550387
Confusion Matrix = 
[[49  6]
 [ 4 70]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.92      0.89      0.91        55
           1       0.92      0.95      0.93        74

    accuracy                           0.92       129
   macro avg       0.92      0.92      0.92       129
weighted avg       0.92      0.92      0.92       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9090909090909091
Confusion Matrix = 
[[182  20]
 [ 15 168]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       202
           1       0.89      0.92      0.91       183

    accuracy                           0.91       385
   macro avg       0.91      0.91      0.91       385
weighted avg       0.91      0.91      0.91 

'Success'

## Final Model Selection

In [44]:
train_model(rf_model_hyp,x_train, x_test, y_train,y_test)

##################################################
TESTING DATA EVALUATION
Accuracy_Score = 0.937984496124031
Confusion Matrix = 
[[49  6]
 [ 2 72]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.96      0.89      0.92        55
           1       0.92      0.97      0.95        74

    accuracy                           0.94       129
   macro avg       0.94      0.93      0.94       129
weighted avg       0.94      0.94      0.94       129

##################################################
TRAINING DATA EVALUATION


Accuracy_Score = 0.9272727272727272
Confusion Matrix = 
[[183  19]
 [  9 174]]
Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       202
           1       0.90      0.95      0.93       183

    accuracy                           0.93       385
   macro avg       0.93      0.93      0.93       385
weighted avg       0.93      0.93      0.93  

'Success'

In [45]:
import pickle
model = pickle.dump(rf_model_hyp,open('Social_Network_Ads_model.pkl','wb'))

## User Define Function

In [46]:
def prediction(user_input_data):
    input_data = SS.transform(user_input_data)
    result = rf_model_hyp.predict(input_data)
    if result==0:
        return ("Purchase Result = Not Purchased")
    else:
        return ("Purchase Result = Purchased")

In [47]:
df.iloc[[47]]

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
47,0,27.0,54000.0,0


In [48]:
user_input_data=[[0,27,5400]]

In [49]:
prediction(user_input_data)

'Purchase Result = Not Purchased'

In [50]:
df_new = df.drop("Purchased",axis=1)
columns = df_new.columns
print(len(columns))
columns.to_list()

3


['Gender', 'Age', 'EstimatedSalary']

In [51]:
columns_dict = {"col_name": columns.to_list()}
columns_dict

{'col_name': ['Gender', 'Age', 'EstimatedSalary']}

In [52]:
import json

In [53]:
with open('columns_name.json','w') as json_file:
    json.dump(columns_dict,json_file)