In [1]:
import os
import sys
import sklearn
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from scipy import stats
from random import seed
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler,PolynomialFeatures,scale
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.model_selection import cross_val_score,train_test_split,KFold,GridSearchCV
from sklearn.base import BaseEstimator,ClassifierMixin
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,classification_report,roc_auc_score,roc_curve,auc,r2_score,mean_squared_error
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv('Customer.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
df.set_index('User ID',inplace=True)

In [4]:
df['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 15624510 to 15594041
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   EstimatedSalary  400 non-null    int64 
 3   Purchased        400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.6+ KB


In [6]:
df.isnull().sum()

Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [7]:
sorted(df['Age'].unique())

[18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60]

In [8]:
df['Age'] = pd.cut(df['Age'], 
                                bins = [17, 30, 45, 61], 
                                labels = ['Young', 'Adult', 'Old'])



In [9]:
df["Age"].unique()

['Young', 'Adult', 'Old']
Categories (3, object): ['Young' < 'Adult' < 'Old']

In [10]:
df.head(1)

Unnamed: 0_level_0,Gender,Age,EstimatedSalary,Purchased
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15624510,Male,Young,19000,0


In [11]:
ls = LabelEncoder()
df['Age'] = ls.fit_transform(df['Age'])
df['Gender'] = ls.fit_transform(df['Gender'])

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 15624510 to 15594041
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Gender           400 non-null    int32
 1   Age              400 non-null    int32
 2   EstimatedSalary  400 non-null    int64
 3   Purchased        400 non-null    int64
dtypes: int32(2), int64(2)
memory usage: 12.5 KB


In [13]:
ss = StandardScaler()
EstimatedSalary = df[['EstimatedSalary']]
s = ss.fit_transform(EstimatedSalary)
df['EstimatedSalary'] = s
df.head(1)

Unnamed: 0_level_0,Gender,Age,EstimatedSalary,Purchased
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15624510,1,2,-1.490046,0


In [14]:
df.head(1)

Unnamed: 0_level_0,Gender,Age,EstimatedSalary,Purchased
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15624510,1,2,-1.490046,0


In [15]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

In [16]:
X = df.drop(columns='Purchased')
y = df[["Purchased"]]

In [17]:
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size=0.3,random_state=42)

In [18]:
X_train_smote,y_train_smote = smote.fit_resample(X,y)

In [19]:
print("After SMOTE: ", y_train_smote["Purchased"].value_counts())

After SMOTE:  0    257
1    257
Name: Purchased, dtype: int64


In [20]:
svclassifier = SVC(kernel="rbf",degree=10)
svclassifier.fit(X_train_smote,np.ravel(y_train_smote))

In [21]:
yhat_svm = svclassifier.predict(X_test)
print(classification_report(y_test, yhat_svm))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95        73
           1       0.92      0.94      0.93        47

    accuracy                           0.94       120
   macro avg       0.94      0.94      0.94       120
weighted avg       0.94      0.94      0.94       120

