In [1]:
#### Dataset Link - https://drive.google.com/file/d/1OUbjZvFJT6yrLtZAJBsv7BCPttX2-Ru8/view

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("/content/Social_Network_Ads.csv")
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
data['Purchased'].unique()

array([0, 1])

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [6]:
gender = pd.get_dummies(data["Gender"],drop_first=True)
gender

Unnamed: 0,Male
0,1
1,1
2,0
3,0
4,1
...,...
395,0
396,1
397,0
398,1


In [7]:
data['Gender'] = gender
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0


In [8]:
data.describe()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0,400.0
mean,15691540.0,0.49,37.655,69742.5,0.3575
std,71658.32,0.500526,10.482877,34096.960282,0.479864
min,15566690.0,0.0,18.0,15000.0,0.0
25%,15626760.0,0.0,29.75,43000.0,0.0
50%,15694340.0,0.0,37.0,70000.0,0.0
75%,15750360.0,1.0,46.0,88000.0,1.0
max,15815240.0,1.0,60.0,150000.0,1.0


In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data['Age'] = scaler.fit_transform(data[['Age']])
data['EstimatedSalary'] = scaler.fit_transform(data[['EstimatedSalary']])

In [10]:
data.describe()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0,400.0
mean,15691540.0,0.49,-7.105427000000001e-17,-1.7763570000000002e-17,0.3575
std,71658.32,0.500526,1.001252,1.001252,0.479864
min,15566690.0,0.0,-1.877311,-1.607506,0.0
25%,15626760.0,0.0,-0.7550313,-0.7852897,0.0
50%,15694340.0,0.0,-0.0625611,0.007561451,0.0
75%,15750360.0,1.0,0.7970571,0.5361289,1.0
max,15815240.0,1.0,2.134241,2.35675,1.0


In [11]:
x = data[['Gender','Age','EstimatedSalary']]   ## feature
y = data['Purchased']   ## target

In [12]:
print(x.shape)
print(y.shape)

(400, 3)
(400,)


In [13]:
y.value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [14]:
#### HoldOut Validation Approach

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size=0.8,random_state=0)
model = LogisticRegression()
model.fit(xtrain,ytrain)
result = model.score(xtest,ytest)
print(result)

0.9125


In [16]:
### K-Fold Cross Validation
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression()
kfold = KFold(10)
results = cross_val_score(model,x,y,cv=kfold)
print(results)
print(np.mean(results))

[0.675 0.9   0.925 0.9   0.925 0.725 0.925 0.75  0.925 0.6  ]
0.825


In [17]:
### Stratified K-Fold Cross Validation
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression()
skfold = StratifiedKFold(10)
results = cross_val_score(model,x,y,cv=skfold)
print(results)
print(np.mean(results))

[0.675 0.675 0.975 0.95  1.    0.9   0.8   0.775 0.8   0.7  ]
0.825


In [18]:
### Leave One Out Cross Validation (LOOCV)
from sklearn.model_selection import LeaveOneOut,cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression()
loocv = LeaveOneOut()
results = cross_val_score(model,x,y,cv=loocv)
print(results)
print(np.mean(results))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1.
 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1.

In [19]:
### Repeated Random Train Test Split (Shuffle Split)
from sklearn.model_selection import ShuffleSplit,cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
model = LogisticRegression()
shuffle = ShuffleSplit(n_splits=10,train_size=0.8,random_state=0)
results = cross_val_score(model,x,y,cv=shuffle)
print(results)
print(np.mean(results))

[0.9125 0.7875 0.8375 0.8125 0.8125 0.8    0.8    0.8875 0.875  0.9   ]
0.8424999999999999
