## Classification

### Import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Get the data from dataset

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
print(df)

      User ID  Gender  Age  EstimatedSalary  Purchased
0    15624510    Male   19            19000          0
1    15810944    Male   35            20000          0
2    15668575  Female   26            43000          0
3    15603246  Female   27            57000          0
4    15804002    Male   19            76000          0
..        ...     ...  ...              ...        ...
395  15691863  Female   46            41000          1
396  15706071    Male   51            23000          1
397  15654296  Female   50            20000          1
398  15755018    Male   36            33000          0
399  15594041  Female   49            36000          1

[400 rows x 5 columns]


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [4]:
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


### EDA

In [5]:
df.cov()

  df.cov()


Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,5134915000.0,-541.68287,173714300.0,244.836284
Age,-541.6829,109.890702,55487.38,3.131165
EstimatedSalary,173714300.0,55487.380952,1162603000.0,5924.367168
Purchased,244.8363,3.131165,5924.367,0.230269


In [6]:
df.corr()

  df.corr()


Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
User ID,1.0,-0.000721,0.071097,0.00712
Age,-0.000721,1.0,0.155238,0.622454
EstimatedSalary,0.071097,0.155238,1.0,0.362083
Purchased,0.00712,0.622454,0.362083,1.0


### data cleansing

In [7]:
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

### prepare the data

In [16]:
# decide the independent variable(s)
x = df[['Age', 'EstimatedSalary']]
# decide the dependent variable
y = df['Purchased']

### Split the data into train and test

In [65]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=123456)

### Create the model using linear regression

In [66]:
from sklearn.linear_model import LinearRegression

# create empty model
model = LinearRegression()

# train the model
model.fit(x_train, y_train)

### Evaluate the model using Linear Regression

In [67]:
# consider the y_test as y_true
y_true = y_test

# predict the values or x_test
y_pred_temp = model.predict(x_test)

y_pred_list = []
for value in y_pred_temp:
    if value >= 0.5:
        y_pred_list.append(1)
    else:
        y_pred_list.append(0)
        
y_pred = np.array(y_pred_list)

In [68]:
y_true

315    1
183    0
197    0
335    0
357    0
      ..
243    1
253    1
181    0
12     0
24     1
Name: Purchased, Length: 80, dtype: int64

In [69]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0])

In [70]:
# confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)
cm

array([[55,  1],
       [ 8, 16]], dtype=int64)

In [71]:
accuarcy = (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
accuarcy

0.8875

In [72]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true, y_pred)
accuracy

0.8875

In [73]:
tn, fp, fn , tp = cm.ravel()
print(tn)
print(fp)
print(fn)
print(tp)

55
1
8
16


### Create the model using Logistic RegressionCV

In [74]:
from sklearn.linear_model import LogisticRegressionCV

# create empty model
model = LogisticRegressionCV()

# train the model
model.fit(x_train, y_train)

### Evaluate the model using Logistic RegressionCV

In [75]:
# consider the y_test as y_true
y_true = y_test

# predict the values or x_test
y_pred = model.predict(x_test)

In [76]:
y_true

315    1
183    0
197    0
335    0
357    0
      ..
243    1
253    1
181    0
12     0
24     1
Name: Purchased, Length: 80, dtype: int64

In [77]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [78]:
# confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, y_pred)
cm

array([[55,  1],
       [10, 14]], dtype=int64)

In [79]:
accuracy = (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])
accuracy

0.8625

In [80]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true, y_pred)
accuracy

0.8625

In [81]:
tn, fp, fn, tp = cm.ravel()
print(tn)
print(fp)
print(fn)
print(tp)

55
1
10
14
