### IMPORT NECESSARY LIBRARIES

In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import plotly.express as px

### READ DATA

In [44]:
df = pd.read_csv(r"C:\Users\Black Concept\WorkSpace\HAMOYE TAG ALONG CODES\HAMOYE--TAG-ALONG-CODES\PRACTISE CODES\Datasets\Social_Network_Ads.csv")

# Drop User ID
df.drop('User ID', axis=1, inplace= True)

### BRIEF EDA

In [45]:
# Check the first 5 rows
df.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [46]:
# Check the brief info of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Gender           400 non-null    object
 1   Age              400 non-null    int64 
 2   EstimatedSalary  400 non-null    int64 
 3   Purchased        400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 12.6+ KB


In [47]:
# Statistical summary
df.describe()

Unnamed: 0,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0
mean,37.655,69742.5,0.3575
std,10.482877,34096.960282,0.479864
min,18.0,15000.0,0.0
25%,29.75,43000.0,0.0
50%,37.0,70000.0,0.0
75%,46.0,88000.0,1.0
max,60.0,150000.0,1.0


The data has been cleaned

### DATA DISTRIBUTION

In [48]:
# Age Distribution
fig_age = px.histogram(df, x='Age', nbins=10, title='Age Distribution')
fig_age.show()

# Estimated Salary Distribution
fig_salary = px.histogram(df, x='EstimatedSalary', nbins=10, title='Estimated Salary Distribution')
fig_salary.show()

# Gender Distribution
gender_counts = df['Gender'].value_counts().reset_index()
gender_counts.columns = ['Gender', 'Count']  # Renaming columns for clarity
fig_gender = px.bar(gender_counts, x='Gender', y='Count', title='Gender Distribution')
fig_gender.update_layout(xaxis_title='Gender', yaxis_title='Count')
fig_gender.show()

# Purchase Distribution
purchase_counts = df['Purchased'].value_counts().reset_index()
purchase_counts.columns = ['Purchase', 'Count']  # Renaming columns for clarity
fig_purchase = px.bar(purchase_counts, x='Purchase', y='Count', title='Purchase Distribution')
fig_purchase.update_layout(xaxis_title='Purchase', yaxis_title='Count')
fig_purchase.show()

### OUTLIERS

In [49]:
# Age
outliers_age = px.box(df, y='Age', title='Boxplot For Age')
outliers_age.show()

# Estimated Salary
outliers_estimated_salary = px.box(df, y='EstimatedSalary', title='Boxplot For Estimated Salary')
outliers_estimated_salary.show()

### ESTIMATED SALARY BY GENDER

In [50]:
estimated_salary_gender = df.groupby('Gender')['EstimatedSalary'].sum().reset_index()
estimated_salary_gender_plot = px.bar(estimated_salary_gender, x='Gender', y='EstimatedSalary', title='Estimated Salary By Gender')
estimated_salary_gender_plot.show()

In [51]:
estimated_salary_gender

Unnamed: 0,Gender,EstimatedSalary
0,Female,14639000
1,Male,13258000


### CORRELATION MATRIX

In [52]:
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix

Unnamed: 0,Age,EstimatedSalary,Purchased
Age,1.0,0.155238,0.622454
EstimatedSalary,0.155238,1.0,0.362083
Purchased,0.622454,0.362083,1.0


In [53]:
# Plot heatmap
fig_heatmap = px.imshow(correlation_matrix,
                        text_auto=True,
                        title='Correlation Matrix Heatmap',
                        labels={'color':'Correlation'},
                        x=correlation_matrix.columns,
                        y=correlation_matrix.columns)

fig_heatmap.show()

### ENCODE CATEGORICAL VARIABLES

In [54]:
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

### FEATURE SELECTION

In [55]:
X = df.drop('Purchased', axis=1)
y = df['Purchased']

### DATA SCALING

Benefits of scaling data

1. Improves algorithm performance

2. Fairness in feature selection

In [56]:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

# Convert sacled data to a dataframe
df_X = pd.DataFrame(scaled_X)

# Check the first five rows
df_X.head()

Unnamed: 0,0,1,2
0,1.020204,-1.781797,-1.490046
1,1.020204,-0.253587,-1.460681
2,-0.980196,-1.113206,-0.78529
3,-0.980196,-1.017692,-0.374182
4,1.020204,-1.781797,0.183751


### SPLIT THE DATA

In [57]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)

### MODEL TRAINING

In [63]:
model = LogisticRegression(C=0.1, max_iter=500)
model.fit(X_train, y_train)

### MAKE PREDICTION

In [64]:
y_prediction = model.predict(X_test)

### MODEL COEFFICIENT AND BIAS

In [67]:
weight_coefficient = model.coef_
weight_coefficient

array([[0.0621004 , 1.39046345, 0.79305836]])

In [68]:
bias = model.intercept_
bias

array([-0.86853713])

### MODEL ACCURACY

In [70]:
train_accuracy = model.score(X_train, y_train)
train_accuracy

0.8125

In [71]:
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.8625

In [72]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90        52
           1       0.90      0.68      0.78        28

    accuracy                           0.86        80
   macro avg       0.88      0.82      0.84        80
weighted avg       0.87      0.86      0.86        80



In [74]:
cm = confusion_matrix(y_test, y_prediction)
cm

array([[50,  2],
       [ 9, 19]], dtype=int64)

In [75]:
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
cm_df

Unnamed: 0,Predicted Positive,Predicted Negative
Actual Positive,50,2
Actual Negative,9,19
