In [2]:
import os

import kagglehub
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from scipy import stats
from sklearn.model_selection import train_test_split
# Apply the 'fivethirtyeight' matplotlib style to the figures drawn below.
plt.style.use('fivethirtyeight')

# Download the latest version of the dataset through kagglehub; `path` is the
# value returned by dataset_download and is printed for reference.
path = kagglehub.dataset_download("dragonheir/logistic-regression")

print("Path to dataset files:", path)
# List the entries of the download directory; the loading cell reads files[0].
files = os.listdir(path)
print(files)

Path to dataset files: C:\Users\DELL\.cache\kagglehub\datasets\dragonheir\logistic-regression\versions\1
['Social_Network_Ads.csv']


In [3]:
# Read the first listed file, index the rows by "User ID", and shorten the
# salary column name in the same expression (no in-place mutation).
train_data = os.path.join(path, files[0])
df = pd.read_csv(train_data, index_col="User ID").rename(
    columns={'EstimatedSalary': 'Salary'}
)

In [4]:
# Column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400 entries, 15624510 to 15594041
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Gender     400 non-null    object
 1   Age        400 non-null    int64 
 2   Salary     400 non-null    int64 
 3   Purchased  400 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.6+ KB


In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# First rows, as a quick look at the raw values.
df.head()

# Univariate

In [None]:
# Share of each gender among all customers.
# Fix: the title previously read "Costumers".
gender_counts = df['Gender'].value_counts()
gender_counts.plot.pie(
    autopct="%.1f%%",
    startangle=90,
    counterclock=False,
    colors=["steelblue", "gray"],
    ylabel="",
    title="Customers Distribution by Gender",
    wedgeprops={"edgecolor": "black"},
)

plt.show()

## Age

In [None]:
# Central tendency of the Age column.
age = df['Age']
print("Mean:", age.mean())
print("Median:", age.median())

In [None]:
# Age distribution as a box plot (top) and a histogram (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 10))  # 2 rows, 1 column

# 5-year bin edges derived from the data. With fixed edges such as
# range(20, 61, 5), hist() silently skips every age that lies outside them.
age_bin_width = 5
age_lo = int(np.floor(df['Age'].min() / age_bin_width)) * age_bin_width
age_hi = int(np.ceil(df['Age'].max() / age_bin_width)) * age_bin_width
# max(...) guarantees at least one full bin even if all ages are identical.
age_bins = range(age_lo, max(age_hi, age_lo + age_bin_width) + 1, age_bin_width)

# --- Boxplot ---
df['Age'].plot.box(
    ax=ax1,
    vert=False,
    patch_artist=True,
    boxprops=dict(facecolor="lightblue", color="k"),
)
ax1.set_title("Age Distribution (Boxplot)")
ax1.set_xlabel("Age")

# --- Histogram ---
df['Age'].plot.hist(
    ax=ax2,
    edgecolor='k',
    bins=age_bins,
    color="skyblue",
    alpha=0.8,
)
ax2.set_title("Age Distribution (Histogram)")
ax2.set_xlabel("Age")
ax2.set_ylabel("Number of People")

plt.tight_layout()
plt.show()


## Salary

In [None]:
# Salary distribution as a box plot (top) and a histogram (bottom).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(9, 10))  # 2 rows, 1 column

# 5000-wide bin edges derived from the data. With fixed edges such as
# range(15000, 140001, 5000), hist() silently skips every salary outside them.
salary_bin_width = 5000
salary_lo = int(np.floor(df['Salary'].min() / salary_bin_width)) * salary_bin_width
salary_hi = int(np.ceil(df['Salary'].max() / salary_bin_width)) * salary_bin_width
# max(...) guarantees at least one full bin even if all salaries are identical.
salary_bins = range(
    salary_lo,
    max(salary_hi, salary_lo + salary_bin_width) + 1,
    salary_bin_width,
)

# --- Boxplot ---
df['Salary'].plot.box(
    ax=ax1,
    vert=False,
    patch_artist=True,
    boxprops=dict(facecolor="lightblue", color="k"),
)
ax1.set_title("Salary Distribution (Boxplot)")
ax1.set_xlabel("Salary")

# --- Histogram ---
df['Salary'].hist(
    ax=ax2,
    edgecolor='k',
    bins=salary_bins,
    color="skyblue",
    alpha=0.8,
)
ax2.set_title("Salary Distribution (Histogram)")
ax2.set_xlabel("Salary")
ax2.set_ylabel("Number of People")

plt.tight_layout()
plt.show()


In [None]:
# Column labels of the frame.
df.columns

## Purchased

In [None]:
# Share of each Purchased value among all customers.
purchase_counts = df['Purchased'].value_counts()
purchase_counts.plot.pie(
    autopct='%1.1f%%',               # percentages with one decimal
    startangle=90,                   # first wedge starts at the top
    counterclock=False,              # wedges run clockwise
    colors=["skyblue", "orange"],
    wedgeprops={'edgecolor': 'k'},   # black wedge borders
    ylabel="",                       # a pie needs no y-label
    title="Purchase Distribution",
)
plt.show()

# Bivariate

In [None]:
# Boolean mask that is True on rows where Purchased equals 1.
buyers = df['Purchased'].eq(1)
# Rows grouped by their Purchased value; get_group(1) selects the buyers.
costumers_group = df.groupby(by='Purchased')

## Gender

In [None]:
# Proportion of each gender among the rows with Purchased == 1.
buyers_gender = costumers_group.get_group(1)['Gender']
buyers_gender.value_counts(normalize=True)

In [None]:
# Gender split restricted to the rows selected by the `buyers` mask.
buyer_gender_counts = df.loc[buyers, "Gender"].value_counts()
pie_options = dict(
    autopct="%.1f%%",
    startangle=90,
    counterclock=False,
    colors=["steelblue", "gray"],
    ylabel="",
    title="Buyers Distribution by Gender",
    wedgeprops={"edgecolor": "black"},
)
buyer_gender_counts.plot.pie(**pie_options)
plt.show()

## Age

In [None]:
# Overlay the age histogram of the buyers (Purchased == 1) on the histogram
# of all customers, using the same bin edges for both.
age_data = costumers_group.get_group(1)['Age']

# 5-year bin edges derived from the full Age column. With fixed edges such as
# range(20, 61, 5), hist() silently skips every age that lies outside them.
age_bin_width = 5
age_lo = int(np.floor(df['Age'].min() / age_bin_width)) * age_bin_width
age_hi = int(np.ceil(df['Age'].max() / age_bin_width)) * age_bin_width
# max(...) guarantees at least one full bin even if all ages are identical.
age_bins = range(age_lo, max(age_hi, age_lo + age_bin_width) + 1, age_bin_width)

fig, ax = plt.subplots(figsize=(9, 6))

# Histogram for overall ages
ax.hist(
    df['Age'].dropna(),
    bins=age_bins,
    edgecolor='k',
    color="lightblue",
    alpha=0.6,
    label="All Customers",
)

# Histogram for buyers' ages
ax.hist(
    age_data.dropna(),
    bins=age_bins,
    edgecolor='k',
    color="orange",
    alpha=0.7,
    label="Buyers",
)

# Labels and legend
ax.set_title("Age Distribution: All Customers vs Buyers")
ax.set_xlabel("Age")
ax.set_ylabel("Number of Customers")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Column dtypes and non-null counts (same summary as printed after loading).
df.info()

In [None]:
# Overlay the salary histogram of the buyers (Purchased == 1) on the histogram
# of all customers, using the same bin edges for both.

# 5000-wide bin edges derived from the full Salary column. With fixed edges such
# as range(15000, 140001, 5000), hist() silently skips salaries outside them.
salary_bin_width = 5000
salary_lo = int(np.floor(df['Salary'].min() / salary_bin_width)) * salary_bin_width
salary_hi = int(np.ceil(df['Salary'].max() / salary_bin_width)) * salary_bin_width
# max(...) guarantees at least one full bin even if all salaries are identical.
salary_bins = range(
    salary_lo,
    max(salary_hi, salary_lo + salary_bin_width) + 1,
    salary_bin_width,
)

fig, ax = plt.subplots(figsize=(9, 6))

# Histogram for all salaries
df['Salary'].plot.hist(
    ax=ax,
    bins=salary_bins,
    alpha=0.6,
    color="skyblue",
    edgecolor="k",
    label="All Customers",
)

# Histogram for buyers' salaries; the legend label now matches the age plot
# ("Buyers" instead of "Group 1").
costumers_group.get_group(1)['Salary'].plot.hist(
    ax=ax,
    bins=salary_bins,
    alpha=0.7,
    color="orange",
    edgecolor="k",
    label="Buyers",
)

# Titles and labels
ax.set_title("Salary Distribution: All Customers vs Buyers")
ax.set_xlabel("Salary")
ax.set_ylabel("Number of Customers")
ax.legend()
fig.tight_layout()
plt.show()



In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
numeric_columns = df.select_dtypes(include=[np.number])
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 6))
heatmap_options = dict(
    annot=True,        # write the coefficient in each cell
    fmt=".2f",         # two decimals
    cmap="coolwarm",
    center=0,          # colour scale centred on zero
    cbar=True,
    linewidths=0.5,
    linecolor='k',
)
sns.heatmap(corr, ax=ax, **heatmap_options)

ax.set_title("Correlation Matrix of Numeric Features")
plt.show()

## Feature Relationship

In [None]:
# One point per customer: Age on the x-axis, Salary on the y-axis.
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(df['Age'], df['Salary'], alpha=0.5, color="skyblue", edgecolor="k")

ax.set_title("Salary vs Age (Individual Data)")
ax.set_xlabel("Age")
ax.set_ylabel("Salary")
plt.show()


In [None]:
# Median salary for each distinct age.
# Fix: the title and y-label previously said "Average" although the plotted
# statistic is the median.
age_salary = df.groupby('Age')['Salary'].median()

fig, ax = plt.subplots(figsize=(9, 6))
age_salary.plot(kind='line', color="orange", ax=ax)

ax.set_title("Median Salary by Age")
ax.set_xlabel("Age")
ax.set_ylabel("Median Salary")
plt.show()


# Model

## Encoding Gender

In [None]:
# Encode Gender as 1 for 'Female' and 0 for every other value.
# The guard makes the cell safe to re-run: once the column is numeric,
# comparing it with 'Female' again would turn every row into 0.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = (df['Gender'] == 'Female').astype(int)

## Splitting the data

In [None]:
# Feature matrix: every column except the target, in the frame's column order.
X = df.drop(columns=['Purchased']).to_numpy()
# Target vector: the Purchased column.
y = df['Purchased'].to_numpy()

In [None]:
# First five rows of the feature matrix.
print(X[:5])

In [None]:
# 70/30 split, stratified on the target and seeded for reproducibility.
# train_test_split is imported explicitly with the other imports:
# `import sklearn` alone does not guarantee that the `sklearn.model_selection`
# submodule is loaded on every scikit-learn version.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.70, random_state=42, stratify=y
)
print("Train:", x_train.shape, "Test:", x_test.shape)

In [None]:
# Per-feature mean and standard deviation, computed on the training split only.
x_mean, x_std = x_train.mean(axis=0), x_train.std(axis=0)

# Standardised training features used to fit the model.
x_norm = (x_train - x_mean) / x_std


## Implementing the model

In [None]:
def sigmoid(z):
    '''
    Numerically stable logistic function 1 / (1 + exp(-z)).

    Args:
    z (scalar or ndarray): shape (m,), the linear combination

    Returns:
    s (ndarray): shape (m,), sigmoid of the linear combination
                 (a NumPy scalar when z is a scalar)
    '''
    z = np.asarray(z, dtype=float)
    # exp(-|z|) never exceeds 1, so it cannot overflow; the naive exp(-z)
    # overflows (RuntimeWarning) for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    s = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return s[()]

In [None]:
def predict(x, w, b, threshold=0.4):
    '''
    Classify examples with the logistic model.

    Args:
    x (ndarray): shape (m, n). m examples with n features
    w (ndarray): shape (n,). model parameters
    b (scalar): model parameter (bias)
    threshold (float): minimum sigmoid output that is labelled 1

    Returns:
    labels (ndarray): shape (m,). 1 where sigmoid(x @ w + b) >= threshold, else 0
    '''
    probabilities = sigmoid(x @ w + b)
    is_positive = probabilities >= threshold
    return is_positive.astype(int)

In [None]:
def cost(x, y, w, b):
    '''
    Mean binary cross-entropy of the logistic model.

    Args:
    x (ndarray): shape (m, n). m examples with n features
    y (ndarray): shape (m,). true labels (0 or 1)
    w (ndarray): shape (n,). model parameters
    b (scalar): model parameter (bias)

    Returns:
    c (scalar): the cost of the model
    '''
    # Keep probabilities strictly inside (0, 1) so neither log receives 0.
    probabilities = np.clip(sigmoid(x @ w + b), 1e-10, 1 - 1e-10)

    # Per-example log-likelihood, shape (m,).
    log_likelihood = y * np.log(probabilities) + (1 - y) * np.log(1 - probabilities)

    return -1 * np.mean(log_likelihood)

In [None]:
def gradient_calc(x, y, w, b):
    '''
    Gradient of the logistic cost with respect to w and b.

    Args:
    x (ndarray): shape (m, n). m examples with n features
    y (ndarray): shape (m,). true labels (0 or 1)
    w (ndarray): shape (n,). model parameters
    b (scalar): model parameter (bias)

    Returns:
    dj_dw (ndarray): shape (n,). x.T @ (prediction - y) divided by m
    dj_db (scalar): mean of (prediction - y)
    '''
    n_examples = x.shape[0]
    residual = sigmoid(x @ w + b) - y

    dj_dw = (x.T @ residual) / n_examples
    dj_db = np.mean(residual)
    return dj_dw, dj_db

In [None]:
def gradient_dec(x, y, w, b, alpha=0.01, iters=1000, show_process=True):
    """
    Gradient descent for logistic regression.

    The initial weights are copied, so the array passed in by the caller is
    left unchanged.

    Args:
        x (ndarray): shape (m, n) feature matrix
        y (ndarray): shape (m,) target labels
        w (ndarray): shape (n,) initial weights (not modified)
        b (float): initial bias
        alpha (float): learning rate
        iters (int): number of iterations
        show_process (bool): whether to print progress

    Returns:
        w (ndarray): learned weights
        b (float): learned bias
        cost_history (list): cost per iteration, each evaluated before that
            iteration's parameter update
    """
    # Private float copy: the in-place updates below would otherwise overwrite
    # the caller's array, and would raise a casting error on an integer array.
    w = np.array(w, dtype=float)
    b = float(b)

    print_count = max(1, iters // 10)
    cost_history = []

    for i in range(iters):
        dj_dw, dj_db = gradient_calc(x, y, w, b)   # gradient
        c = cost(x, y, w, b)                      # cost at the current parameters
        cost_history.append(c)

        # update parameters
        w -= alpha * dj_dw
        b -= alpha * dj_db

        # optional printing; w and b shown here are the values after the update
        if show_process and i % print_count == 0:
            print(f"Iteration {i}: cost = {c:.4f}, w = {w}, b = {b:.4f}")

    return w, b, cost_history

In [None]:
# Hyper-parameters of the run.
alpha = 1
iters = 1000

# Start from all-zero parameters (one weight per feature column) and fit on
# the standardised training features; the cost history is discarded.
w = np.zeros(3)
b = 0
w, b, _ = gradient_dec(x_norm, y_train, w, b, alpha=alpha, iters=iters)

In [None]:
# Express the fitted parameters on the original feature scale:
#   w . (x - mean) / std + b  ==  (w / std) . x + (b - sum(w * mean / std))
w_ori = w / x_std
b_ori = b - (w * x_mean / x_std).sum()
print(w_ori, b_ori)

In [None]:
# Cost on each split, evaluated on raw features with the unscaled parameters.
for split_name, features, labels in (
    ("train", x_train, y_train),
    ("test", x_test, y_test),
):
    print(f"{split_name} cost:", cost(features, labels, w_ori, b_ori))

In [None]:
# Model output, in percent, for one hand-written example. The values follow
# the feature order of X: Gender (1 = Female), Age, Salary.
sample = np.array([1, 66, 4000])
purchase_probability = sigmoid(sample @ w_ori + b_ori)
purchase_probability * 100

In [None]:
# Predicted vs. actual class counts on the training split.
# Fix: the actual counts previously used the full label vector `y`, while the
# predictions cover only x_train, so the two pairs of numbers were not
# comparable. Both now refer to the training split.
train_pred = predict(x_train, w_ori, b_ori)  # computed once and reused

predicted_true = (train_pred == 1).sum()
predicted_false = (train_pred == 0).sum()

actual_true = (y_train == 1).sum()
actual_false = (y_train == 0).sum()
print(predicted_true, predicted_false, actual_true, actual_false)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the model's predictions on the training split.
y_pred = predict(x_train, w_ori, b_ori)
cm = confusion_matrix(y_train, y_pred)

# Annotated heatmap: rows are actual classes, columns are predicted classes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["Pred 0", "Pred 1"],
    yticklabels=["Actual 0", "Actual 1"],
    ax=ax,
)
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision and recall of the model's predictions on the training split.
y_pred = predict(x_train, w_ori, b_ori)

precision, recall = (
    score(y_train, y_pred) for score in (precision_score, recall_score)
)

print("Precision:", precision)
print("Recall:", recall)
