Practical Problem

Question 1

In [10]:
import pandas as pd
import numpy as np
from math import log2

# Step 1: Load the training set and keep only the rows that carry a RiskLevel
# label -- the entropy below is computed over that column.
data = pd.read_csv('training_dataset.csv').dropna(subset=['RiskLevel'])

# Function to compute entropy (used in Step 2, 3, 4)
def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Parameters
    ----------
    labels : iterable of hashable
        One class label per sample (list, tuple, pandas Series, generator, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each distinct
        label. ``0.0`` for an empty or single-class collection.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # One pass over the labels instead of one labels.count() scan per class.
    counts = Counter(labels)
    total = sum(counts.values())
    if total == 0:
        return 0.0

    bits = -sum((c / total) * log2(c / total) for c in counts.values())
    # A pure group sums to 0.0 and the negation yields -0.0, which prints as
    # "-0.0000". Adding 0.0 normalizes the sign and leaves non-zero values as is.
    return bits + 0.0

# Step 2: Compute total entropy H(S)
# A row without a CreditScore satisfies neither `<=` nor `>` in Step 3, so it
# would count towards H(S) and n while belonging to no group. Restrict every
# quantity to the rows that can actually be split.
split_data = data.dropna(subset=['CreditScore'])
n = len(split_data)
if n == 0:
    raise ValueError("No rows with a CreditScore value; cannot evaluate the split.")

total_entropy = entropy(split_data['RiskLevel'].tolist())


# Step 3: Split data at CreditScore = 650
split_value = 650
lower_group = split_data[split_data['CreditScore'] <= split_value]  # S1
upper_group = split_data[split_data['CreditScore'] > split_value]   # S2

# Step 4: Compute entropy of each group
entropy_lower = entropy(lower_group['RiskLevel'].tolist())  # H(S1)
entropy_upper = entropy(upper_group['RiskLevel'].tolist())  # H(S2)

# Step 4 (continued): Weighted average entropy after split.
# The weights |S1|/|S| and |S2|/|S| sum to 1 because S1 and S2 partition split_data.
weighted_entropy = (len(lower_group) / n) * entropy_lower + (len(upper_group) / n) * entropy_upper

# Step 5: Compute Information Gain
info_gain = total_entropy - weighted_entropy

# Step 6: Output results
print(f"Total Entropy before split: {total_entropy:.4f}")                         # Step 2
print(f"Entropy (CreditScore <= {split_value}): {entropy_lower:.4f}")            # Step 4
print(f"Entropy (CreditScore > {split_value}): {entropy_upper:.4f}")             # Step 4
print(f"Weighted Entropy after split: {weighted_entropy:.4f}")                   # Step 4
print(f"Information Gain for split at {split_value}: {info_gain:.4f}")           # Step 5

Total Entropy before split: 1.0000
Entropy (CreditScore <= 650): -0.0000
Entropy (CreditScore > 650): -0.0000
Weighted Entropy after split: -0.0000
Information Gain for split at 650: 1.0000


Question 2

In [11]:
import pandas as pd

# Step 1: Load the training data and discard rows whose target (CreditScore) is missing
df = pd.read_csv('training_dataset.csv').dropna(subset=['CreditScore'])

# Step 2: Partition on Age at the threshold -- left takes Age <= 35, right takes Age > 35
split_value = 35
left_group = df.loc[df['Age'].le(split_value)]
right_group = df.loc[df['Age'].gt(split_value)]

# Step 3: Define variance function
def variance(values):
    """Population variance of `values`; an empty input is defined as 0.

    `values` must support ``len()`` and expose ``.var(ddof=...)``
    (the notebook passes pandas Series).
    """
    # ddof=0 divides by N rather than N - 1.
    return values.var(ddof=0) if len(values) else 0

# Step 4: Calculate total variance before the split
# A row with a missing Age satisfies neither `<=` nor `>` and so sits in no
# group. Measure the "before" variance and the weights over the rows that were
# actually split, so that the two group weights sum to 1.
split_scores = df.loc[df['Age'].notna(), 'CreditScore']
n = len(split_scores)
if n == 0:
    raise ValueError("No rows with both Age and CreditScore; cannot evaluate the split.")
total_variance = variance(split_scores)

# Step 5: Calculate weighted variance after split
left_var = variance(left_group['CreditScore'])
right_var = variance(right_group['CreditScore'])

weighted_var = (len(left_group)/n) * left_var + (len(right_group)/n) * right_var

# Step 6: Calculate variance reduction
var_reduction = total_variance - weighted_var

# Step 7: Output results
print(f"Total Variance before split: {total_variance:.2f}")
print(f"Variance (Age <= {split_value}): {left_var:.2f}")
print(f"Variance (Age > {split_value}): {right_var:.2f}")
print(f"Weighted Variance after split: {weighted_var:.2f}")
print(f"Variance Reduction: {var_reduction:.2f}")

Total Variance before split: 3575.00
Variance (Age <= 35): 1576.00
Variance (Age > 35): 822.22
Weighted Variance after split: 1293.33
Variance Reduction: 2281.67


Question 3

In [12]:
import pandas as pd

# Steps 1-2: Read both datasets; training rows without a RiskLevel are discarded
train = pd.read_csv("training_dataset.csv").dropna(subset=["RiskLevel"])
test = pd.read_csv("test_dataset.csv")

# Step 3: Take the first test row whose ID is 'T2' and pull out the two
# features the similarity check compares against
t2 = test.loc[test['ID'] == 'T2'].iloc[0]
t2_age, t2_credit = t2['Age'], t2['CreditScore']

# Step 4: Define similarity based on Age ±3 and CreditScore ±20
def is_similar(row, age_tol=3, credit_tol=20):
    """Tell whether `row` lies within both tolerances of the T2 test case.

    Reads the module-level `t2_age` and `t2_credit` at call time. `row` must be
    indexable by 'Age' and 'CreditScore'. Both bounds are inclusive.
    """
    age_gap = abs(row['Age'] - t2_age)
    # The credit comparison is only evaluated when the age check passes.
    return age_gap <= age_tol and abs(row['CreditScore'] - t2_credit) <= credit_tol

# Step 5: Select similar training rows
similar_rows = train[train.apply(is_similar, axis=1)]

# Step 6: Calculate proportions of RiskLevel among the similar rows
risk_counts = similar_rows['RiskLevel'].value_counts(normalize=True)

# Step 7: Output estimated probabilities
if similar_rows.empty:
    # With no similar rows there is nothing to estimate from. Report NaN rather
    # than claiming a probability of 0 for both classes.
    p_high = p_low = float('nan')
else:
    # A class absent from the neighbourhood has an estimated probability of 0.
    p_high = risk_counts.get('High', 0)
    p_low = risk_counts.get('Low', 0)

print(f"Number of similar training samples: {len(similar_rows)}")
print(f"Probability of High Risk for T2: {p_high:.2f}")
print(f"Probability of Low Risk for T2: {p_low:.2f}")

Number of similar training samples: 3
Probability of High Risk for T2: 1.00
Probability of Low Risk for T2: 0.00


Question 4

In [13]:
import pandas as pd
import numpy as np

# Step 1: Load training data
df = pd.read_csv('training_dataset.csv')

# Step 2: Remove rows with a missing feature (Age) or target (CreditScore).
# A single NaN in either column would propagate through the predictions and
# turn the cost, both gradients and both updated parameters into NaN.
df = df.dropna(subset=['Age', 'CreditScore'])

# Step 3: Extract feature and target
X = df['Age'].values
y = df['CreditScore'].values
n = len(X)
if n == 0:
    raise ValueError("No rows with both Age and CreditScore; cannot run a gradient step.")

# Step 4: Initialize parameters
theta_0 = 500   # intercept
theta_1 = 5     # slope on Age
alpha = 0.01    # learning rate

# Step 5: Compute predictions of the linear model theta_0 + theta_1 * Age
y_pred = theta_0 + theta_1 * X
residuals = y_pred - y

# Step 6: Compute cost function (Mean Squared Error)
cost = np.mean(residuals ** 2)

# Step 7: Compute gradients of the MSE with respect to theta_0 and theta_1
grad_0 = (2/n) * np.sum(residuals)
grad_1 = (2/n) * np.sum(residuals * X)

# Step 8: Update parameters (simultaneous update: both gradients were computed
# from the same predictions before either parameter changes)
theta_0_new = theta_0 - alpha * grad_0
theta_1_new = theta_1 - alpha * grad_1

# Step 9: Output results
print(f"Initial Cost: {cost:.2f}")
print(f"Gradient θ₀: {grad_0:.2f}, θ₁: {grad_1:.2f}")
print(f"Updated θ₀: {theta_0_new:.2f}")
print(f"Updated θ₁: {theta_1_new:.2f}")

Initial Cost: 878.12
Gradient θ₀: -1.25, θ₁: -263.75
Updated θ₀: 500.01
Updated θ₁: 7.64
