In [6]:
import numpy as np
import pandas as pd

# data: one row per observation; 'Spend' is the response, the rest are predictors
data = {
    'Age': [23, 31, 19, 28, 35, 22, 29],
    'Tickets': [5, 2, 4, 7, 1, 6, 3],
    'Time': [43, 15, 57, 62, 11, 39, 20],
    'Spend': [380, 120, 265, 490, 95, 320, 200]
}
df = pd.DataFrame(data)

# Part (a): Standardization & Regression
print("PART (a): STANDARDIZATION & REGRESSION ")

# Standardize (z-score) every column. pandas' std() defaults to the sample
# standard deviation (ddof=1). The same means/stds are reused further down to
# map the coefficients back to the original units, so compute them once.
means = df.mean()
stds = df.std()
df_std = (df - means) / stds
print("Standardized Data:")
print(df_std.round(3))

# Fit regression: design matrix = intercept column + standardized predictors
predictors = ['Age', 'Tickets', 'Time']
X = np.column_stack([np.ones(len(df)), df_std[predictors].values])
y = df_std['Spend'].values.reshape(-1, 1)
# Solve the least-squares problem directly instead of forming inv(X'X):
# explicitly inverting the normal-equations matrix squares the condition
# number and fails outright when predictors are collinear. lstsq returns the
# same (4, 1) coefficient column for well-conditioned data.
beta_std, *_ = np.linalg.lstsq(X, y, rcond=None)

print("\nStandardized Coefficients:")
print(f"β₀ (Intercept): {beta_std[0][0]:.3f}")
print(f"β₁ (Age): {beta_std[1][0]:.3f}")
print(f"β₂ (Tickets): {beta_std[2][0]:.3f}")
print(f"β₃ (Time): {beta_std[3][0]:.3f}")

# Convert to original scale.
#   slope_j   = beta_std_j * sd(Spend) / sd(x_j)
#   intercept = mean(Spend) + sd(Spend) * beta_std_0 - sum_j slope_j * mean(x_j)
# The beta_std_0 term is ~0 here because every column is centred, but keeping
# it makes the back-transform exact rather than relying on that.
beta_original = beta_std.flatten()
beta_original[1:] = beta_original[1:] * (stds['Spend'] / stds[predictors].values)
beta_original[0] = (means['Spend']
                    + stds['Spend'] * beta_std[0, 0]
                    - np.sum(beta_original[1:] * means[predictors].values))

print("\nOriginal Scale Coefficients:")
print(f"Intercept: {beta_original[0]:.3f}")
print(f"Age: {beta_original[1]:.3f}")
print(f"Tickets: {beta_original[2]:.3f}")
print(f"Time: {beta_original[3]:.3f}")



PART (a): STANDARDIZATION & REGRESSION 
Standardized Data:
     Age  Tickets   Time  Spend
0 -0.661    0.463  0.378  0.795
1  0.763   -0.926 -0.995 -1.036
2 -1.373    0.000  1.065 -0.015
3  0.229    1.389  1.310  1.569
4  1.475   -1.389 -1.191 -1.212
5 -0.839    0.926  0.182  0.372
6  0.407   -0.463 -0.750 -0.473

Standardized Coefficients:
β₀ (Intercept): -0.000
β₁ (Age): 0.212
β₂ (Tickets): 0.776
β₃ (Time): 0.370

Original Scale Coefficients:
Intercept: -171.332
Age: 5.366
Tickets: 51.016
Time: 2.580


In [7]:
# Part (c): Prediction -- plug a new visitor into the original-scale model
print("\n PART (c): PREDICTION ")
new_age, new_tickets, new_time = 30, 4, 50
# Start from the intercept and add each slope * value term in coefficient order.
spend_pred = beta_original[0]
for coef, value in zip(beta_original[1:], (new_age, new_tickets, new_time)):
    spend_pred = spend_pred + coef * value
print(f"Predicted Spend for Age={new_age}, Tickets={new_tickets}, Time={new_time}: {spend_pred:.2f}")

# Association metrics for the rule  Time > 40  ->  Spend > 300
print("\n ASSOCIATION METRICS ")
time_gt_40 = df['Time'] > 40
spend_gt_300 = df['Spend'] > 300
both_conditions = time_gt_40 & spend_gt_300

# The mean of a boolean mask is the fraction of rows where it is True.
support_time = time_gt_40.mean()
support_spend = spend_gt_300.mean()
support_both = both_conditions.mean()

# confidence = P(consequent | antecedent); lift = confidence / P(consequent)
confidence = support_both / support_time
lift = support_both / (support_time * support_spend)

print(f"Rule: Time > 40 → Spend > 300")
print(f"Support(Time>40) = {support_time:.3f}")
print(f"Support(Spend>300) = {support_spend:.3f}")
print(f"Support(Both) = {support_both:.3f}")
print(f"Confidence = {confidence:.3f} ({confidence*100:.1f}%)")
print(f"Lift = {lift:.3f}")

# Classification rule: a row is 'Weekend' when either threshold is exceeded
print("\n DAYTYPE CLASSIFICATION ")
is_weekend = (df['Time'] > 50) | (df['Tickets'] > 5)
df['DayType'] = np.where(is_weekend, 'Weekend', 'Weekday')
print("Simple Rule: IF Time>50 OR Tickets>5 THEN Weekend ELSE Weekday")
print(df[['Age', 'Tickets', 'Time', 'Spend', 'DayType']])


 PART (c): PREDICTION 
Predicted Spend for Age=30, Tickets=4, Time=50: 322.74

 ASSOCIATION METRICS 
Rule: Time > 40 → Spend > 300
Support(Time>40) = 0.429
Support(Spend>300) = 0.429
Support(Both) = 0.286
Confidence = 0.667 (66.7%)
Lift = 1.556

 DAYTYPE CLASSIFICATION 
Simple Rule: IF Time>50 OR Tickets>5 THEN Weekend ELSE Weekday
   Age  Tickets  Time  Spend  DayType
0   23        5    43    380  Weekday
1   31        2    15    120  Weekday
2   19        4    57    265  Weekend
3   28        7    62    490  Weekend
4   35        1    11     95  Weekday
5   22        6    39    320  Weekend
6   29        3    20    200  Weekday


### QUESTION TWO


In [9]:
print("QUESTION 2: SALES & CUSTOMER SEGMENTATION")

# Part (a): Multidimensional Cube -- a Region x Month slice of sales figures
print("\n(a) MULTIDIMENSIONAL CUBE")

regions = ['North', 'South', 'East']
monthly_sales = {
    'Jan': [120, 95, 140],
    'Feb': [135, 110, 150],
    'Mar': [128, 102, 145],
}
sales_data = {'Region': regions, **monthly_sales}

# Index by region so a single cell can be addressed as (region, month).
sales_df = pd.DataFrame(sales_data)
sales_df = sales_df.set_index('Region')
print("Monthly Sales Data (KSh '000):")
print(sales_df)

feb_east = sales_df.loc['East', 'Feb']
print(f"\nFeb sales for East: {feb_east} KSh '000")


QUESTION 2: SALES & CUSTOMER SEGMENTATION

(a) MULTIDIMENSIONAL CUBE
Monthly Sales Data (KSh '000):
        Jan  Feb  Mar
Region               
North   120  135  128
South    95  110  102
East    140  150  145

Feb sales for East: 150 KSh '000


In [11]:
# Part (b): Normalization -- build the raw customer table that gets rescaled
print("\n\n(b) DATA NORMALIZATION")

segmentation_data = dict(
    Age=[25, 32, 22, 28],
    Income=[50, 70, 45, 60],
    PurchaseFrequency=[5, 3, 6, 4],
)
seg_df = pd.DataFrame(segmentation_data)

# Min-Max Normalization
def min_max_normalize(df):
    """Rescale each column of ``df`` linearly so its values span [0, 1].

    Every value is mapped to ``(x - column_min) / (column_max - column_min)``.

    A constant column has ``column_max == column_min``; the plain formula would
    give 0/0 = NaN for every row. Such columns are returned as all 0.0 instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data to rescale. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # (col_range == 0) is 1 exactly where the range is zero, so adding it bumps
    # a zero divisor up to 1 and leaves every other divisor unchanged.
    safe_range = col_range + (col_range == 0)
    return (df - col_min) / safe_range

# Show the raw table first, then its rescaled counterpart for comparison.
print("Customer Segmentation Data (Original):")
print(seg_df)

seg_norm = min_max_normalize(seg_df)
print("\nMin-Max Normalized Data:")
print(seg_norm.round(3))



(b) DATA NORMALIZATION
Customer Segmentation Data (Original):
   Age  Income  PurchaseFrequency
0   25      50                  5
1   32      70                  3
2   22      45                  6
3   28      60                  4

Min-Max Normalized Data:
   Age  Income  PurchaseFrequency
0  0.3     0.2              0.667
1  1.0     1.0              0.000
2  0.0     0.0              1.000
3  0.6     0.6              0.333


In [12]:
# Part (c): Clustering -- partition the normalized customers into two groups
print("\n\n(c) K-MEANS CLUSTERING (k=2)")

from sklearn.cluster import KMeans

# Fit on the normalized features; the fixed random_state keeps runs repeatable.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(seg_norm)
# Attach the labels to the original-scale table so both can be read together.
seg_df['Cluster'] = cluster_labels

print("Cluster Assignments:")
print(seg_df[['Age', 'Income', 'PurchaseFrequency', 'Cluster']])

# cluster_centers_ is in the normalized feature space the model was fitted on.
centroids_norm = pd.DataFrame(
    kmeans.cluster_centers_,
    columns=['Age_norm', 'Income_norm', 'Freq_norm'],
)
print(f"\nCluster Centroids:")
print(centroids_norm)



(c) K-MEANS CLUSTERING (k=2)
Cluster Assignments:
   Age  Income  PurchaseFrequency  Cluster
0   25      50                  5        1
1   32      70                  3        0
2   22      45                  6        1
3   28      60                  4        0

Cluster Centroids:
   Age_norm  Income_norm  Freq_norm
0      0.80          0.8   0.166667
1      0.15          0.1   0.833333


In [13]:
# Centroids in original units: average the raw features of each cluster's members
feature_cols = ['Age', 'Income', 'PurchaseFrequency']
cluster_0 = seg_df.loc[seg_df['Cluster'] == 0, feature_cols]
cluster_1 = seg_df.loc[seg_df['Cluster'] == 1, feature_cols]

print("\nActual Centroids (Original Scale):")
centroid_0 = cluster_0.mean().round(2).values
centroid_1 = cluster_1.mean().round(2).values
print("Cluster 0:", centroid_0)
print("Cluster 1:", centroid_1)


Actual Centroids (Original Scale):
Cluster 0: [30.  65.   3.5]
Cluster 1: [23.5 47.5  5.5]



# Part (d): Cross-Validation & Ensemble

1. Leave-One-Out (LOO):
   - Train on n-1 samples, test on 1, repeat n times
   - Low bias (each model trains on almost all the data) but high variance in the error estimate
   - Computationally expensive for large n

2. k-Fold Cross-Validation:
   - Split data into k equal folds
   - Train on k-1 folds, test on remaining fold
   - Better bias-variance tradeoff than LOO: slightly more bias, but lower variance and far less computation
   - Typically k=5 or k=10

Ensemble Methods Improve Accuracy By:
1. Reducing variance (Bagging): Average multiple models trained on bootstrap samples of the data
2. Reducing bias (Boosting): Train models sequentially, each one correcting the errors of the previous ones
3. Increasing diversity: Different models capture different patterns
