# Importing Libraries.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load dataset
df = pd.read_csv('System Data.csv')
df.head()


Unnamed: 0,Timestamp,Age,Gender,Monthly Income,Region,Frequency of Shopping In Years,Average spending,Categories,Means of Payment,Entrolled on Jumia Prime or any loyalty program,Frequency of shopping,Rate of Satisfaction,Rate of availability of products,Reason for your purchase,Device to shop,nternet connection used,Recommendation to others
0,10/10/2024 12:46,25-34,Male,"<450,000",Central,"Daily, A few times a year","<50,000",Electronics,"Mobile Money, Cash on Delivery",No,Rarely,3,2,"Convenience, Delivery speed","Smart phones, Laptops","Mobile data, Public Wi-Fi",Yes
1,10/10/2024 14:56,25-34,Male,,Northern,A few times a year,"50,000-100,000",Electronics,Cash on Delivery,No,Rarely,5,3,Product variety,Smart phones,Home Wi-Fi,No
2,10/10/2024 15:01,25-34,Male,">2,000,000",Western,A few times a year,"50,000-100,000",Electronics,"Mobile Money, Cash on Delivery",No,Rarely,3,3,"Price, Convenience, Discounts/offers",Smart phones,Mobile data,Yes
3,10/10/2024 15:03,35-44,Male,"450,000-1,000,000",Central,A few times a year,"<50,000",Health & Beauty,Cash on Delivery,No,Rarely,1,3,Price,Smart phones,Mobile data,Yes
4,10/10/2024 15:04,25-34,Male,,Northern,A few times a year,"50,000-100,000",Electronics,Mobile Money,No,Occassionally,3,3,Convenience,Smart phones,Mobile data,Yes


# Data Cleaning

In [3]:
df.columns = df.columns.str.strip()
df = df.drop(columns=['Timestamp'])  # Drop Timestamp as it’s not relevant for segmentation


In [4]:
# Define categorical and numerical columns
categorical_cols = ['Age', 'Gender', 'Monthly Income', 'Region', 'Frequency of Shopping', 
                    'Average spending', 'Categories', 'Means of Payment', 
                    'Entrolled on Jumia Prime or any loyalty program', 'Frequency of shopping', 
                    'Reason for your purchase', 'Device to shop', 'nternet connection used', 
                    'Recommendation to others']
numerical_cols = ['Rate of Satisfaction', 'Rate of availability of products']

In [5]:
# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))  # Convert to string to handle mixed types
    label_encoders[col] = le  # Save encoders for later use

KeyError: 'Frequency of Shopping'

In [None]:
# Combine features
X = df[categorical_cols + numerical_cols]

In [None]:
# Standardize the data (crucial for K-Means)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
print("Preprocessed data shape:", X_scaled.shape)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Calculate inertia for different k values
inertia = []
K = range(1, 10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Applying K-Means

In [None]:
# Apply K-Means with chosen k
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to the original dataframe
df['Cluster'] = clusters

# Display the first few rows with cluster assignments
print("Data with cluster labels:")
print(df.head())

# Analyze the Clusters

In [None]:
# Group by cluster and calculate means for numerical features
print("Cluster Characteristics (Numerical Features):")
print(df.groupby('Cluster')[numerical_cols].mean())

# Analyze categorical features (e.g., mode or counts)
for col in categorical_cols:
    print(f"\nCluster breakdown for {col}:")
    print(df.groupby('Cluster')[col].value_counts(normalize=True))

# Visualize the Clusters

In [None]:
import seaborn as sns

# Scatter plot
sns.scatterplot(x=df['Rate of Satisfaction'], y=df['Average spending'], hue=df['Cluster'], palette='deep')
plt.xlabel('Rate of Satisfaction')
plt.ylabel('Average Spending')
plt.title('Customer Segments')
plt.show()

In [None]:
import joblib

# Save the K-Means model, scaler, and label encoders
joblib.dump(kmeans, 'kmeans_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

print("K-Means model, scaler, and encoders saved successfully.")

In [None]:
from sklearn.cluster import KMeans

# Train the K-Means model with K=4
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(df)

# Display first few rows with cluster assignments
print(df[['Cluster']].head())


In [None]:
import joblib

# Load the saved files
kmeans = joblib.load('kmeans_model.pkl')
scaler = joblib.load('scaler.pkl')
label_encoders = joblib.load('label_encoders.pkl')

# Verify contents
print("K-Means clusters:", kmeans.n_clusters)
print("Scaler mean (first few values):", scaler.mean_[:5])
print("Label encoders keys:", list(label_encoders.keys()))

# Step 2: Split Data into Training, Validation, and Test Sets
# Split the data into:

# Training (70%): For model training.

# Validation (15%): For hyperparameter tuning.

# Test (15%): For final evaluation.

