In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import Binarizer, LabelEncoder

In [None]:
# Load the data.
# The CSV location can be overridden with the MCDONALDS_CSV environment
# variable so the notebook is runnable on machines other than the author's;
# when the variable is unset the original hard-coded location is used.
DEFAULT_MCDONALDS_CSV = r"C:\Users\jadha\Downloads\McDonalds Case Study-20240819T164052Z-001\McDonalds Case Study\mcdonalds.csv"
mcdonalds = pd.read_csv(os.environ.get("MCDONALDS_CSV", DEFAULT_MCDONALDS_CSV))

In [None]:
# Display the first 3 rows of the dataframe.
# Being the cell's last expression, the result is rendered by the notebook
# without an explicit print().
mcdonalds.head(3)

In [None]:
# Schema overview: list of column labels first ...
column_names = mcdonalds.columns.tolist()
print(column_names)

# ... then the (rows, columns) shape of the frame
n_rows, n_cols = mcdonalds.shape
print((n_rows, n_cols))

In [None]:
# Take the first eleven columns and encode them as 0/1 indicators:
# 1 where the cell equals the string "Yes", 0 for every other value.
MD_x = mcdonalds.iloc[:, :11]
MD_x_binary = MD_x.eq("Yes").astype(int)

In [None]:
# Share of 1-entries per indicator column, rounded to two decimals
column_means = MD_x_binary.mean(axis=0).round(decimals=2)
print(column_means)

In [None]:
# Principal Component Analysis on the 0/1 indicator matrix.
# n_components=None (the default) keeps every component; MD_pca holds the
# observations projected onto those components.
pca = PCA(n_components=None)
MD_pca = pca.fit_transform(X=MD_x_binary)

In [None]:
# Quantities for the PCA summary (printed in the following cells):
# per-component share of variance and its running total.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance)

In [None]:
# Standard deviation of each principal component.
# pca.singular_values_ holds the singular values of the fitted data, which are
# not standard deviations; the per-component standard deviation is the square
# root of pca.explained_variance_.
pc_standard_deviations = np.sqrt(pca.explained_variance_)
print("Standard deviation of principal components:", pc_standard_deviations)

In [None]:
# Share of total variance captured by each principal component
# (pca.explained_variance_ratio_, stored in explained_variance above).
print("Proportion of Variance:",explained_variance)

In [None]:
# Running total of the per-component variance shares.
print("Cumulative Proportion:",cumulative_explained_variance)

In [None]:
# Loadings table: one row per attribute, one column per principal component.
# The table contains every component, so the heading says so (it previously
# claimed to show only the first two).
loadings = pca.components_.T
pc_names = [f'PC{i+1}' for i in range(loadings.shape[1])]
loadings_table = pd.DataFrame(loadings, columns=pc_names, index=MD_x_binary.columns)
print("Factor loadings (all principal components):")
print(loadings_table.round(1))

In [None]:
# Scatter of the observations in the plane of the first two principal components
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(MD_pca[:, 0], MD_pca[:, 1], c='grey', edgecolor='k', alpha=0.7)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Projection')
ax.grid(True)
plt.show()

In [None]:
# To simulate projAxes functionality: draw every original attribute as an
# arrow in the PC1/PC2 plane. The arrow tip for an attribute is
# (its loading on PC1, its loading on PC2), which is what axes labelled
# "Principal Component 1/2" require. The previous version drew the PC1 and PC2
# rows using their entries for the first two attributes, i.e. coordinates in
# attribute space, not in PC space.
fig, ax = plt.subplots(figsize=(8, 6))
pc1_loadings = pca.components_[0]
pc2_loadings = pca.components_[1]
for attribute, x_tip, y_tip in zip(MD_x_binary.columns, pc1_loadings, pc2_loadings):
    ax.quiver(0, 0, x_tip, y_tip, angles='xy', scale_units='xy', scale=1, color='r', width=0.004)
    # Label the arrow tip so each attribute can be identified
    ax.annotate(attribute, (x_tip, y_tip), textcoords='offset points', xytext=(3, 3), fontsize=9)
ax.set_xlim(-1, 1)
ax.set_ylim(-1, 1)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title('PCA Projection with Component Axes')
ax.axhline(0, color='grey', linestyle='--')
ax.axvline(0, color='grey', linestyle='--')
ax.grid(True)
plt.show()

In [None]:
# Define range of clusters
cluster_range = range(2, 9)  # 2 to 8 clusters
inertia = []

# Perform k-means clustering for different numbers of clusters.
# n_init is set explicitly: its default differs between scikit-learn releases,
# so leaving it implicit makes the inertia curve depend on the installed
# version (and triggers a FutureWarning on some of them).
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=1234)
    kmeans.fit(MD_x_binary)
    inertia.append(kmeans.inertia_)

In [None]:
# Perform k-means clustering with a specified number of clusters
n_clusters = 3  # Choose a number of clusters, e.g., 3
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; 10 restarts keeps the result version-independent.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=1234)
kmeans.fit(MD_x_binary)

In [None]:
# Relabel clusters to start from 0
def relabel_clusters(labels):
    """Return `labels` re-encoded by a freshly fitted LabelEncoder.

    Parameters
    ----------
    labels : array-like
        Cluster assignments, one entry per observation.

    Returns
    -------
    The result of ``LabelEncoder().fit_transform(labels)``.
    """
    return LabelEncoder().fit_transform(labels)

In [None]:
# Encode the k-means assignments ...
relabelled_labels = relabel_clusters(kmeans.labels_)
# ... store them on the raw frame as the 'Cluster' column ...
mcdonalds['Cluster'] = relabelled_labels
# ... and preview the first five rows including the new column
print(mcdonalds.head(n=5))

In [None]:
# Inertia curve over the tested cluster counts
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster_range, inertia, marker='o', linestyle='-', color='b')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia (Within-cluster Sum of Squares)')
ax.set_title('K-Means Clustering Inertia for Different Numbers of Clusters')
ax.grid(True)
ax.set_xticks(cluster_range)
plt.show()

In [None]:
# Function to perform Gaussian Mixture Model clustering and return results
def fit_gmm_models(X, cluster_range, nrep, random_state=1234):
    """Fit `nrep` Gaussian mixture models for every requested component count.

    Each repetition receives its own seed (``random_state + repetition index``).
    With one shared seed, as before, all `nrep` fits for a given component
    count were identical, so the repetitions added nothing. Repetition 0 still
    uses `random_state` itself, so the first model per component count is
    seeded exactly as it was previously.

    Parameters
    ----------
    X : array-like
        Data passed unchanged to ``GaussianMixture.fit``.
    cluster_range : iterable of int
        Numbers of mixture components to try.
    nrep : int
        Number of differently seeded fits per component count.
    random_state : int or None, default 1234
        Base seed. ``None`` is forwarded as-is to every model.

    Returns
    -------
    dict
        Maps each component count to a list of `nrep` fitted models.
    """
    models = {}
    for n_clusters in cluster_range:
        fitted = []
        for rep in range(nrep):
            seed = None if random_state is None else random_state + rep
            gmm = GaussianMixture(n_components=n_clusters, random_state=seed)
            gmm.fit(X)
            fitted.append(gmm)
        models[n_clusters] = fitted
    return models

In [None]:
# Fit the mixture models for 2 to 8 components, 10 repetitions each
cluster_range = range(2, 9)
nrep = 10
gmm_models = fit_gmm_models(MD_x_binary, cluster_range, nrep)

# Pick the first fitted model with 4 components as the example
n_clusters_example = 4
example_gmm = gmm_models[n_clusters_example][0]

# Dump the fitted parameters of the example model, heading followed by values
print(f"Gaussian Mixture Model with {n_clusters_example} clusters:")
for heading, values in (
    ("Means:", example_gmm.means_),
    ("Covariances:", example_gmm.covariances_),
    ("Weights:", example_gmm.weights_),
):
    print(heading)
    print(values)

In [None]:
# Predict clusters for the original data with the example model.
# `labels` holds the return value of example_gmm.predict on the same 0/1
# matrix the models were fitted on.
labels = example_gmm.predict(MD_x_binary)

In [None]:
# Relabel clusters to start from 0.
# relabel_clusters is already defined in an earlier cell of this notebook; the
# identical second definition that used to sit here only shadowed it and has
# been dropped.
relabelled_labels = relabel_clusters(labels)

# Add the cluster labels to the original dataframe.
# NOTE: this overwrites the 'Cluster' column that an earlier cell filled with
# the k-means assignments; from here on 'Cluster' means the GMM segments.
mcdonalds['Cluster'] = relabelled_labels

# Display the first few rows of the dataframe with cluster labels
print(mcdonalds.head())

In [None]:
# Plot the segments in the PC1/PC2 plane, one scatter call per cluster.
# The previous version plotted *all* rows once per cluster on the first two
# 0/1 attributes: every respondent landed on one of four points and each pass
# painted over the last, so the memberships were not visible.
fig, ax = plt.subplots(figsize=(8, 6))
for i in range(n_clusters_example):
    in_cluster = relabelled_labels == i  # boolean mask for this segment
    ax.scatter(MD_pca[in_cluster, 0], MD_pca[in_cluster, 1], alpha=0.7, label=f'Cluster {i}')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_title(f'Cluster Plot for {n_clusters_example} Clusters')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Restrict to numeric columns, then group once by segment and reuse the grouping
numeric_data = mcdonalds.select_dtypes(include=[np.number])
by_cluster = numeric_data.groupby('Cluster')

# Profiling Segments: per-cluster mean of every numeric column
segment_profiles = by_cluster.mean()
print("Segment Profiles:")
print(segment_profiles)

# Describing Segments: per-cluster summary statistics
segment_descriptions = by_cluster.describe()
print("Segment Descriptions:")
print(segment_descriptions)


In [None]:
# Ensure all columns in MD_x_binary are numeric
print(MD_x_binary.dtypes)

# Selecting Target Variable
target_variable = 'Like.n'
print(f"Target Variable: {target_variable}")

# Customizing Market Mix
# Ensure only numeric columns are used for market mix customization
numeric_columns = mcdonalds.select_dtypes(include=[np.number]).columns
market_mix = mcdonalds[numeric_columns + ['Cluster']].groupby('Cluster').mean()
print("Market Mix Customization:")
print(market_mix)

In [None]:
# Fit regression model for each cluster.
#
# The original formula used 'Like.n' as the response, but no such column is
# created in this notebook and the formula parser reads `Like.n` as attribute
# access rather than a column name. A numeric response `Like_n` is derived
# here instead.
# NOTE(review): assumes the raw frame has a 'Like' column whose values are
# either numeric already or strings ending in a signed integer
# (e.g. '+2', '-3', 'I love it!+5') -- TODO confirm against the CSV.
like_raw = mcdonalds['Like']
if pd.api.types.is_numeric_dtype(like_raw):
    like_numeric = like_raw
else:
    like_numeric = pd.to_numeric(
        like_raw.astype(str).str.extract(r'([+-]?\d+)\s*$', expand=False),
        errors='coerce',
    )

# Regress on the 0/1 indicators (same column names as MD_x) and drop rows
# whose rating could not be parsed.
regression_data = (
    MD_x_binary
    .assign(Like_n=like_numeric, Cluster=mcdonalds['Cluster'])
    .dropna(subset=['Like_n'])
)

regression_results = {}
formula = 'Like_n ~ ' + ' + '.join(MD_x.columns)
for cluster in np.unique(regression_data['Cluster']):
    cluster_data = regression_data[regression_data['Cluster'] == cluster]
    model = smf.ols(formula=formula, data=cluster_data).fit()
    regression_results[cluster] = model.summary()


In [None]:
# Print each stored summary under a heading naming its cluster
for cluster in regression_results:
    print(f"Cluster {cluster} Regression Summary:")
    print(regression_results[cluster])


In [None]:
# Evaluation and Monitoring
# Both scores are computed on the 0/1 matrix with the current
# relabelled_labels as the partition.

# Silhouette score
silhouette_avg = silhouette_score(X=MD_x_binary, labels=relabelled_labels)
print(f"Silhouette Score: {silhouette_avg}")

# Calinski-Harabasz score
calinski_harabasz_avg = calinski_harabasz_score(X=MD_x_binary, labels=relabelled_labels)
print(f"Calinski-Harabasz Score: {calinski_harabasz_avg}")

# Hierarchical clustering and ordering of the attributes.
# The matrix is transposed so the attributes (columns) are clustered, not the
# respondents. The leaves are labelled with the attribute names; without
# `labels` the x-axis, titled 'Features', showed only integer positions.
order = sch.linkage(MD_x_binary.T, method='ward')
plt.figure(figsize=(8, 6))
dendrogram = sch.dendrogram(order, labels=MD_x_binary.columns.tolist(), leaf_rotation=90)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Features')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.show()