# Modeling

## Apply the K-means algorithm to find clusters of different customer types

In [None]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Encode on a copy so `df` keeps its original, human-readable values: the
# later cells that copy `df` to inspect cluster members and that map the raw
# 'Purchase_Frequency' strings both need the unencoded data.
encoded_df = df.copy()

# Identify categorical columns
categorical_cols = encoded_df.select_dtypes(include=['object']).columns

# Encode categorical features as integer codes (the encoder is refit per column).
# NOTE(review): this gives nominal categories an arbitrary numeric order that
# K-means then treats as a distance; consider one-hot encoding if that matters.
label_encoder = LabelEncoder()
for col in categorical_cols:
    encoded_df[col] = label_encoder.fit_transform(encoded_df[col].astype(str))

# Scale every numeric column. Selecting 'number' instead of listing
# int64/float64 keeps integer/float columns of any width, so freshly encoded
# columns cannot be dropped silently if their codes come back as e.g. int32.
numeric_cols = encoded_df.select_dtypes(include='number').columns
scaled_data = StandardScaler().fit_transform(encoded_df[numeric_cols])

# Elbow method: fit K-means for k = 1..9 and record the inertia of each fit
inertia = []
K = range(1, 10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow Method
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (Sum of Squared Distances)')
plt.title('Elbow Method')
plt.show()


In [None]:
from sklearn.metrics import silhouette_score
# Candidate cluster counts. The sweep starts at 2 because the silhouette
# score is not defined for a single cluster.
K = range(2, 10)

# One mean silhouette score per candidate k, in the same order as K.
silhouette_avg = []
for k in K:
    kmeans = KMeans(random_state=42, n_clusters=k)
    labels = kmeans.fit_predict(scaled_data)
    silhouette_avg.append(
        silhouette_score(scaled_data, labels)
    )

# Silhouette score against number of clusters (blue line with 'x' markers).
plt.plot(K, silhouette_avg, color='b', marker='x', linestyle='-')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score');

In [None]:
# Project the scaled features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaled_data)

# Cluster the 2-D projection (not the full scaled feature matrix).
kmeans = KMeans(random_state=42, n_clusters=3)
labels = kmeans.fit_predict(X_pca)

# The model was fitted on X_pca, so its centers are already PCA coordinates.
centroids_pca = kmeans.cluster_centers_

# Single 8x6 figure holding both the points and the centroids.
fig, ax = plt.subplots(figsize=(8, 6))

# Data points, coloured by their cluster label; X_pca.T unpacks into the
# first and second component columns.
scatter = ax.scatter(*X_pca.T, c=labels, s=60, cmap='viridis')
fig.colorbar(scatter, ax=ax, label='Cluster Label')

# Cluster centers drawn on top as large, semi-transparent black crosses.
ax.scatter(
    *centroids_pca.T,
    marker='X',
    s=200,
    c='black',
    alpha=0.6,
    label='Centroids',
)

# Titles, axis labels and legend.
ax.set_title('Learned Cluster Labels with Centroids', fontsize=14)
ax.set_xlabel('PCA Feature 1', fontsize=12)
ax.set_ylabel('PCA Feature 2', fontsize=12)
ax.legend()
plt.show()

In [None]:
# Attach each row's cluster label to a copy of the data, leaving `df` untouched.
# `labels` must hold one entry per row of `df`, in the same row order.
original_clustered_df = df.copy()
original_clustered_df['Cluster'] = labels

# Print up to 10 random rows per cluster. Iterating over the labels that are
# actually present (instead of a hard-coded range) keeps this correct if the
# number of clusters changes, and capping the sample size avoids the
# ValueError that sample(n=10) raises for a cluster with fewer than 10 rows.
for cluster, members in original_clustered_df.groupby('Cluster'):
    print(f"\n--- Cluster {cluster} Samples ---")
    print(members.sample(n=min(10, len(members)), random_state=42))

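# Detailed Cluster Analysis

Note: the clusters are numbered 1–3 below, whereas the code above prints K-means labels 0–2; Cluster N here presumably corresponds to label N−1 (to be confirmed). The percentages quoted appear to refer to the 10 rows sampled per cluster, not to the full cluster.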
## 1. Demographic Composition
Cluster 1:

Age Balance: 50/50 split between 18-30 and 31-50 age groups
Gender Diversity: Predominantly Female, with some Male and "Prefer not to say"
Most diverse demographic profile

Cluster 2:

Age Skew: Heavily weighted towards 18-30 (80% of sample)
Gender: More varied gender representation
Younger, more diverse demographic

Cluster 3:

Age Concentration: Strongly 31-50 age group (90% of sample)
Gender: More Male-dominated
Mature, stable demographic

## 2. Purchase Behavior
Cluster 1:

Purchase Frequency: Most variable
Categories: Widest range (Beauty, Clothing, Groceries, Others)
Least predictable purchasing pattern

Cluster 2:

Purchase Frequency: Consistent "Few times a week"
Strong focus on Beauty and Personal Care
More intentional, planned purchasing

Cluster 3:

Purchase Frequency: Most conservative (Once a month to Few times a month)
Practical categories: Groceries, Home & Kitchen
Most budget-conscious cluster

## 3. Digital Interaction Patterns
Cluster 1:

Browsing: Mostly "Few times a month"
Search Methods: Most eclectic (categories, keywords mixed)
Exploration: Balanced between first page and multiple pages

Cluster 2:

Browsing: More frequent (Few times a week)
Search Methods: Most sophisticated (heavy filter use)
Exploration: Consistently multi-page browsers
Most digitally engaged cluster

Cluster 3:

Browsing: Moderate frequency
Search Methods: Most pragmatic
Exploration: Mix of first page and multiple pages
Most utilitarian digital interaction

## 4. Recommendation and Personalization
Cluster 1:

Personalized Recommendation Purchase: Mostly "Sometimes"
Moderate openness to personalization
Inconsistent recommendation response

Cluster 2:

Personalized Recommendation Purchase: More "Yes" responses
Highest recommendation frequency
Most receptive to personalized marketing

Cluster 3:

Personalized Recommendation Purchase: Mixed, leaning conservative
Least responsive to personalization
Most skeptical about recommendations

## 5. Review and Satisfaction Dynamics
Cluster 1:

Review Reliability: Most varied (Rarely to Heavily)
Review Helpfulness: Inconsistent
Shopping Satisfaction: Low (mostly 1-2)

Cluster 2:

Review Reliability: Consistently Moderate
Review Helpfulness: More positive
Shopping Satisfaction: Moderate (3-4)
Most balanced review interaction

Cluster 3:

Review Reliability: Lower (Rarely to Occasionally)
Review Helpfulness: Most conservative
Shopping Satisfaction: Lowest (1-2)
Most critical customer segment

## 6. Improvement and Service Appreciation
Cluster 1:

Service Appreciation: Consistently high (7-8)
Improvement Areas: Widely varied
More forgiving, constructive feedback

Cluster 2:

Service Appreciation: Lowest and most variable
Improvement Areas: Focused on specific aspects
Most vocal about potential improvements

Cluster 3:

Service Appreciation: Mixed
Improvement Areas: Practical, specific concerns
Most direct in feedback

## Cluster 1: The Casual, Mixed-Interest Shoppers
Key Characteristics:

Age Range: Predominantly 18-30 and 31-50
Purchase Frequency: Varied (few times a month to multiple times a week)
Browsing Behavior:

Mostly browse few times a month
Mix of search methods (categories, keywords)
Tend to explore multiple pages or first page of search results


Shopping Preferences:

Diverse purchase categories (Beauty, Clothing, Groceries)
Moderate use of personalized recommendations
Mixed attitude towards add-to-cart behavior


Review Interaction:

Varied review reliability (from rarely to heavily)
Mixed review helpfulness


Satisfaction Levels:

Low shopping satisfaction (mostly 1-2)
High service appreciation (7-8)



## Cluster 2: The Exploratory Shoppers
Key Characteristics:

Age Range: Primarily 18-30, some 31-50
Purchase Frequency: Moderate (few times a week to few times a month)
Browsing Behavior:

Consistent few times a week browsing
Diverse search methods (categories, filters, keywords)
Tend to explore multiple pages


Shopping Preferences:

Strong focus on Beauty and Personal Care
More likely to use personalized recommendations
More likely to add items to cart


Review Interaction:

Moderate to high review reliability
More positive about review helpfulness


Satisfaction Levels:

Slightly higher shopping satisfaction (3-4 range)
Lower service appreciation


Unique Trait: More open to personalized recommendations and exploring product options

## Cluster 3: The Occasional Shoppers
Key Characteristics:

Age Range: Predominantly 31-50
Purchase Frequency: Moderate (once a month to few times a month)
Browsing Behavior:

Consistent few times a week to few times a month browsing
Mix of search methods
Often explore multiple pages


Shopping Preferences:

Focused on Groceries, Beauty, and Home/Kitchen
Moderate use of personalized recommendations
More cautious about adding to cart


Review Interaction:

Lower review reliability (rarely to occasionally)
More conservative about review helpfulness


Satisfaction Levels:

Lower shopping satisfaction (mostly 1-2 range)
Mixed service appreciation

### Random Forest to determine which features have the greatest importance for Cart Abandonment Factors and Purchase Frequency

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Target of this model: the cart-abandonment factor reported by each customer
target_variable = 'Cart_Abandonment_Factors'

# Define feature columns
features = [
    'Gender', 'Browsing_Frequency', 'Product_Search_Method',
    'Search_Result_Exploration', 'Customer_Reviews_Importance',
    'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
    'Saveforlater_Frequency', 'Review_Left', 'Review_Reliability',
    'Review_Helpfulness', 'Shopping_Satisfaction'
]

# Build the modelling table on a copy: rows without a target are dropped and
# string columns are encoded here only, so the shared `df` is not filtered or
# re-encoded as a side effect of this analysis.
model_df = df.dropna(subset=[target_variable])[features + [target_variable]].copy()

# Encode string-valued features and the target as integer codes.
# The length check keeps the iloc[0] probe from raising on an empty table.
label_encoder = LabelEncoder()
for col in features + [target_variable]:
    column = model_df[col]
    if column.dtype == 'object' or (len(column) > 0 and isinstance(column.iloc[0], str)):
        model_df[col] = label_encoder.fit_transform(column.astype(str))

# Define features (X) and target (y)
X = model_df[features]
y = model_df[target_variable]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Report accuracy on the held-out split so the importances below can be read
# in the light of how well the model actually predicts the target.
print(f"Held-out accuracy: {model.score(X_test, y_test):.2f}")

# Feature importance (impurity-based, as exposed by the fitted forest)
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance on Cart Abandonment Factors')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()

# Display feature importance
print(feature_importances)


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Map Purchase Frequency to ordinal codes. Only the answers listed here are
# modelled; every other value maps to NaN and is excluded below.
purchase_frequency_mapping = {
    "Few times a month": 2,
    "Once a month": 1,
    "Less than once a month": 0
}
df['Purchase_Frequency_Numeric'] = df['Purchase_Frequency'].map(purchase_frequency_mapping)

# Make the exclusion visible instead of dropping rows silently
has_target = df['Purchase_Frequency_Numeric'].notna()
unmapped_values = df.loc[~has_target, 'Purchase_Frequency']
if not unmapped_values.empty:
    print(
        f"Excluding {len(unmapped_values)} rows whose Purchase_Frequency is not in the mapping: "
        f"{sorted(unmapped_values.astype(str).unique())}"
    )

# Fail early with an actionable message: if nothing maps, the column most
# likely no longer holds the original answer strings (e.g. it was encoded).
if not has_target.any():
    raise ValueError(
        "No 'Purchase_Frequency' value matches purchase_frequency_mapping; "
        "the column must contain the original answer strings."
    )

# Drop rows with missing target values; .copy() makes the result an
# independent frame so the column assignments below do not warn.
df = df.loc[has_target].copy()

# Define feature columns
features = [
    'Gender', 'age', 'Browsing_Frequency', 'Product_Search_Method',
    'Search_Result_Exploration', 'Customer_Reviews_Importance',
    'Add_to_Cart_Browsing', 'Cart_Completion_Frequency',
    'Saveforlater_Frequency', 'Review_Left', 'Review_Reliability',
    'Review_Helpfulness', 'Shopping_Satisfaction'
]

# Ensure categorical features are encoded (numeric columns are left as-is)
label_encoder = LabelEncoder()
for col in features:
    column = df[col]
    if column.dtype == 'object' or (len(column) > 0 and isinstance(column.iloc[0], str)):
        df[col] = label_encoder.fit_transform(column.astype(str))

# Define features (X) and integer target (y)
X = df[features]
y = df['Purchase_Frequency_Numeric'].astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV (3-fold CV, accuracy, all available cores)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best model and parameters
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
y_pred = best_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Class codes and their display names, both derived from the mapping and
# ordered by code. Passing the codes explicitly keeps the report and the
# confusion matrix at three classes even when one class is missing from the
# test split, which would otherwise make the name list mismatch and raise.
class_ids = sorted(purchase_frequency_mapping.values())
class_names = sorted(purchase_frequency_mapping, key=purchase_frequency_mapping.get)

# Classification report
class_report = classification_report(
    y_test, y_pred, labels=class_ids, target_names=class_names, zero_division=0
)
print("Classification Report:")
print(class_report)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
print("Confusion Matrix:")
print(conf_matrix)

# Visualize the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=class_names)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Feature importance from the best model
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': best_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importance on Purchase Frequency (After Tuning)')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.show()

# Display feature importance
print(feature_importances)
