# Day 5: Customer Segmentation Assignment - Complete Solution

## Assignment Submission Criteria:

### 1. Gender vs. Spending Score Analysis
- Analyze the relationship between 'Gender' and 'Spending Score (1-100)'
- Determine whether gender plays a significant role in spending habits
- Include visualizations and summary statistics

### 2. Feature Engineering for Clustering
- Create a new feature from the existing data (feature engineering)
- Explore clustering with the engineered feature plus the existing features
- Document the feature engineering process
- Determine the optimal number of clusters (Elbow Method)
- Provide cluster visualizations and interpretation


## Step 1: Import Libraries and Load Data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import scipy.stats as stats
from mpl_toolkits.mplot3d import Axes3D

# Configure the global look of every figure drawn below.
# The style reset must come first: it restores matplotlib's default rcParams
# and would otherwise undo the figure-size and palette settings.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette('husl')

print("All libraries imported successfully!")

In [None]:
# Load the dataset from the local CSV file and print a first overview:
# shape, column names, first rows, schema info and summary statistics.
print("Loading dataset from local file...")
df = pd.read_csv('data/Mall_Customers.csv')

print("Dataset loaded successfully!")
print(f"Data shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\nBasic Statistics:")
print(df.describe())

---
# PART 1: GENDER vs SPENDING SCORE ANALYSIS
**Submission Criteria 1:** Analyze the relationship between 'Gender' and 'Spending Score (1-100)' to see if gender plays a significant role in spending habits.

In [None]:
# Part 1 report: descriptive statistics, group sizes and an independent
# two-sample t-test of spending score between male and female customers.
print("=" * 70)
print("           PART 1: GENDER vs SPENDING SCORE ANALYSIS")
print("=" * 70)

# 1. Descriptive statistics of the spending score within each gender
print("\n1. BASIC STATISTICS BY GENDER:")
print("-" * 40)
gender_spending_stats = df.groupby('Gender')['Spending Score (1-100)'].describe()
print(gender_spending_stats)

# 2. How many customers fall into each gender (absolute and in percent)
print("\n2. GENDER DISTRIBUTION:")
print("-" * 25)
gender_counts = df['Gender'].value_counts()
gender_percentages = df['Gender'].value_counts(normalize=True) * 100
for label in ('Male', 'Female'):
    print(f"{label}: {gender_counts[label]} customers ({gender_percentages[label]:.1f}%)")

# 3. Is the difference in mean spending score statistically significant?
print("\n3. STATISTICAL SIGNIFICANCE TEST (T-TEST):")
print("-" * 45)
is_male = df['Gender'] == 'Male'
is_female = df['Gender'] == 'Female'
male_spending = df.loc[is_male, 'Spending Score (1-100)']
female_spending = df.loc[is_female, 'Spending Score (1-100)']

# Independent two-sample t-test (scipy default: equal variances assumed)
t_stat, p_value = stats.ttest_ind(male_spending, female_spending)

for label, scores in (('Male', male_spending), ('Female', female_spending)):
    print(f"{label} Spending - Mean: {scores.mean():.2f}, Std: {scores.std():.2f}")
mean_gap = abs(male_spending.mean() - female_spending.mean())
print(f"Difference in means: {mean_gap:.2f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")
verdict = 'YES' if p_value < 0.05 else 'NO'
print(f"Statistically significant (p < 0.05): {verdict}")

In [None]:
# Comprehensive Visualizations for Gender vs Spending Score
#
# Draws a 2x3 grid: box plot, violin plot, normalized histogram, mean bar
# chart with standard-deviation error bars, and two scatter plots (age and
# income vs spending score).
#
# A single category order and a single gender -> colour mapping are shared by
# all six panels. Previously the bar chart took its order from groupby()
# (alphabetical: Female, Male) while using the colour list
# ['lightblue', 'lightpink'], so its colours were swapped relative to the
# histogram and scatter panels; the box/violin plots depended on whichever
# gender happened to appear first in the data.
gender_order = ['Male', 'Female']
gender_colors = {'Male': 'lightblue', 'Female': 'lightpink'}
gender_palette = [gender_colors[name] for name in gender_order]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Gender vs Spending Score Analysis - Comprehensive Visualizations', fontsize=16, fontweight='bold')

# 1. Box plot (explicit order keeps the palette aligned with the categories)
sns.boxplot(data=df, x='Gender', y='Spending Score (1-100)', ax=axes[0,0],
            order=gender_order, palette=gender_palette)
axes[0,0].set_title('Spending Score Distribution by Gender\n(Box Plot)', fontsize=12, fontweight='bold')
axes[0,0].set_ylabel('Spending Score (1-100)')

# 2. Violin plot
sns.violinplot(data=df, x='Gender', y='Spending Score (1-100)', ax=axes[0,1],
               order=gender_order, palette=gender_palette)
axes[0,1].set_title('Spending Score Density by Gender\n(Violin Plot)', fontsize=12, fontweight='bold')
axes[0,1].set_ylabel('Spending Score (1-100)')

# 3. Histogram comparison (density=True so unequal group sizes are comparable)
for gender, scores in (('Male', male_spending), ('Female', female_spending)):
    axes[0,2].hist(scores, alpha=0.7, label=gender, bins=15,
                   color=gender_colors[gender], density=True)
axes[0,2].set_title('Spending Score Distribution\n(Normalized Histogram)', fontsize=12, fontweight='bold')
axes[0,2].set_xlabel('Spending Score (1-100)')
axes[0,2].set_ylabel('Density')
axes[0,2].legend()

# 4. Mean comparison bar plot
gender_means = df.groupby('Gender')['Spending Score (1-100)'].mean()
gender_std = df.groupby('Gender')['Spending Score (1-100)'].std()
# Re-order copies for plotting only, so bars follow the same gender order and
# colours as the other panels; gender_means / gender_std stay untouched.
bar_means = gender_means.reindex(gender_order)
bar_std = gender_std.reindex(gender_order)
bars = axes[1,0].bar(bar_means.index, bar_means.values,
                     color=gender_palette, alpha=0.8,
                     yerr=bar_std.values, capsize=5)
axes[1,0].set_title('Average Spending Score by Gender\n(with Standard Deviation)', fontsize=12, fontweight='bold')
axes[1,0].set_ylabel('Average Spending Score')

# Add value labels above the error-bar caps so the text does not sit on top
# of the error-bar line running through the centre of each bar.
for bar, value, spread in zip(bars, bar_means.values, bar_std.values):
    axes[1,0].text(bar.get_x() + bar.get_width()/2, value + spread + 1,
                   f'{value:.1f}', ha='center', va='bottom', fontweight='bold')

# 5. Scatter plot: Age vs Spending Score colored by Gender
for gender in gender_order:
    gender_data = df[df['Gender'] == gender]
    axes[1,1].scatter(gender_data['Age'], gender_data['Spending Score (1-100)'],
                      color=gender_colors[gender], label=gender, alpha=0.7, s=50)
axes[1,1].set_title('Age vs Spending Score\n(by Gender)', fontsize=12, fontweight='bold')
axes[1,1].set_xlabel('Age')
axes[1,1].set_ylabel('Spending Score (1-100)')
axes[1,1].legend()

# 6. Income vs Spending Score colored by Gender
for gender in gender_order:
    gender_data = df[df['Gender'] == gender]
    axes[1,2].scatter(gender_data['Annual Income (k$)'], gender_data['Spending Score (1-100)'],
                      color=gender_colors[gender], label=gender, alpha=0.7, s=50)
axes[1,2].set_title('Income vs Spending Score\n(by Gender)', fontsize=12, fontweight='bold')
axes[1,2].set_xlabel('Annual Income (k$)')
axes[1,2].set_ylabel('Spending Score (1-100)')
axes[1,2].legend()

plt.tight_layout()
plt.show()

In [None]:
# Additional Statistical Analysis
#
# Complements the Part 1 t-test with a non-parametric test, an effect size
# (Cohen's d) and a 95% confidence interval for the difference in mean
# spending score (male minus female).
print("\n4. ADDITIONAL STATISTICAL ANALYSIS:")
print("-" * 40)

# Mann-Whitney U test (non-parametric alternative)
u_stat, u_p_value = stats.mannwhitneyu(male_spending, female_spending, alternative='two-sided')
print(f"Mann-Whitney U test p-value: {u_p_value:.4f}")

# Effect size (Cohen's d) based on the pooled standard deviation
n_male = len(male_spending)
n_female = len(female_spending)
pooled_dof = n_male + n_female - 2
pooled_std = np.sqrt(((n_male - 1) * male_spending.var() +
                      (n_female - 1) * female_spending.var()) /
                     pooled_dof)
cohens_d = (male_spending.mean() - female_spending.mean()) / pooled_std
print(f"Cohen's d (effect size): {cohens_d:.4f}")

# Interpret effect size (conventional 0.2 / 0.5 / 0.8 thresholds)
if abs(cohens_d) < 0.2:
    effect_interpretation = "negligible"
elif abs(cohens_d) < 0.5:
    effect_interpretation = "small"
elif abs(cohens_d) < 0.8:
    effect_interpretation = "medium"
else:
    effect_interpretation = "large"

print(f"Effect size interpretation: {effect_interpretation}")

# Confidence interval for the difference in means.
# The Part 1 test is stats.ttest_ind with its default equal-variance
# assumption, so the matching interval uses the pooled standard error and the
# t critical value with n1 + n2 - 2 degrees of freedom. The earlier version
# mixed an unpooled standard error with the normal quantile 1.96, which can
# contradict the reported p-value (interval excluding 0 while p >= 0.05, or
# vice versa).
diff_mean = male_spending.mean() - female_spending.mean()
se_diff = pooled_std * np.sqrt(1 / n_male + 1 / n_female)
t_crit = stats.t.ppf(0.975, pooled_dof)  # two-sided 95% critical value
ci_lower = diff_mean - t_crit * se_diff
ci_upper = diff_mean + t_crit * se_diff
print(f"95% Confidence Interval for difference: [{ci_lower:.2f}, {ci_upper:.2f}]")