# Data Mining Analysis

**Contributors:** [Add team member names]

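This notebook applies three data mining techniques (clustering, classification, and association rule mining) to the OSMI Mental Health Tech Survey dataset to uncover patterns, predict treatment-seeking behavior, and identify relationships among respondent attributes.

**Requirements:** in addition to pandas, NumPy, matplotlib, seaborn, and scikit-learn, the `mlxtend` package must be installed in the kernel's environment; without it the import cell below fails with `ModuleNotFoundError`.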

## 1. Load and Prepare Data

In [6]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# Silence all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the order matters, because a style sheet can reset the colour cycle)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the cleaned dataset
dataset_path = '../data/transformed/OSMI_Mental_Health_Final.csv'
df = pd.read_csv(dataset_path)
print(f"Dataset shape: {df.shape}")
df.head()

ModuleNotFoundError: No module named 'mlxtend'

## 2. Data Preprocessing for Mining

In [None]:
# Restrict the working frame to the columns used by the mining steps.
# Taking a copy keeps the loaded frame `df` untouched.
mining_features = ['Age', 'Gender', 'treatment', 'family_history', 'age_group']
df_mining = df[mining_features].copy()

# Report missing values before imputation
print("Missing values before preprocessing:")
print(df_mining.isnull().sum())

# Impute categorical (object-dtype) columns with their most frequent value
categorical_cols = df_mining.select_dtypes(include=['object']).columns
for column in categorical_cols:
    most_frequent = df_mining[column].mode()[0]
    df_mining[column] = df_mining[column].fillna(most_frequent)

# Impute numeric columns with their median
numerical_cols = df_mining.select_dtypes(include=[np.number]).columns
for column in numerical_cols:
    median_value = df_mining[column].median()
    df_mining[column] = df_mining[column].fillna(median_value)

print("\nMissing values after preprocessing:")
print(df_mining.isnull().sum())

# Integer-encode every categorical column into a sibling `<name>_encoded`
# column, keeping each fitted encoder so the mapping can be looked up later.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df_mining[col])
    df_mining[col + '_encoded'] = encoded_values
    label_encoders[col] = encoder
    class_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{col} encoding: {class_to_code}")

# Feature matrix for clustering: raw Age plus every encoded categorical column
encoded_feature_cols = [col + '_encoded' for col in categorical_cols if col != 'Age']
feature_cols = ['Age'] + encoded_feature_cols
X_clustering = df_mining[feature_cols].copy()

print(f"\nFeature matrix shape: {X_clustering.shape}")
print(f"Features: {list(X_clustering.columns)}")

## 3. Clustering Analysis

### 3.1 K-Means Clustering

In [None]:
# Standardize features so Age and the integer-encoded columns are on a
# comparable scale before distance-based clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clustering)

# Candidate cluster counts. Inertia is recorded for every k; the silhouette
# score needs at least two clusters, so it is recorded from k=2 upwards.
K_range = range(1, 11)
silhouette_k_range = range(2, 11)

inertias = []
silhouette_scores = []

# Fit each k exactly once and collect both diagnostics from the same model.
# The seed is fixed, so this yields the same values as fitting separately
# for each metric while halving the number of K-Means runs.
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)
    inertias.append(kmeans.inertia_)
    if k in silhouette_k_range:
        # Store plain floats so the printed list stays readable
        silhouette_scores.append(float(silhouette_score(X_scaled, cluster_labels)))

# Side-by-side diagnostics: elbow curve (left) and silhouette curve (right)
fig, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(10, 5))

ax_elbow.plot(K_range, inertias, 'bo-')
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('Inertia')
ax_elbow.set_title('Elbow Method for Optimal k')
ax_elbow.grid(True)

ax_silhouette.plot(silhouette_k_range, silhouette_scores, 'ro-')
ax_silhouette.set_xlabel('Number of Clusters (k)')
ax_silhouette.set_ylabel('Silhouette Score')
ax_silhouette.set_title('Silhouette Analysis')
ax_silhouette.grid(True)

plt.tight_layout()
plt.show()

# Select the k with the highest average silhouette score
print(f"Silhouette scores: {silhouette_scores}")
optimal_k = silhouette_k_range[int(np.argmax(silhouette_scores))]
print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# Fit the final K-Means model with the selected number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach each row's cluster assignment to the working frame
df_mining['Cluster'] = cluster_labels

# Cluster sizes
print("=== CLUSTER ANALYSIS ===")
print(f"Number of clusters: {optimal_k}")
print(f"Cluster sizes:")
for i in range(optimal_k):
    cluster_size = (df_mining['Cluster'] == i).sum()
    percentage = (cluster_size / len(df_mining)) * 100
    print(f"  Cluster {i}: {cluster_size} ({percentage:.1f}%)")

# Project the scaled features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
plt.title('K-Means Clustering Results (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(scatter)

# Cluster characteristics.
# `cluster_means` keeps the per-cluster means in original units. The heatmap
# shows those means as z-scores relative to the whole sample, so that Age
# (tens of years) and the small integer codes share one colour scale and
# `center=0` corresponds to "equal to the overall average".
plt.subplot(1, 2, 2)
profile_cols = ['Age', 'Gender_encoded', 'treatment_encoded', 'family_history_encoded']
cluster_means = df_mining.groupby('Cluster')[profile_cols].mean()
overall_mean = df_mining[profile_cols].mean()
# A constant column has zero spread; divide by 1 there instead of producing NaN
overall_std = df_mining[profile_cols].std(ddof=0).replace(0, 1)
cluster_means_z = (cluster_means - overall_mean) / overall_std
sns.heatmap(cluster_means_z.T, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Cluster Characteristics (Normalized)')
plt.xlabel('Cluster')
plt.ylabel('Features')

plt.tight_layout()
plt.show()

# Detailed cluster analysis
print("\n=== DETAILED CLUSTER CHARACTERISTICS ===")
for i in range(optimal_k):
    cluster_data = df_mining[df_mining['Cluster'] == i]
    # Compare case-insensitively so 'Yes' / 'yes' / ' yes' all count as positive
    sought_treatment = cluster_data['treatment'].astype(str).str.strip().str.lower() == 'yes'
    has_family_history = cluster_data['family_history'].astype(str).str.strip().str.lower() == 'yes'
    print(f"\nCluster {i}:")
    print(f"  Size: {len(cluster_data)} ({len(cluster_data)/len(df_mining)*100:.1f}%)")
    print(f"  Average Age: {cluster_data['Age'].mean():.1f}")
    print(f"  Treatment Rate: {sought_treatment.mean()*100:.1f}%")
    print(f"  Family History Rate: {has_family_history.mean()*100:.1f}%")

### 3.2 DBSCAN Clustering

In [None]:
# Try DBSCAN clustering
from sklearn.neighbors import NearestNeighbors

# K-distance curve: for every point take the distance to the farthest of the
# 5 neighbours returned (column index 4), then sort those distances ascending.
nn_model = NearestNeighbors(n_neighbors=5).fit(X_scaled)
distances, indices = nn_model.kneighbors(X_scaled)
distances = np.sort(distances[:, 4])

# Sweep a fixed set of epsilon values, recording cluster and noise counts
eps_values = [0.1, 0.2, 0.3, 0.4, 0.5]
n_clusters_dbscan = []
n_noise = []

for eps in eps_values:
    labels = DBSCAN(eps=eps, min_samples=5).fit_predict(X_scaled)
    # DBSCAN marks noise with the label -1, which is not a real cluster
    noise_label_present = -1 in labels
    distinct_labels = len(set(labels))
    n_clusters_dbscan.append(distinct_labels - 1 if noise_label_present else distinct_labels)
    n_noise.append(int(np.sum(labels == -1)))

# Left: k-distance graph. Right: number of clusters found per epsilon.
fig, (ax_kdist, ax_eps) = plt.subplots(1, 2, figsize=(10, 5))

ax_kdist.plot(distances)
ax_kdist.set_xlabel('Points')
ax_kdist.set_ylabel('Distance')
ax_kdist.set_title('K-Distance Graph for DBSCAN')
ax_kdist.grid(True)

ax_eps.plot(eps_values, n_clusters_dbscan, 'bo-')
ax_eps.set_xlabel('Epsilon')
ax_eps.set_ylabel('Number of Clusters')
ax_eps.set_title('DBSCAN Clusters vs Epsilon')
ax_eps.grid(True)

plt.tight_layout()
plt.show()

print("DBSCAN Results:")
for eps, n_clusters, noise in zip(eps_values, n_clusters_dbscan, n_noise):
    print(f"Epsilon {eps}: {n_clusters} clusters, {noise} noise points")

## 4. Classification Analysis

### 4.1 Predict Treatment Seeking Behavior

In [None]:
# Predictors and target for the treatment-seeking classifier
predictor_cols = ['Age', 'Gender_encoded', 'family_history_encoded']
X_class = df_mining[predictor_cols].copy()
y_class = df_mining['treatment_encoded']

# 70/30 split, stratified on the target so both parts keep its class balance
X_train, X_test, y_train, y_test = train_test_split(
    X_class,
    y_class,
    test_size=0.3,
    random_state=42,
    stratify=y_class,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Class distribution in training set:")
print(y_train.value_counts(normalize=True))

# Fit the scaler on the training rows only, then apply the same
# transformation to the held-out test rows.
scaler_class = StandardScaler().fit(X_train)
X_train_scaled = scaler_class.transform(X_train)
X_test_scaled = scaler_class.transform(X_test)

In [None]:
# Candidate classifiers, all seeded for reproducibility
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Per-model metrics and test-set predictions, keyed by model name
results = {}

for name, estimator in models.items():
    print(f"\n=== {name} ===")

    # Fit on the scaled training data, then predict the held-out test set
    estimator.fit(X_train_scaled, y_train)
    y_pred = estimator.predict(X_test_scaled)

    # Hold-out accuracy plus 5-fold cross-validation on the training data
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores = cross_val_score(estimator, X_train_scaled, y_train, cv=5)

    results[name] = {
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'predictions': y_pred
    }

    print(f"Accuracy: {accuracy:.3f}")
    print(f"Cross-validation: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix as an annotated heatmap, one figure per model
    conf_matrix = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    plt.show()

In [None]:
# Gather the per-model metrics in a fixed order for plotting
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
cv_means = [results[name]['cv_mean'] for name in model_names]
cv_stds = [results[name]['cv_std'] for name in model_names]

fig, (ax_acc, ax_cv) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: hold-out accuracy, with the value written above each bar
ax_acc.bar(model_names, accuracies)
ax_acc.set_title('Model Accuracy Comparison')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_ylim(0, 1)
for i, v in enumerate(accuracies):
    ax_acc.text(i, v + 0.01, f'{v:.3f}', ha='center')

# Right panel: mean cross-validation score with +/- one std as error bars
ax_cv.bar(model_names, cv_means, yerr=cv_stds, capsize=5)
ax_cv.set_title('Cross-Validation Scores')
ax_cv.set_ylabel('CV Score')
ax_cv.set_ylim(0, 1)

plt.tight_layout()
plt.show()

# Feature importance for Random Forest.
# Note: `best_model` is always the fitted Random Forest from `models`; it is
# not selected by score.
best_model = models['Random Forest']
feature_importance = best_model.feature_importances_
feature_names = X_class.columns

plt.figure(figsize=(8, 6))
plt.bar(feature_names, feature_importance)
plt.title('Feature Importance (Random Forest)')
plt.ylabel('Importance')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("Feature Importance:")
for feature, importance in zip(feature_names, feature_importance):
    print(f"{feature}: {importance:.3f}")

## 5. Association Rule Mining

In [None]:
# Prepare data for association rules.
# Each source column is mapped to the prefix its one-hot indicator columns get.
# Passing the mapping by name (instead of a positional prefix list) keeps the
# prefixes attached to the right columns whatever dtype each column has.
assoc_prefixes = {
    'Gender': 'Gender',
    'treatment': 'Treatment',
    'family_history': 'FamilyHistory',
    'age_group': 'AgeGroup',
}
assoc_columns = list(assoc_prefixes)
df_assoc = df_mining[assoc_columns].copy()

# One-hot encode into boolean indicator columns. The dtype is requested
# explicitly because the default depends on the pandas version, and the
# frequent-itemset step expects True/False (or 0/1) input.
df_dummies = pd.get_dummies(
    df_assoc,
    columns=assoc_columns,
    prefix=assoc_prefixes,
    dtype=bool,
)

print(f"Association rules dataset shape: {df_dummies.shape}")
print(f"Features: {list(df_dummies.columns)}")
df_dummies.head()

In [None]:
# Frequent itemsets: indicator combinations present in at least 10% of rows
frequent_itemsets = apriori(df_dummies, min_support=0.1, use_colnames=True)
itemsets_by_support = frequent_itemsets.sort_values('support', ascending=False)

print(f"Number of frequent itemsets: {len(frequent_itemsets)}")
print("\nTop 10 frequent itemsets:")
print(itemsets_by_support.head(10))

# Association rules derived from those itemsets, keeping confidence >= 0.5
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules_by_confidence = rules.sort_values('confidence', ascending=False)
rule_summary_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']

print(f"\nNumber of association rules: {len(rules)}")
print("\nTop 10 rules by confidence:")
print(rules_by_confidence.head(10)[rule_summary_cols])

In [None]:
# Visualize association rules: one scatter panel per pair of rule metrics.
# Each entry is (x column, y column, x label, y label, panel title).
metric_panels = [
    ('support', 'confidence', 'Support', 'Confidence', 'Support vs Confidence'),
    ('support', 'lift', 'Support', 'Lift', 'Support vs Lift'),
    ('confidence', 'lift', 'Confidence', 'Lift', 'Confidence vs Lift'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (x_col, y_col, x_label, y_label, panel_title) in zip(axes, metric_panels):
    ax.scatter(rules[x_col], rules[y_col], alpha=0.5)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Keep only rules that clear both the confidence and the lift threshold
is_confident = rules['confidence'] > 0.7
is_lifted = rules['lift'] > 1.2
high_quality_rules = rules[is_confident & is_lifted]

print(f"\nHigh-quality rules (confidence > 0.7, lift > 1.2): {len(high_quality_rules)}")
print("\nTop high-quality rules:")
for _, rule in high_quality_rules.head(5).iterrows():
    print(f"Rule: {list(rule['antecedents'])} -> {list(rule['consequents'])}")
    print(f"Support: {rule['support']:.3f}, Confidence: {rule['confidence']:.3f}, Lift: {rule['lift']:.3f}")
    print()

## 6. Key Findings and Insights

In [None]:
# Pick the model with the highest hold-out accuracy
# (on ties, the first one in `results` wins)
best_model_name = max(results, key=lambda model_key: results[model_key]['accuracy'])
best_accuracy = results[best_model_name]['accuracy']

# The report is assembled as a list of lines and printed in order
summary_lines = [
    "=== DATA MINING INSIGHTS SUMMARY ===",

    "\n1. CLUSTERING INSIGHTS:",
    f"   - Optimal number of clusters: {optimal_k}",
    f"   - Clusters represent different mental health profiles",
    f"   - Each cluster has distinct demographic and treatment patterns",

    "\n2. CLASSIFICATION INSIGHTS:",
    f"   - Best model: {best_model_name} (Accuracy: {best_accuracy:.3f})",
    f"   - Key predictors of treatment seeking: Age, Gender, Family History",
    f"   - Model can help identify at-risk individuals",

    "\n3. ASSOCIATION RULE INSIGHTS:",
    f"   - Number of frequent patterns: {len(frequent_itemsets)}",
    f"   - Number of association rules: {len(rules)}",
    f"   - High-quality rules: {len(high_quality_rules)}",
    f"   - Rules reveal relationships between mental health factors",

    "\n4. BUSINESS IMPLICATIONS:",
    "   - Organizations can target interventions based on cluster profiles",
    "   - Predictive models can identify employees needing mental health support",
    "   - Association rules help understand risk factor combinations",
    "   - Data-driven approach to mental health workplace policies",
]

for summary_line in summary_lines:
    print(summary_line)

## Conclusion

This data mining analysis has successfully applied three key techniques to the mental health dataset:

1. **Clustering Analysis**: Identified distinct segments of tech workers with different mental health characteristics
2. **Classification**: Trained logistic regression, decision tree, and random forest models to predict treatment-seeking behavior from age, gender, and family history, and compared their accuracy
3. **Association Rules**: Discovered relationships between different mental health factors

These insights provide actionable intelligence for organizations looking to improve mental health support in the tech industry.

---
**Next Steps**: Proceed to `4_insights_dashboard.ipynb` to create interactive visualizations and dashboards.