In [None]:
# 1. Import libraries
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('ggplot')
sns.set_palette("viridis")
%matplotlib inline

# 2. Load dataset
df = pd.read_csv('StressLevelDataset.csv')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

In [None]:
# Basic information about the dataset
# DataFrame.info() writes its summary directly to stdout and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
print("Dataset Info:")
df.info()

# Number of null entries in each column
print("\nMissing values:")
print(df.isnull().sum())

# Summary statistics; left as the cell's last expression so the notebook
# renders the resulting frame as a table.
print("\nDescriptive Statistics:")
df.describe()

In [None]:
# 3. Data visualization and analysis
# Distribution of the target variable
plt.figure(figsize=(10, 6))
sns.countplot(x='stress_level', data=df)
plt.title('Distribution of Stress Levels')
plt.xlabel('Stress Level')
plt.ylabel('Count')
plt.show()

# Correlation matrix (pairwise correlations between all columns of df);
# corr_matrix is reused by the interactive heatmap later in the notebook.
plt.figure(figsize=(16, 12))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.tight_layout()
plt.show()

# Feature correlations with stress level, sorted from most positive to most negative.
# The target's self-correlation is removed by label rather than by slicing off the
# first sorted entry: positional slicing silently drops the wrong row if another
# column ties with the target at 1.0 and happens to sort ahead of it.
stress_corr = (
    corr_matrix['stress_level']
    .drop('stress_level')
    .sort_values(ascending=False)
)
plt.figure(figsize=(10, 8))
sns.barplot(x=stress_corr.values, y=stress_corr.index)
plt.title('Features Correlation with Stress Level')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()

In [None]:
# 3.5 Interactive plots
# Interactive correlation matrix, drawn from the corr_matrix computed earlier in the notebook
fig = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title='Interactive Correlation Matrix',
)
fig.show()

# Interactive distribution of stress levels
viridis_sequence = px.colors.sequential.Viridis
fig = px.histogram(
    df,
    x='stress_level',
    title='Distribution of Stress Levels',
    color_discrete_sequence=viridis_sequence,
)
fig.show()

# Interactive scatter plot of anxiety vs depression colored by stress level
axis_labels = {'anxiety_level': 'Anxiety Level', 'depression': 'Depression'}
fig = px.scatter(
    df,
    x='anxiety_level',
    y='depression',
    color='stress_level',
    title='Anxiety vs Depression by Stress Level',
    labels=axis_labels,
    color_continuous_scale='viridis',
)
fig.show()

In [None]:
# 4. EDA - Exploratory Data Analysis
# Distribution of numerical features: one histogram with a KDE overlay per panel of a 2x2 grid
numerical_features = ['anxiety_level', 'self_esteem', 'depression', 'sleep_quality']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# Walk the flattened grid and the feature list in lockstep
for ax, feature in zip(axes.ravel(), numerical_features):
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature.replace("_", " ").title()}')

plt.tight_layout()
plt.show()

# Box plots for key features by stress level, again on a 2x2 grid
features_to_plot = ['anxiety_level', 'depression', 'self_esteem', 'sleep_quality']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

for ax, feature in zip(axes.ravel(), features_to_plot):
    sns.boxplot(x='stress_level', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature.replace("_", " ").title()} by Stress Level')

plt.tight_layout()
plt.show()

# Pairplot of selected features, with points colored by the target
pairplot_columns = ['anxiety_level', 'self_esteem', 'depression', 'sleep_quality', 'stress_level']
sns.pairplot(df[pairplot_columns], hue='stress_level', palette='viridis')
plt.suptitle('Pairplot of Key Features by Stress Level', y=1.02)
plt.show()

In [None]:
# Check for class imbalance: number of rows per stress level
stress_counts = df['stress_level'].value_counts()
print("Class distribution:")
print(stress_counts)

# Prepare data for modeling: every column except the target is a feature
X = df.drop(columns='stress_level')
y = df['stress_level']

# Hold out 20% of the rows for testing; stratify so both splits keep the
# target's class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Scale the features: the scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

In [None]:
# 5. All ML models
# Initialize models (seeded where the estimator accepts a random_state)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# Train and evaluate models.
# results maps model name -> fitted model, test accuracy, CV mean/std and test predictions.
results = {}
for name, model in models.items():
    # Fit on the scaled training split and predict the held-out test split
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Accuracy on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)

    # Cross-validation score.
    # The scaler is placed inside a pipeline and cross-validated on the *unscaled*
    # training data, so each fold fits its own scaler on that fold's training part.
    # Cross-validating on X_train_scaled instead would let every validation fold
    # influence the scaling statistics (preprocessing leakage).
    # cross_val_score clones the pipeline, so the fitted `model` above is not refit.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)

    # Store results
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_pred': y_pred
    }

    print(f"{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  CV Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print()

In [None]:
# 6. Predictive analysis
# Compare model performance: one row per model, best test accuracy first
model_comparison = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [entry['accuracy'] for entry in results.values()],
    'CV Mean Score': [entry['cv_mean'] for entry in results.values()],
    'CV Std': [entry['cv_std'] for entry in results.values()]
}).sort_values('Accuracy', ascending=False)

print("Model Comparison:")
print(model_comparison)

# Best model = first row after sorting by test accuracy.
# NOTE(review): the winner is picked on the same test split whose accuracy is then
# reported, so that figure is optimistic; consider selecting on 'CV Mean Score'.
best_model_name = model_comparison.iloc[0]['Model']
best_model = results[best_model_name]['model']
print(f"\nBest Model: {best_model_name}")

# Detailed evaluation of best model
y_pred_best = results[best_model_name]['y_pred']
print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_pred_best))

# Confusion matrix for best model.
# The class labels learned by the classifier are passed explicitly so that the
# matrix rows/columns and the heatmap tick labels are guaranteed to refer to the
# same classes (without them the ticks are just positional indices).
class_labels = list(best_model.classes_)
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# 7. Predictive comparison plots or result
# Lower bound of the score axes. It stays at 0.8 when every score is at least 0.8
# and otherwise drops just below the weakest score (never below 0), so that no
# model's bar is cut off by the axis limits.
lowest_score = model_comparison[['Accuracy', 'CV Mean Score']].min().min()
score_axis_floor = max(0.0, min(0.8, lowest_score - 0.05))

# Accuracy comparison plot
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='Accuracy', data=model_comparison)
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)
plt.ylim(score_axis_floor, 1.0)
plt.tight_layout()
plt.show()

# Cross-validation scores comparison
plt.figure(figsize=(12, 6))
sns.barplot(x='Model', y='CV Mean Score', data=model_comparison)
plt.title('Model Cross-Validation Score Comparison')
plt.xticks(rotation=45)
plt.ylim(score_axis_floor, 1.0)
plt.tight_layout()
plt.show()

# Interactive comparison plot: test accuracy and CV mean side by side per model
fig = go.Figure()
fig.add_trace(go.Bar(
    x=model_comparison['Model'],
    y=model_comparison['Accuracy'],
    name='Accuracy',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=model_comparison['Model'],
    y=model_comparison['CV Mean Score'],
    name='CV Mean Score',
    marker_color='lightsalmon'
))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_tickangle=-45,
    barmode='group'
)
fig.show()

# Feature importance, only for best models that expose feature_importances_
# (the tree-based ones). Importances are paired with X.columns for naming.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=feature_importance)
    # Title names the model actually plotted rather than assuming Random Forest
    plt.title(f'Feature Importance - {best_model_name}')
    plt.tight_layout()
    plt.show()

In [None]:
# 8. Report
# Plain-text summary of the dataset, the model comparison and the recommendations.
print("="*50)
print("STRESS LEVEL PREDICTION - ANALYSIS REPORT")
print("="*50)

print("\nDataset Overview:")
print(f"- Total samples: {df.shape[0]}")
# df still contains the target column, so it is excluded from the feature count
print(f"- Number of features: {df.shape[1] - 1}")
print("- Target variable: stress_level")
print("- Class distribution:")
for level, count in stress_counts.items():
    print(f"  Level {level}: {count} samples ({count/len(df)*100:.1f}%)")

# NOTE(review): the findings below are fixed text, not computed from the data
# in this cell; re-check them against the correlation plots when the data changes.
print("\nKey Findings from EDA:")
print("- Anxiety level and depression show strong positive correlation with stress level")
print("- Self-esteem shows strong negative correlation with stress level")
print("- Sleep quality is moderately negatively correlated with stress level")
print("- Mental health history is positively correlated with stress level")

print("\nModel Performance Summary:")
print(f"- Best performing model: {best_model_name}")
print(f"- Best model accuracy: {results[best_model_name]['accuracy']:.4f}")
print(f"- Best model cross-validation score: {results[best_model_name]['cv_mean']:.4f}")

print("\nTop 5 Most Important Features:")
# Importances are read from the best model itself instead of checking whether a
# variable from another cell happens to exist, which could be stale kernel state.
if hasattr(best_model, 'feature_importances_'):
    top_features = (
        pd.Series(best_model.feature_importances_, index=X.columns)
        .sort_values(ascending=False)
        .head()
    )
    # Number by position in the ranking (1..5), not by the original column index
    for rank, (feature_name, importance) in enumerate(top_features.items(), start=1):
        print(f"  {rank}. {feature_name}: {importance:.4f}")
else:
    print("  (Feature importance not available for the best model)")

print("\nRecommendations:")
print("- Focus on interventions that reduce anxiety and depression")
print("- Programs to improve self-esteem may help reduce stress levels")
print("- Promoting better sleep quality could be beneficial")
print("- Those with mental health history may need additional support")

print("\nLimitations:")
print("- Dataset may have class imbalance issues")
print("- Model performance might vary with different data splits")
print("- Real-world application would require continuous model evaluation")