<div style="border-radius:18px;
            border: 3px solid #6A5ACD;
            padding:20px;
            background: linear-gradient(135deg, #000000, #1E1E2E, #2D2D44);
            font-size:110%;
            text-align:center;
            box-shadow: 0px 0px 20px #6A5ACD;">

<a id="toc"></a>
<h1 style="padding: 25px;
           margin:15px;
           font-size:240%;
           font-family: 'Trebuchet MS', sans-serif;
           color:#FFFFFF;
           border-radius:15px;
           background: linear-gradient(90deg, #6A5ACD, #7B68EE, #9370DB);
           text-shadow: 2px 2px 8px #000000;">
📈 Student Stress EDA + 🤖 Predictive Modeling
</h1>

<h3 style="color:#DCDCDC; font-weight:normal; font-size:120%; margin-top:10px;">
✨ Predicting & Analyzing Stress Levels with Machine Learning ✨
</h3>

<hr style="border:1px solid #6A5ACD; margin:20px 0;">

<h3 style="color:#ADFF2F; font-weight:normal; font-size:110%; margin-bottom:5px;">
📅 Created by Wasiq Ali Yasir | 22/08/2025
</h3>
<h3 style="color:#FFB6C1; font-weight:normal; font-size:110%;">
🙏 If you find this notebook helpful, please support with an upvote ❤️
</h3>

</div>


In [None]:
# Stress Dataset Analysis
# Import necessary libraries
import pandas as pd               # For data manipulation and analysis
import numpy as np                # For numerical operations
import matplotlib.pyplot as plt   # For basic plotting and visualization
import seaborn as sns             # For advanced statistical visualizations
from sklearn.model_selection import train_test_split, cross_val_score  # To split data and perform cross-validation
from sklearn.preprocessing import StandardScaler, LabelEncoder         # To scale features and encode labels
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Ensemble ML models
from sklearn.linear_model import LogisticRegression  # Linear model for classification
from sklearn.svm import SVC                          # Support Vector Machine model
from sklearn.neighbors import KNeighborsClassifier   # K-Nearest Neighbors model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # To evaluate model performance
import plotly.express as px         # For interactive visualizations
import plotly.graph_objects as go   # For more control over interactive plots
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')   # To ignore any warning messages

# Set style for plots to make them visually appealing
plt.style.use('ggplot')
sns.set_palette("husl")
%matplotlib inline  # Magic command to display plots inline in Jupyter Notebook

In [None]:
# Read the survey CSV (path is relative to the notebook's working directory)
df = pd.read_csv('Stress_Dataset.csv')

# Report the (rows, columns) shape, then preview the leading records.
# The bare last expression lets Jupyter render the preview as a rich table.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head(n=5)

In [None]:
# Per-column count of missing entries
missing_per_column = df.isnull().sum()
print("Missing values in each column:")
print(missing_per_column)

# Storage type of every column (int, float, object, ...)
column_dtypes = df.dtypes
print("\nData types:")
print(column_dtypes)

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# left as the cell's last expression so it renders as a table.
print("\nDataset statistics:")
df.describe()

In [None]:
# Static EDA plots of the target column against basic demographics.
# Each chart is drawn on an explicitly created Axes so that exactly one figure
# is produced per chart.
target_col = 'Which type of stress do you primarily experience?'

# 1. Distribution of the target variable - Stress Types
stress_counts = df[target_col].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(stress_counts.index, stress_counts.values)
ax.set_title('Distribution of Stress Types')
ax.set_xlabel('Stress Type')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)  # Rotate x-labels for readability
fig.tight_layout()
plt.show()

# 2. Compare Age distribution across different stress types
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x=target_col, y='Age', data=df, ax=ax)
ax.set_title('Age Distribution by Stress Type')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# 3. Analyze Gender distribution across stress types
# DataFrame.plot opens its own figure unless it is given an Axes; passing ax=
# avoids leaving an empty 12x6 figure behind and applies the intended size.
fig, ax = plt.subplots(figsize=(12, 6))
gender_by_stress = pd.crosstab(df['Gender'], df[target_col])
gender_by_stress.plot(kind='bar', ax=ax)
ax.set_title('Gender Distribution by Stress Type')
ax.set_xlabel('Gender (0=Male, 1=Female)')  # NOTE(review): coding taken from the original label - confirm against the dataset docs
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=0)  # Keep x-labels horizontal
fig.tight_layout()
plt.show()

In [None]:
# Interactive (plotly) views of the same data
target_col = 'Which type of stress do you primarily experience?'

# 1. Share of each stress type as a pie chart
pie_fig = px.pie(df, names=target_col, title='Distribution of Stress Types')
pie_fig.show()

# 2. Pairwise correlation of the numeric columns, rendered as a heatmap
corr_matrix = df.corr(numeric_only=True)
heatmap_fig = px.imshow(corr_matrix, title='Feature Correlation Heatmap')
heatmap_fig.show()

# 3. Three survey dimensions at once, with points coloured by stress type
scatter_axes = {
    'x': 'Age',
    'y': 'Have you recently experienced stress in your life?',
    'z': 'Do you feel overwhelmed with your academic workload?',
}
scatter_fig = px.scatter_3d(
    df,
    color=target_col,
    title='3D View of Stress Factors',
    **scatter_axes,
)
scatter_fig.show()

In [None]:
# Exploratory Data Analysis (EDA): target balance, label encoding, and
# correlation of every numeric column with the encoded target.
target_col = 'Which type of stress do you primarily experience?'
encoded_col = 'Stress_Type_Encoded'

# Check the balance of the target variable
target_counts = df[target_col].value_counts()
print("Target variable distribution:")
print(target_counts)

# Encode the target variable as integer class ids (stored as a new column on df)
le = LabelEncoder()
df[encoded_col] = le.fit_transform(df[target_col])

# Correlation of each numeric feature with the encoded target.
# The target's correlation with itself (always 1.0) is removed by label, so it
# is neither reported as a "top feature" nor skipped via a positional slice.
# NOTE: the encoded target is an arbitrary integer code per class, so these
# linear correlations are only a rough screening signal.
correlations = (
    df.corr(numeric_only=True)[encoded_col]
    .drop(encoded_col)
    .sort_values(ascending=False)
)
print("\nTop features correlated with stress type:")
print(correlations.head(10))

# Visualize top correlated features
fig, ax = plt.subplots(figsize=(10, 8))
correlations.head(10).plot(kind='barh', ax=ax)
ax.set_title('Top 10 Features Correlated with Stress Type')
fig.tight_layout()
plt.show()

In [None]:
# Build the modelling inputs: feature matrix X, target vector y,
# a stratified train/test split, and standardized copies of both splits.
non_feature_cols = ['Which type of stress do you primarily experience?', 'Stress_Type_Encoded']
X = df.drop(columns=non_feature_cols)
y = df['Stress_Type_Encoded']

# Hold out 20% for testing; stratifying on y keeps class proportions equal
# in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn scaling statistics from the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit several classifiers on the scaled training data and score each one on
# the held-out test split.

# Candidate estimators, keyed by display name
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Support Vector Machine': SVC(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# results[name] -> fitted model, its test accuracy, and its test predictions
results = {}
for model_name, estimator in models.items():
    estimator.fit(X_train_scaled, y_train)
    test_predictions = estimator.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, test_predictions)

    results[model_name] = dict(
        model=estimator,
        accuracy=test_accuracy,
        predictions=test_predictions,
    )
    print(f"{model_name} Accuracy: {test_accuracy:.4f}")

# Pick the entry with the highest test accuracy (first one wins on ties).
# NOTE: the winner is chosen using the same test split it is reported on.
best_model_name, best_entry = max(results.items(), key=lambda item: item[1]['accuracy'])
best_accuracy = best_entry['accuracy']
print(f"\nBest Model: {best_model_name} with accuracy: {best_accuracy:.4f}")

In [None]:
# Predictive Analysis: detailed evaluation of the best-scoring model.

# Use the best model for detailed analysis
best_model = results[best_model_name]['model']
y_pred = best_model.predict(X_test_scaled)

# Integer ids of every class known to the encoder. Passing them explicitly
# keeps the report rows and the confusion-matrix axes aligned with
# le.classes_ even if a rare class is absent from the test split.
class_ids = np.arange(len(le.classes_))

# Classification report (precision / recall / F1 per class)
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=le.classes_))

# Confusion matrix: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
plt.show()

# Feature importance (only tree-based models expose feature_importances_)
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(15), ax=ax)
    ax.set_title('Top 15 Important Features')
    fig.tight_layout()
    plt.show()

In [None]:
# Predictive Comparison Plots: test-set accuracy and 5-fold CV accuracy per model.


def _plot_accuracy_bars(scores, title, ylabel):
    """Draw one labelled bar per model for a mapping of model name -> accuracy.

    Args:
        scores: dict mapping model display name to an accuracy in [0, 1].
        title: figure title.
        ylabel: y-axis label.
    """
    names = list(scores.keys())
    values = list(scores.values())

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.bar(names, values)
    ax.set_title(title)
    ax.set_xlabel('Models')
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, 1)

    # Add value labels just above each bar
    for bar, value in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
                f'{value:.4f}', ha='center', va='bottom')

    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()


# Compare model performances on the held-out test split
test_accuracies = {name: results[name]['accuracy'] for name in results}
_plot_accuracy_bars(test_accuracies, 'Model Accuracy Comparison', 'Accuracy')

# Cross-validation comparison (5 folds on the training split).
# NOTE: X_train_scaled was standardized with statistics from the whole training
# split, so the folds are not fully isolated from each other's scaling.
cv_results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
    cv_results[name] = cv_scores.mean()
    print(f"{name} Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Plot CV results
_plot_accuracy_bars(cv_results, 'Cross-Validation Accuracy Comparison', 'CV Accuracy')

In [None]:
# Final Report and Insights: plain-text summary of the dataset and model results.
target_col = 'Which type of stress do you primarily experience?'

print("STRESS DATASET ANALYSIS REPORT")
print("=" * 50)

# Dataset overview
# df also carries the derived 'Stress_Type_Encoded' column by this point, so the
# feature count excludes both target columns instead of subtracting a fixed 1.
feature_count = df.drop(columns=[target_col, 'Stress_Type_Encoded'], errors='ignore').shape[1]
print("\n1. DATASET OVERVIEW:")
print(f"   - Total samples: {df.shape[0]}")
print(f"   - Total features: {feature_count} (excluding target)")
print(f"   - Target variable: '{target_col}'")
print(f"   - Target classes: {list(le.classes_)}")

# Data quality
print("\n2. DATA QUALITY:")
print(f"   - Missing values: {df.isnull().sum().sum()}")

# Target distribution
print("\n3. TARGET DISTRIBUTION:")
for stress_type, count in target_counts.items():
    print(f"   - {stress_type}: {count} samples ({count/len(df)*100:.2f}%)")

# Key findings
print("\n4. KEY FINDINGS:")
print(f"   - Most common stress type: {target_counts.index[0]}")
print(f"   - Age range: {df['Age'].min()} to {df['Age'].max()}")
print(f"   - Gender distribution: {df['Gender'].value_counts().to_dict()}")

# Model performance
# The accuracy statement is derived from the measured results rather than
# asserted unconditionally.
accuracy_threshold = 0.7
weak_models = [
    model_name for model_name, entry in results.items()
    if entry['accuracy'] <= accuracy_threshold
]
print("\n5. MODEL PERFORMANCE:")
print(f"   - Best performing model: {best_model_name} ({results[best_model_name]['accuracy']:.4f} accuracy)")
if weak_models:
    print(f"   - Models at or below {accuracy_threshold} accuracy: {', '.join(weak_models)}")
else:
    print(f"   - All models achieved reasonable accuracy (> {accuracy_threshold})")

# Important features (feature_importance only exists when the best model exposes importances)
if hasattr(best_model, 'feature_importances_'):
    top_features = feature_importance.head(5)['feature'].tolist()
    print("\n6. TOP PREDICTIVE FEATURES:")
    for i, feature in enumerate(top_features, 1):
        print(f"   {i}. {feature}")

# Recommendations
print("\n7. RECOMMENDATIONS:")
print("   - The dataset shows clear patterns in stress types")
print("   - Machine learning can effectively classify stress types")
print("   - Certain factors are strong predictors of stress type")
print("   - These insights could be used for targeted stress management interventions")

print("\n" + "=" * 50)