In [32]:
import kagglehub
import pandas as pd
import os
path = kagglehub.dataset_download("salahuddinahmedshuvo/student-mental-stress-and-coping-mechanisms")
print("Path to dataset files:", path)
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(path, filename)
        break
df = pd.read_csv(csv_file_path)
df

Path to dataset files: /root/.cache/kagglehub/datasets/salahuddinahmedshuvo/student-mental-stress-and-coping-mechanisms/versions/1


Unnamed: 0,Student ID,Age,Gender,Academic Performance (GPA),Study Hours Per Week,Social Media Usage (Hours per day),Sleep Duration (Hours per night),Physical Exercise (Hours per week),Family Support,Financial Stress,Peer Pressure,Relationship Stress,Mental Stress Level,Counseling Attendance,Diet Quality,Stress Coping Mechanisms,Cognitive Distortions,Family Mental Health History,Medical Condition,Substance Use
0,802-17-3671,22,Female,2,9,2,12,2,1,1,3,5,9,No,1,Walking or Nature Walks,4,No,Yes,1
1,871-12-8572,25,Female,0,28,0,6,0,1,1,1,2,9,Yes,3,Meditation,2,Yes,No,1
2,495-13-2672,24,Female,0,45,3,12,10,3,3,1,4,9,Yes,5,Reading,1,Yes,Yes,3
3,365-77-2496,20,Male,2,8,7,7,4,1,3,2,5,1,No,1,Social Media Engagement,2,Yes,No,4
4,664-76-5622,28,Male,0,14,6,8,1,2,4,4,2,7,Yes,1,Exercise,1,Yes,No,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,826-89-7993,24,Genderfluid,2,50,4,11,8,5,2,4,5,2,Yes,1,Exercise,3,Yes,No,2
756,681-15-6754,23,Female,3,5,6,12,7,1,4,2,1,1,Yes,2,Meditation,4,Yes,No,1
757,714-33-5373,23,Male,1,17,3,11,5,2,4,3,3,9,Yes,5,Social Media Engagement,4,Yes,No,3
758,849-54-1252,29,Female,1,15,8,10,4,2,1,2,2,2,No,4,Meditation,4,No,Yes,3


In [33]:
df.dropna(inplace=True)
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Encode categorical variables
label_encoders = {}
categorical_columns = ['Gender', 'Counseling Attendance', 'Stress Coping Mechanisms',
                       'Family Mental Health History', 'Medical Condition']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df['Mental Stress Level'] = df['Mental Stress Level'].apply(lambda x: 1 if x > 5 else 0)


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
# Select features and target variable
X = df.drop(columns=['Student ID', 'Mental Stress Level'])
y = df['Mental Stress Level']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train and evaluate models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear')
}
performance_metrics = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    performance_metrics[name] = {'Accuracy': acc, 'Precision': precision, 'Recall': recall, 'F1-score': f1}
    print(f'\n{name} Performance:\n')
    print(f'Accuracy    : {acc:.4f}')
    print(classification_report(y_test, y_pred))


Logistic Regression Performance:

Accuracy    : 0.4803
              precision    recall  f1-score   support

           0       0.52      0.54      0.53        82
           1       0.43      0.41      0.42        70

    accuracy                           0.48       152
   macro avg       0.48      0.48      0.48       152
weighted avg       0.48      0.48      0.48       152


Random Forest Performance:

Accuracy    : 0.5132
              precision    recall  f1-score   support

           0       0.54      0.63      0.58        82
           1       0.46      0.37      0.41        70

    accuracy                           0.51       152
   macro avg       0.50      0.50      0.50       152
weighted avg       0.51      0.51      0.51       152


SVM Performance:

Accuracy    : 0.4934
              precision    recall  f1-score   support

           0       0.53      0.52      0.53        82
           1       0.45      0.46      0.45        70

    accuracy                        

In [35]:
# Feature Importance from Random Forest
rf_model = models['Random Forest']
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print('\nTop 3 Most Important Features:')
print(feature_importance_df.head(3))


Top 3 Most Important Features:
                              Feature  Importance
3                Study Hours Per Week    0.126116
6  Physical Exercise (Hours per week)    0.082010
0                                 Age    0.077096


In [36]:
from sklearn.decomposition import PCA
# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
print('\nExplained Variance by Principal Components:', explained_variance)


Explained Variance by Principal Components: [0.07053338 0.0684774 ]


In [37]:
!pip install statsmodels
from statsmodels.stats.weightstats import ztest
from scipy.stats import chi2_contingency, ttest_1samp # Import chi2_contingency and ttest_1samp


# One-Sample T-Test with hypothesized mean (5)
stress_levels = df['Mental Stress Level']
t_stat, p_value = ttest_1samp(stress_levels, 5)
print('\nT-test Results:')
print(f'T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}')
if p_value < 0.05:
    print('Reject the null hypothesis: The mental stress levels significantly differ from 5.')
else:
    print('Fail to reject the null hypothesis: No significant difference in stress levels.')

# Z-test for study hours between students who attended counseling and those who did not
counseling_yes = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
counseling_no = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']
z_stat, p_value_z = ztest(counseling_yes, counseling_no)
print('\nZ-test Results:')
print(f'Z-statistic: {z_stat:.4f}, P-value: {p_value_z:.4f}')
if p_value_z < 0.05:
    print('Significant difference in study hours between students who attended counseling and those who did not.')
else:
    print('No significant difference in study hours based on counseling attendance.')

# Chi-square test for relationship between gender and counseling attendance
contingency_table = pd.crosstab(df['Gender'], df['Counseling Attendance'])
chi2_stat, p_value_chi, _, _ = chi2_contingency(contingency_table)
print('\nChi-square Test Results:')
print(f'Chi-square statistic: {chi2_stat:.4f}, P-value: {p_value_chi:.4f}')
if p_value_chi < 0.05:
    print('Significant relationship between gender and counseling attendance.')
else:
    print('No significant relationship between gender and counseling attendance.')


T-test Results:
T-statistic: -249.7457, P-value: 0.0000
Reject the null hypothesis: The mental stress levels significantly differ from 5.

Z-test Results:
Z-statistic: 1.1762, P-value: 0.2395
No significant difference in study hours based on counseling attendance.

Chi-square Test Results:
Chi-square statistic: 8.9103, P-value: 0.2592
No significant relationship between gender and counseling attendance.
