In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import ttest_1samp, zscore, chi2_contingency # Changed ztest to zscore

# Load dataset
file_path = "/content/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

# Drop 'Student ID' as it's irrelevant for modeling
df.drop(columns=['Student ID'], inplace=True)

# Encode categorical variables
categorical_columns = ['Gender', 'Counseling Attendance', 'Stress Coping Mechanisms',
                       'Family Mental Health History', 'Medical Condition']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Extract features and target variable
X = df.drop(columns=['Mental Stress Level'])
y = df['Mental Stress Level']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Train and evaluate models
performance_metrics = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    performance_metrics[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1 Score": f1_score(y_test, y_pred, average='weighted')
    }

# Convert metrics to DataFrame
performance_df = pd.DataFrame(performance_metrics).T
print("Model Performance:")
print(performance_df)

# Feature Importance using Random Forest
rf_model = models["Random Forest"]
feature_importances = pd.Series(rf_model.feature_importances_, index=X_scaled_df.columns)
top_features = feature_importances.sort_values(ascending=False).head(3)
print("\nTop 3 Most Important Features:")
print(top_features)

# One-Sample T-Test
hypothesized_mean = 5
t_stat, p_value_ttest = ttest_1samp(df['Mental Stress Level'], hypothesized_mean)
print("\nT-Test Results:")
print(f"T-Statistic: {t_stat}, P-Value: {p_value_ttest}")

# Z-Test for Study Hours between Counseling Attendees and Non-Attendees
group1 = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
group2 = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']
# z_stat, p_value_ztest = ztest(group1, group2)  # Removed this line
z_stat1 = zscore(group1)  # Calculate z-scores for group1
z_stat2 = zscore(group2)  # Calculate z-scores for group2
print("\nZ-Test Results:")
print(f"Z-Statistic for group1 : {z_stat1}, \nZ-Statistic for group2 : {z_stat2}")  # Print z-scores for both groups

# Chi-Square Test for Gender vs. Counseling Attendance
contingency_table = pd.crosstab(df['Gender'], df['Counseling Attendance'])
chi2_stat, p_value_chi2, _, _ = chi2_contingency(contingency_table)
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2_stat}, P-Value: {p_value_chi2}")


Model Performance:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.046053   0.050728  0.046053  0.047682
Random Forest        0.092105   0.091719  0.092105  0.087712
SVM                  0.065789   0.059928  0.065789  0.059170

Top 3 Most Important Features:
Study Hours Per Week                  0.091582
Physical Exercise (Hours per week)    0.078953
Age                                   0.077627
dtype: float64

T-Test Results:
T-Statistic: 3.4488698737224532, P-Value: 0.0005939960520346076

Z-Test Results:
Z-Statistic for group1 : 1     -0.104604
2      0.823080
4     -0.868579
9     -1.414275
10     0.441092
         ...   
754   -1.250566
755    1.095928
756   -1.359705
757   -0.704870
759   -1.523414
Name: Study Hours Per Week, Length: 373, dtype: float64, 
Z-Statistic for group2 : 0     -1.086305
3     -1.142380
5     -0.076941
6      0.091286
7     -0.918077
         ...   
746    0.539892
749   -0.749850
751   -1.590986
753    0.595968
758   -

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import ttest_1samp, zscore, chi2_contingency  # Changed ztest to zscore
from scipy import stats #added for ztest

# Load dataset
file_path = "/content/Student_Mental_Stress_and_Coping_Mechanisms.csv"
df = pd.read_csv(file_path)

# Drop 'Student ID' as it's irrelevant for modeling
df.drop(columns=['Student ID'], inplace=True)

# Encode categorical variables
categorical_columns = ['Gender', 'Counseling Attendance', 'Stress Coping Mechanisms',
                       'Family Mental Health History', 'Medical Condition']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Extract features and target variable
X = df.drop(columns=['Mental Stress Level'])
y = df['Mental Stress Level']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Train and evaluate models
performance_metrics = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    performance_metrics[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='weighted'),
        "Recall": recall_score(y_test, y_pred, average='weighted'),
        "F1 Score": f1_score(y_test, y_pred, average='weighted')
    }

# Convert metrics to DataFrame
performance_df = pd.DataFrame(performance_metrics).T
print("Model Performance:")
print(performance_df)

# Feature Importance using Random Forest
rf_model = models["Random Forest"]
feature_importances = pd.Series(rf_model.feature_importances_, index=X_scaled_df.columns)
top_features = feature_importances.sort_values(ascending=False).head(3)
print("\nTop 3 Most Important Features:")
print(top_features)

# One-Sample T-Test
hypothesized_mean = 5
t_stat, p_value_ttest = ttest_1samp(df['Mental Stress Level'], hypothesized_mean)
print("\nT-Test Results:")
print(f"T-Statistic: {t_stat}, P-Value: {p_value_ttest}")

# Z-Test for Study Hours between Counseling Attendees and Non-Attendees
group1 = df[df['Counseling Attendance'] == 1]['Study Hours Per Week']
group2 = df[df['Counseling Attendance'] == 0]['Study Hours Per Week']
# Changed to stats.ttest_ind as ztest is deprecated for two samples
t_stat, p_value_ztest = stats.ttest_ind(group1, group2)
print("\nZ-Test Results (using t-test for two independent samples):")
print(f"T-Statistic: {t_stat}, P-Value: {p_value_ztest}")

# Chi-Square Test for Gender vs. Counseling Attendance
contingency_table = pd.crosstab(df['Gender'], df['Counseling Attendance'])
chi2_stat, p_value_chi2, _, _ = chi2_contingency(contingency_table)
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2_stat}, P-Value: {p_value_chi2}")

Model Performance:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.046053   0.050728  0.046053  0.047682
Random Forest        0.078947   0.079547  0.078947  0.076213
SVM                  0.065789   0.059928  0.065789  0.059170

Top 3 Most Important Features:
Study Hours Per Week                  0.097103
Age                                   0.076409
Physical Exercise (Hours per week)    0.074914
dtype: float64

T-Test Results:
T-Statistic: 3.4488698737224532, P-Value: 0.0005939960520346076

Z-Test Results (using t-test for two independent samples):
T-Statistic: 1.1762342138083035, P-Value: 0.23987056399610435

Chi-Square Test Results:
Chi-Square Statistic: 8.910304885158673, P-Value: 0.25916012793906545
