<h1 style="color: brown;">1. Importing Libraries and Configuring Display Options</h1>

In [1]:
# Standard library
import time
from itertools import product

# Data manipulation and visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Numerical helpers (score -> probability conversions)
from scipy.special import expit, softmax

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Evaluation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate

# Notebook-wide pandas display settings:
#   - max_columns=None  -> never elide columns when a frame is rendered
#   - float_format      -> render every float with exactly three decimals
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format

<h1 style="color: brown;">2. Exploring Dataset</h1>

<h2 style="color: purple;">2.1 Loading Mental Health Dataset</h2>

In [2]:
# Load the survey data; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='mentalhealth_dataset.csv',
)

<h2 style="color: purple;">2.2 Displaying Dataset Dimensions</h2> 

In [3]:
# df.shape is (row count, column count); name both parts before reporting them.
n_rows, n_cols = df.shape[0], df.shape[1]
print(f'The data has {n_rows} rows and {n_cols} columns.')

The data has 1000 rows and 16 columns.


<h2 style="color: purple;">2.3 Displaying All Column Names</h2>

In [4]:
# Column labels of df, kept in their original order
all_column_names = df.columns

# Print them as a list numbered from 1
print("All Columns:")
for position in range(len(all_column_names)):
    print(f"{position + 1}. {all_column_names[position]}")


All Columns:
1. Timestamp
2. Gender
3. Age
4. Course
5. YearOfStudy
6. CGPA
7. Depression
8. Anxiety
9. PanicAttack
10. SpecialistTreatment
11. SymptomFrequency_Last7Days
12. HasMentalHealthSupport
13. SleepQuality
14. StudyStressLevel
15. StudyHoursPerWeek
16. AcademicEngagement


<h2 style="color: purple;">2.4 Identifying Numeric and Non-Numeric Features</h2>

In [5]:
# Partition the column labels by dtype: numeric vs. everything else.
numeric_column_names = df.select_dtypes(include='number').columns
non_numeric_column_names = df.select_dtypes(exclude='number').columns

# Section heading -> list of column labels belonging to that section
columns_dict = {
    "Numeric Columns": list(numeric_column_names),
    "Non-Numeric Columns": list(non_numeric_column_names),
}

# One numbered section per dtype group; each entry also reports how many
# distinct values the column holds.
for category in columns_dict:
    print(f"{category}:")
    position = 0
    for name in columns_dict[category]:
        position += 1
        distinct = df[name].nunique()
        print(f"  {position}. {name} (Unique Values: {distinct})")
    print()  # blank line between sections


Numeric Columns:
  1. Age (Unique Values: 8)
  2. CGPA (Unique Values: 187)
  3. Depression (Unique Values: 2)
  4. Anxiety (Unique Values: 2)
  5. PanicAttack (Unique Values: 2)
  6. SpecialistTreatment (Unique Values: 2)
  7. SymptomFrequency_Last7Days (Unique Values: 8)
  8. HasMentalHealthSupport (Unique Values: 2)
  9. SleepQuality (Unique Values: 5)
  10. StudyStressLevel (Unique Values: 5)
  11. StudyHoursPerWeek (Unique Values: 19)
  12. AcademicEngagement (Unique Values: 5)

Non-Numeric Columns:
  1. Timestamp (Unique Values: 16)
  2. Gender (Unique Values: 2)
  3. Course (Unique Values: 49)
  4. YearOfStudy (Unique Values: 7)



<h2 style="color: purple;">2.5 Displaying First 5 Rows of the Dataset</h2>  

In [6]:
# Make sure no column is elided, then preview the first five rows.
pd.options.display.max_columns = None
df.head(n=5)

Unnamed: 0,Timestamp,Gender,Age,Course,YearOfStudy,CGPA,Depression,Anxiety,PanicAttack,SpecialistTreatment,SymptomFrequency_Last7Days,HasMentalHealthSupport,SleepQuality,StudyStressLevel,StudyHoursPerWeek,AcademicEngagement
0,13/7/2020,Female,24,Biotechnology,Year 3,2.38,1,0,0,0,5,0,4,5,8,2
1,13/7/2020,Female,18,Biotechnology,Year 3,4.0,0,1,0,0,0,0,4,4,13,5
2,13/7/2020,Female,25,Biotechnology,Year 3,3.68,0,0,1,0,3,0,1,2,13,1
3,13/7/2020,Female,18,Engineering,year 4,4.0,0,0,0,0,3,0,5,1,19,2
4,13/7/2020,Female,20,Engineering,year 4,2.0,1,1,0,0,0,0,2,4,3,2


<h2 style="color: purple;">2.6 Checking for Missing Data</h2>  

In [7]:
# Per-column null counts, summed again to get one grand total for the frame.
missing_per_column = df.isnull().sum()
print(f'Total number of missing values: {missing_per_column.sum()}')

Total number of missing values: 0


<h2 style="color: purple;">2.7 Checking for Duplicate Data</h2> 

In [8]:
# duplicated() flags every row that repeats an earlier row in full.
duplicate_row_count = df.duplicated().sum()
print(f'There are {duplicate_row_count} duplicate rows in the dataset.')

There are 0 duplicate rows in the dataset.


<h2 style="color: purple;">2.8 Summary Statistics for Numeric Columns</h2> 

In [9]:
# Three-decimal float rendering keeps the summary table compact.
pd.options.display.float_format = '{:.3f}'.format

# Descriptive statistics (count, mean, std, quartiles, ...) for numeric columns only.
numeric_columns = df.select_dtypes(include='number')
numeric_columns.describe()

Unnamed: 0,Age,CGPA,Depression,Anxiety,PanicAttack,SpecialistTreatment,SymptomFrequency_Last7Days,HasMentalHealthSupport,SleepQuality,StudyStressLevel,StudyHoursPerWeek,AcademicEngagement
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,21.402,3.123,0.483,0.474,0.458,0.067,3.498,0.067,2.983,3.045,9.746,3.055
std,2.374,0.811,0.5,0.5,0.498,0.25,2.308,0.25,1.418,1.417,5.651,1.423
min,18.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,19.0,2.25,0.0,0.0,0.0,0.0,1.75,0.0,2.0,2.0,5.0,2.0
50%,21.0,3.25,0.0,0.0,0.0,0.0,3.0,0.0,3.0,3.0,9.0,3.0
75%,24.0,4.0,1.0,1.0,1.0,0.0,6.0,0.0,4.0,4.0,15.0,4.0
max,25.0,4.0,1.0,1.0,1.0,1.0,7.0,1.0,5.0,5.0,19.0,5.0


<h2 style="color: purple;">2.9 Summary Statistics for Categorical Columns</h2> 

In [10]:
# Summary (count / unique / top / freq) of every column that is not numeric.
print("Non-Numeric Columns Summary:")
non_numeric_columns = df.select_dtypes(exclude='number')
non_numeric_columns.describe(include='all')

Non-Numeric Columns Summary:


Unnamed: 0,Timestamp,Gender,Course,YearOfStudy
count,1000,1000,1000,1000
unique,16,2,49,7
top,2021-07-08,Female,Engineering,year 1
freq,190,760,180,390


<h2 style="color: purple;">2.10 Displaying Frequency of Categorical Data</h2> 

In [11]:
# For each non-numeric column, print its distinct values as a numbered list,
# most frequent first (value_counts sorts by descending frequency).
for col in non_numeric_column_names:
    frequencies = df[col].value_counts()

    print(f"\nValue counts for {col}:")
    rank = 0
    for value, count in frequencies.items():
        rank += 1
        print(f"{rank}. {value}: {count}")
    print("-" * 50)



Value counts for Timestamp:
1. 2021-07-08: 190
2. 2020-07-08: 183
3. 2022-07-08: 176
4. 2023-07-08: 176
5. 13/7/2020: 58
6. 13/7/2023: 57
7. 13/7/2022: 54
8. 13/7/2021: 52
9. 2020-07-09: 14
10. 2022-07-09: 14
11. 2021-07-09: 9
12. 2023-07-09: 9
13. 18/7/2023: 4
14. 18/7/2022: 2
15. 18/7/2020: 1
16. 18/7/2021: 1
--------------------------------------------------

Value counts for Gender:
1. Female: 760
2. Male: 240
--------------------------------------------------

Value counts for Course:
1. Engineering: 180
2. BCS: 177
3. BIT: 101
4. KOE: 39
5. Biomedical science: 33
6. Engine: 19
7. Laws: 19
8. psychology: 17
9. BENL: 16
10. CTS: 15
11. Business Administration: 14
12. Koe: 14
13. engin: 14
14. Human Sciences : 13
15. Nursing : 13
16. Law: 13
17. Communication : 13
18. Marine science: 12
19. Psychology: 12
20. Kirkhs: 12
21. Malcom: 12
22. Pendidikan Islam : 12
23. Accounting : 11
24. DIPLOMA TESL: 11
25. Usuluddin : 11
26. Fiqh: 11
27. KIRKHS: 10
28. Irkhs: 10
29. Pendidikan islam:

<h1 style="color: brown;">3. Preprocessing</h1>

<h2 style="color: purple;">3.1 Dropping Unnecessary Columns</h2>


In [12]:
# Remove the two columns that are not used as model features.
# NOTE(review): presumably dropped because Timestamp mixes date formats and
# Course has many inconsistent spellings (see the value counts above) — confirm.
df = df.drop(['Timestamp', 'Course'], axis=1)

<h2 style="color: purple;">3.2 Cleaning and Converting 'YearOfStudy' to Integer</h2> 


In [13]:
# Reduce labels such as "Year 3" / "year 4" to the integer they contain.
#   - raw string r'(\d+)': '\d' in a normal string literal is an invalid escape
#     sequence and makes Python emit a warning.
#   - expand=False: return a Series (one capture group) rather than a
#     one-column DataFrame.
#   - astype(str) first: keeps the cell re-runnable after the column has
#     already been converted to integers.
# Capitalisation is irrelevant to digit extraction, so no case normalisation is needed.
df['YearOfStudy'] = (
    df['YearOfStudy']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

  df['YearOfStudy'] = df['YearOfStudy'].str.title().str.extract('(\d+)').astype(int)


<h2 style="color: purple;">3.3 Applying Label Encoding to 'Gender' Column</h2>


In [14]:
# Replace the Gender strings with integer codes assigned by LabelEncoder.
label_encoder = LabelEncoder()
encoded_gender = label_encoder.fit_transform(df['Gender'])
df['Gender'] = encoded_gender

# Recover the learned class -> code mapping so the encoding is visible to the reader.
known_classes = label_encoder.classes_
known_codes = label_encoder.transform(known_classes)
label_mapping = {cls: code for cls, code in zip(known_classes, known_codes)}
print("Label Encoding Mapping:", label_mapping)

Label Encoding Mapping: {'Female': 0, 'Male': 1}


<h1 style="color: brown;">4. Development and Assessment of Models</h1> 

<h2 style="color: purple;">4.1 Splitting Features and Target Variable</h2>


In [15]:
# Target: the binary Depression flag. Features: every remaining column.
y = df['Depression']
X = df.drop(['Depression'], axis=1)

<h2 style="color: purple;">4.2 Splitting Data into Training and Test Sets</h2> 


In [16]:
# Hold out 30% of the rows for final testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<h2 style="color: purple;">4.3 Scaling Features using StandardScaler</h2>

In [17]:
# Learn the scaling statistics from the training rows only, then apply the same
# transform to both splits so nothing from the test set influences the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h1 style="color: brown;">5. Evaluating Models and Measuring Performance</h1>  

<h2 style="color: purple;">5.1 Finding the Best Model</h2> 

In [18]:
# Candidate classifiers; each is fitted on the scaled training split and scored
# once on the scaled test split.
model_list = [
    LogisticRegression(max_iter=10000, random_state=42),
    DecisionTreeClassifier(ccp_alpha=0.001, random_state=42),
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(algorithm='SAMME', random_state=42),
    SVC(probability=True, random_state=42),  # probability=True needed for log_loss
    GaussianNB(),
    KNeighborsClassifier(),
]


def _scores_to_proba(decision_scores):
    """Turn decision_function output into an (n_samples, n_classes) array of
    pseudo-probabilities usable by log_loss.

    A 1-D score vector (binary problem) is squashed with the logistic function
    and expanded to two columns; a 2-D score matrix is normalised row-wise with
    softmax. These values are not calibrated probabilities.
    """
    decision_scores = np.asarray(decision_scores)
    if decision_scores.ndim == 1:
        positive = expit(decision_scores)
        return np.column_stack([1.0 - positive, positive])
    return softmax(decision_scores, axis=1)


results = []

for model in model_list:
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)

    # log_loss needs class probabilities; fall back to transformed decision
    # scores for estimators that do not implement predict_proba.
    if hasattr(model, "predict_proba"):
        preds_proba = model.predict_proba(X_test_scaled)
    elif hasattr(model, "decision_function"):
        preds_proba = _scores_to_proba(model.decision_function(X_test_scaled))
    else:
        preds_proba = None

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds, average='macro')
    prec = precision_score(y_test, preds, average='macro')
    rec = recall_score(y_test, preds, average='macro')

    # NaN marks "log loss not available" for models with no probability output
    ll = log_loss(y_test, preds_proba) if preds_proba is not None else np.nan

    results.append((model.__class__.__name__, acc, f1, prec, rec, ll))

# One row per model, best accuracy first
df_results = pd.DataFrame(
    results,
    columns=["Model", "Accuracy", "F1_macro", "Precision_macro", "Recall_macro", "Log_loss"],
)
df_results = df_results.sort_values(by="Accuracy", ascending=False)

print(df_results)

                        Model  Accuracy  F1_macro  Precision_macro  \
0          LogisticRegression     0.577     0.576            0.577   
4          AdaBoostClassifier     0.577     0.567            0.584   
3  GradientBoostingClassifier     0.553     0.549            0.555   
1      DecisionTreeClassifier     0.537     0.536            0.537   
2      RandomForestClassifier     0.507     0.497            0.507   
5                         SVC     0.503     0.497            0.504   
6                  GaussianNB     0.500     0.466            0.500   
7        KNeighborsClassifier     0.467     0.465            0.466   

   Recall_macro  Log_loss  
0         0.577     0.674  
4         0.577     0.685  
3         0.553     0.725  
1         0.537    16.700  
2         0.507     0.724  
5         0.503     0.698  
6         0.500     0.918  
7         0.467     1.850  


In [19]:
# Row of the comparison table with the highest test accuracy
# (idxmax returns the first such row when several tie).
top_index = df_results['Accuracy'].idxmax()
best_row = df_results.loc[top_index]

best_model_name, best_accuracy = best_row['Model'], best_row['Accuracy']

print(f"Best model based on accuracy: {best_model_name} with accuracy = {best_accuracy:.4f}")

Best model based on accuracy: LogisticRegression with accuracy = 0.5767


<h2 style="color: purple;">5.2 Manual Grid Search for Best Hyperparameters</h2>  

In [20]:
# Split training data into sub-train and validation sets
X_subtrain, X_val, y_subtrain, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Define grid with default values included
param_grid = {
    "penalty": ["l2", "none"],  # default is "l2"
    "C": [1.0, 0.1, 10],        # default is 1.0
    "solver": ["lbfgs", "liblinear", "saga"],  # default is "lbfgs"
}

# Manual grid search
best_score = -1
best_params = None

for combo in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combo))
    
    # Skip invalid combinations (e.g., 'liblinear' doesn't support 'none' penalty)
    try:
        model = LogisticRegression(max_iter=10000, random_state=42, **params)
        model.fit(X_subtrain, y_subtrain)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        
        if acc > best_score:
            best_score = acc
            best_params = params
    except Exception as e:
        # Skip invalid parameter combinations
        continue

print("Best hyperparameters from manual grid search:", best_params)
print("Validation accuracy with best params:", best_score)


Best hyperparameters from manual grid search: {'penalty': 'l2', 'C': 10, 'solver': 'lbfgs'}
Validation accuracy with best params: 0.5928571428571429


<h2 style="color: purple;">5.3 Cross-Validation Evaluation</h2>   

In [21]:
# Build the final model from the hyperparameters selected by the manual grid
# search (best_params), so this cell cannot drift from the search result.
final_model = LogisticRegression(**best_params, max_iter=10000, random_state=42)

# Metrics to collect in each fold
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'log_loss': 'neg_log_loss'  # note: sklearn returns neg log loss
}

# 5-fold cross-validation on the scaled training split
cv_results = cross_validate(final_model, X_train_scaled, y_train, cv=5, scoring=scoring)

# Per-fold score arrays; cross_validate prefixes each scoring key with 'test_'
accuracy_scores = cv_results['test_accuracy']
f1_scores = cv_results['test_f1_macro']
precision_scores = cv_results['test_precision_macro']
recall_scores = cv_results['test_recall_macro']
log_loss_scores = -cv_results['test_log_loss']  # flip sign back to a positive loss

# Print the per-fold values and their mean for every metric
for metric_label, fold_scores in [
    ("Accuracy", accuracy_scores),
    ("F1 (macro)", f1_scores),
    ("Precision (macro)", precision_scores),
    ("Recall (macro)", recall_scores),
    ("Log Loss", log_loss_scores),
]:
    print(f"CV {metric_label} scores:", fold_scores)
    print(f"Mean CV {metric_label}:", np.mean(fold_scores))


CV Accuracy scores: [0.60714286 0.61428571 0.55714286 0.6        0.6       ]
Mean CV Accuracy: 0.5957142857142858
CV F1 (macro) scores: [0.59810011 0.61230769 0.54969911 0.5986896  0.59166667]
Mean CV F1 (macro): 0.5900926349943423
CV Precision (macro) scores: [0.60882867 0.61327561 0.55508021 0.5986896  0.59862188]
Mean CV Precision (macro): 0.5948991949778544
CV Recall (macro) scores: [0.60181967 0.61234921 0.55264772 0.5986896  0.59377559]
Mean CV Recall (macro): 0.5918563588774178
CV Log Loss scores: [0.6575671  0.65470512 0.68439446 0.68193408 0.68777717]
Mean CV Log Loss: 0.6732755873087408


<h2 style="color: purple;">5.4 Final Model Training and Test Set Evaluation</h2>   

In [22]:
# Retrain on the full training data
final_model.fit(X_train_scaled, y_train)

# Predict on the test data
y_test_preds = final_model.predict(X_test_scaled)

# Evaluate
from sklearn.metrics import classification_report

print("Final Test Set Evaluation:\n")
print(classification_report(y_test, y_test_preds, digits=4))

Final Test Set Evaluation:

              precision    recall  f1-score   support

           0     0.5732    0.6267    0.5987       150
           1     0.5882    0.5333    0.5594       150

    accuracy                         0.5800       300
   macro avg     0.5807    0.5800    0.5791       300
weighted avg     0.5807    0.5800    0.5791       300

