
#Mount the drive



In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


#Load the dataset

In [None]:
import pandas as pd

# Update the file path if it's in a specific folder, e.g., '/content/drive/MyDrive/folder_name/Data.csv'
data = pd.read_csv('/content/drive/MyDrive/final_thesis_dataset.csv')

# Display the first few rows to confirm it loaded correctly
data.head()


Unnamed: 0,Timestamp,Gender,Academic year,Name of my University,Religion,Relationship status,CGPA (4.00 Scale),Experience of violence,Residence,Satisfied with result,...,Q51,Q52,Q53,Q54,Q55,Q56,Q57,Anxiety_sum,Anxiety_level,Anxiety
0,28/06/2024 11:17:51,Male,Graduate (স্নাতক),University of Barishal,Islam,Unmarried,greater than3.50,zero,With family,Both,...,3,2,2,2,3,2,3,74,Mild,Yes
1,28/06/2024 11:22:58,Male,Graduate (স্নাতক),Bu,Islam,Unmarried,3.00 to 3.50,zero,In a hostel/hall/other,Both,...,3,5,3,5,4,5,5,136,Severely Moderate,Yes
2,28/06/2024 11:25:02,Male,Graduate (স্নাতক),CUET,Islam,Unmarried,less than3.00,zero,In a hostel/hall/other,Both,...,3,3,2,3,2,2,1,84,Mild,Yes
3,28/06/2024 12:03:22,Male,Graduate (স্নাতক),university of Barishal,Islam,In a complex relationship,3.00 to 3.50,Verbal,With family,zero,...,5,4,2,5,4,4,2,125,Severely Moderate,Yes
4,28/06/2024 12:12:15,Female,Graduate (স্নাতক),University of Barisal,Islam,Unmarried,3.00 to 3.50,zero,In a hostel/hall/other,zero,...,1,2,2,2,2,2,4,76,Mild,Yes


#Handling missing values

In [None]:
# Drop rows with any null values
data_cleaned = data.dropna()

# Display the number of rows remaining after removing null values
remaining_data_count = data_cleaned.shape[0]
print(f"Number of rows after removing null values: {remaining_data_count}")



# Find rows with any null values
rows_with_nulls = data[data.isnull().any(axis=1)]

# Display row indices of rows with null values
print("Indices of rows with null values:")
print(rows_with_nulls.index.tolist())

# Optionally, display the first few columns of rows with null values for context
# You can adjust the column range if needed
rows_with_nulls.iloc[:, :10]


Number of rows after removing null values: 659
Indices of rows with null values:
[]


Unnamed: 0,Timestamp,Gender,Academic year,Name of my University,Religion,Relationship status,CGPA (4.00 Scale),Experience of violence,Residence,Satisfied with result


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np

# Assuming data_cleaned is your cleaned DataFrame with no NaN values
y = data_cleaned['Depression_level'].map({
    'Minimal': 0,
    'Mild': 1,
    'Moderate': 2,
    'Severely Moderate': 3,
    'Severe Depression': 4
})

# Select features from Q26 onwards, assuming Q26-Q57 are columns 25 to end
X = data_cleaned.iloc[:, 25:]

# Ensure X and y are aligned and have no NaN values
X = X[y.notna()]
y = y.dropna()

# One-hot encode categorical variables if necessary
X_encoded = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Define base models for stacking
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(kernel='linear', probability=True, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
nb = GaussianNB()
mlp = MLPClassifier(random_state=42, max_iter=1000)

# Define the stacking ensemble with Logistic Regression as the meta model
base_estimators = [
    ('Random Forest', rf),
    ('SVM', svm),
    ('KNN', knn),
    ('Naive Bayes', nb),
    ('MLP', mlp)
]
meta_model = LogisticRegression()

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=5
)

# Train and evaluate the stacking model
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)

# Display the classification report and accuracy
print("--- Stacking Model ---")
print(classification_report(y_test, y_pred_stack))
print(f"Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}\n")

# Calculate and output specificity for each class
conf_matrix = confusion_matrix(y_test, y_pred_stack)
specificities = []
for i in range(len(conf_matrix)):
    tn = np.sum(np.delete(np.delete(conf_matrix, i, axis=0), i, axis=1))
    fp = np.sum(conf_matrix[:, i]) - conf_matrix[i, i]
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    specificities.append(specificity)

print(f"Specificity for each class: {specificities}\n")


--- Stacking Model ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        15
           2       1.00      1.00      1.00        35
           3       1.00      1.00      1.00        30
           4       1.00      1.00      1.00        38

    accuracy                           1.00       132
   macro avg       1.00      1.00      1.00       132
weighted avg       1.00      1.00      1.00       132

Accuracy: 1.0000

Specificity for each class: [1.0, 1.0, 1.0, 1.0, 1.0]



#feature extraction(10)

In [None]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Prepare data
X = data_cleaned.iloc[:, 1:26]  # Q1-Q25 are in the first 25 columns
y = data_cleaned['Depression_level']

# Map depression levels to numeric values
y = y.map({'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'Severely Moderate': 3, 'Severe Depression': 4})

# Drop rows with NaN values in y
X = X[y.notna()]
y = y.dropna()

# One-hot encoding for categorical variables
X_encoded = pd.get_dummies(X, drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Lasso Regression for feature selection
lasso = Lasso(alpha=0.1)
lasso.fit(X_scaled, y)

# Get importance
importance = np.abs(lasso.coef_)
feature_names = X_encoded.columns  # Adjust based on encoded columns

# Create DataFrame of feature importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})

# Filter to include only Q1 to Q25 features
# (Assuming feature names start with 'Q' followed by numbers)
top_features = importance_df[importance_df['Feature'].str.match(r'^Q[1-9]|^Q1[0-9]|^Q2[0-5]')]
top_features = top_features.sort_values(by='Importance', ascending=False).head(15)

print(top_features)


   Feature  Importance
8       Q9    0.233922
12     Q13    0.219739
11     Q12    0.199807
1       Q2    0.111036
2       Q3    0.095813
9      Q10    0.092045
7       Q8    0.089864
10     Q11    0.076483
6       Q7    0.067593
3       Q4    0.054080
4       Q5    0.048554
5       Q6    0.044054
0       Q1    0.042672


In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

# Select top 20 features from Lasso importance
selected_features = top_features['Feature'].values
X_top_features = X_encoded[selected_features]

# Split data based on selected features
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.2, random_state=42)

# Define base models
estimators = [
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('SVM', SVC(kernel='linear', probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('MLP', MLPClassifier(random_state=42, max_iter=1000))
]

# Define the StackingClassifier with a meta-classifier (Logistic Regression)
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),  # Meta-classifier
    cv=5  # Number of cross-validation folds for stacking
)

# Train the Stacking model
stacking_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = stacking_model.predict(X_test)

# Output the classification report
print("--- Stacking Model ---")
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")

# Calculate specificity for each class
conf_matrix = confusion_matrix(y_test, y_pred)
specificities = []
for i in range(len(conf_matrix)):
    tn = np.sum(np.delete(np.delete(conf_matrix, i, axis=0), i, axis=1))
    fp = np.sum(conf_matrix[:, i]) - conf_matrix[i, i]
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    specificities.append(specificity)

print(f"Specificity for each class: {specificities}\n")


--- Stacking Model ---
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.69      0.73      0.71        15
           2       0.78      0.80      0.79        35
           3       0.79      0.87      0.83        30
           4       1.00      0.89      0.94        38

    accuracy                           0.85       132
   macro avg       0.85      0.84      0.85       132
weighted avg       0.86      0.85      0.85       132

Accuracy: 0.8485

Specificity for each class: [1.0, 0.9572649572649573, 0.9175257731958762, 0.9313725490196079, 1.0]



#Anxiety section

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Prepare data
X_anxiety = data_cleaned.iloc[:, 25:57]  # Selecting relevant features
y_anxiety = data_cleaned['Anxiety_level'].map({
    'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'Severely Moderate': 3, 'Severe Anxiety': 4
})

# Drop rows with NaN in y if any
X_anxiety = X_anxiety[y_anxiety.notna()]
y_anxiety = y_anxiety.dropna()

# One-hot encode categorical columns if needed
X_encoded_anxiety = pd.get_dummies(X_anxiety, drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled_anxiety = scaler.fit_transform(X_encoded_anxiety)

# Split data
X_train_anxiety, X_test_anxiety, y_train_anxiety, y_test_anxiety = train_test_split(
    X_scaled_anxiety, y_anxiety, test_size=0.2, random_state=42
)

# Define base models
estimators = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(kernel='linear', probability=True, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('MLP', MLPClassifier(max_iter=300, random_state=42))
]

# Define the stacking ensemble with Logistic Regression as meta-classifier
stacking_model_anxiety = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),  # Meta-classifier
    cv=5  # Number of cross-validation folds for stacking
)

# Train the stacking model
stacking_model_anxiety.fit(X_train_anxiety, y_train_anxiety)

# Predictions and evaluation
y_pred_anxiety = stacking_model_anxiety.predict(X_test_anxiety)
print("--- Stacking Model for Anxiety ---")
print(classification_report(y_test_anxiety, y_pred_anxiety))
print(f"Accuracy: {accuracy_score(y_test_anxiety, y_pred_anxiety):.4f}\n")

# Calculate specificity for each class
conf_matrix = confusion_matrix(y_test_anxiety, y_pred_anxiety)
metrics_list = []

for i, label in enumerate(np.unique(y_anxiety.dropna())):
    tn = np.sum(np.delete(np.delete(conf_matrix, i, axis=0), i, axis=1))
    fp = np.sum(conf_matrix[:, i]) - conf_matrix[i, i]
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    metrics_list.append({
        "Class": label,
        "Specificity": specificity
    })

# Create and display specificity table
specificity_table = pd.DataFrame(metrics_list)
print(specificity_table)




--- Stacking Model for Anxiety ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.85      0.65      0.73        17
           2       0.70      0.74      0.72        43
           3       0.68      0.64      0.66        33
           4       0.87      0.94      0.91        36

    accuracy                           0.77       132
   macro avg       0.82      0.79      0.80       132
weighted avg       0.77      0.77      0.76       132

Accuracy: 0.7652

   Class  Specificity
0      0     1.000000
1      1     0.982609
2      2     0.842697
3      3     0.898990
4      4     0.947917




#feature(20)

In [None]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Prepare data
# Select features from Q26 to Q57 excluding Depression columns
X_anxiety_full = data_cleaned.iloc[:, 25:57]  # Adjust this to ensure it selects only Q26-Q57

# Exclude Depression related columns from the selection
# Drop any columns that contain 'Depression' in their name
X_anxiety_full = X_anxiety_full.loc[:, ~X_anxiety_full.columns.str.contains('Depression')]

# Create target variable for anxiety level
y_anxiety_full = data_cleaned['Anxiety_level'].map({'Minimal': 0, 'Mild': 1, 'Moderate': 2, 'Severely Moderate': 3, 'Severe Anxiety': 4})

# Drop rows with NaN in y if any
X_anxiety_full = X_anxiety_full[y_anxiety_full.notna()]
y_anxiety_full = y_anxiety_full.dropna()

# Proceed with one-hot encoding and scaling as before
X_anxiety_encoded = pd.get_dummies(X_anxiety_full, drop_first=True)

# Scale features
scaler = StandardScaler()
X_anxiety_scaled = scaler.fit_transform(X_anxiety_encoded)

# Split data
X_train_anxiety_full, X_test_anxiety_full, y_train_anxiety_full, y_test_anxiety_full = train_test_split(X_anxiety_scaled, y_anxiety_full, test_size=0.2, random_state=42)

# Lasso Regression for feature selection
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_anxiety_full, y_train_anxiety_full)

# Get importance
importance_anxiety = np.abs(lasso.coef_)
feature_names_anxiety = X_anxiety_encoded.columns

# Create DataFrame of feature importances
importance_df_anxiety = pd.DataFrame({'Feature': feature_names_anxiety, 'Importance': importance_anxiety})
top_features_anxiety = importance_df_anxiety.sort_values(by='Importance', ascending=False).head(20)

print(top_features_anxiety)



   Feature  Importance
28     Q41    0.196945
27     Q40    0.162284
22     Q35    0.125618
23     Q36    0.122380
13     Q26    0.088635
25     Q38    0.073178
14     Q27    0.070971
21     Q34    0.057099
20     Q33    0.051611
17     Q30    0.047334
16     Q29    0.043147
19     Q32    0.042647
18     Q31    0.038907
26     Q39    0.027872
6      Q19    0.019640
2      Q15    0.000000
3      Q16    0.000000
24     Q37    0.000000
4      Q17    0.000000
5      Q18    0.000000


In [None]:
!pip install --upgrade scikit-learn




In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Prepare data
# Select features from Q26 to Q57 excluding Depression columns
X_anxiety_full = data_cleaned.iloc[:, 25:57]  # Adjust to ensure it selects only Q26-Q57
X_anxiety_full = X_anxiety_full.loc[:, ~X_anxiety_full.columns.str.contains('Depression')]

# Create target variable for anxiety level
y_anxiety_full = data_cleaned['Anxiety_level'].map({
    'Minimal': 0,
    'Mild': 1,
    'Moderate': 2,
    'Severely Moderate': 3,
    'Severe Anxiety': 4
})

# Drop rows with NaN in y if any
X_anxiety_full = X_anxiety_full[y_anxiety_full.notna()]
y_anxiety_full = y_anxiety_full.dropna()

# One-hot encode categorical columns
X_anxiety_encoded = pd.get_dummies(X_anxiety_full, drop_first=True)

# Scale features
scaler = StandardScaler()
X_anxiety_scaled = scaler.fit_transform(X_anxiety_encoded)

# Split data into train and test sets
X_train_anxiety_full, X_test_anxiety_full, y_train_anxiety_full, y_test_anxiety_full = train_test_split(
    X_anxiety_scaled, y_anxiety_full, test_size=0.2, random_state=42
)

# Lasso Regression for feature selection
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_anxiety_full, y_train_anxiety_full)

# Get feature importance
importance_anxiety = np.abs(lasso.coef_)
feature_names_anxiety = X_anxiety_encoded.columns

# Create DataFrame of feature importances
importance_df_anxiety = pd.DataFrame({'Feature': feature_names_anxiety, 'Importance': importance_anxiety})
top_features_anxiety = importance_df_anxiety.sort_values(by='Importance', ascending=False).head(20)

# Prepare data with only top features
X_top20_anxiety = X_anxiety_encoded[top_features_anxiety['Feature']]
X_train_top20_anxiety, X_test_top20_anxiety, y_train_top20_anxiety, y_test_top20_anxiety = train_test_split(
    X_top20_anxiety, y_anxiety_full, test_size=0.2, random_state=42
)

# Define base models for stacking
rf = RandomForestClassifier(random_state=42)
svm = SVC(probability=True, random_state=42)  # Enable probability estimates for stacking
knn = KNeighborsClassifier()
nb = GaussianNB()

# Create a stacking classifier
base_estimators = [
    ('Random Forest', rf),
    ('SVM', svm),
    ('KNN', knn),
    ('Naive Bayes', nb)
]

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=MLPClassifier(max_iter=300, random_state=42)
)

# Train the stacking model
stacking_model.fit(X_train_top20_anxiety, y_train_top20_anxiety)

# Make predictions with the stacking model
y_pred_stacking_top20_anxiety = stacking_model.predict(X_test_top20_anxiety)

# Display the classification report for the stacking model
print("--- Stacking Model ---")
print(classification_report(y_test_top20_anxiety, y_pred_stacking_top20_anxiety))

# Calculate specificity for each class in the stacking model
conf_matrix_stacking = confusion_matrix(y_test_top20_anxiety, y_pred_stacking_top20_anxiety)
specificities_stacking = []
for i in range(len(conf_matrix_stacking)):
    tn = np.sum(np.delete(np.delete(conf_matrix_stacking, i, axis=0), i, axis=1))
    fp = np.sum(conf_matrix_stacking[:, i]) - conf_matrix_stacking[i, i]
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    specificities_stacking.append(specificity)

print(f"Specificity for each class in stacking model: {specificities_stacking}\n")




--- Stacking Model ---
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.91      0.59      0.71        17
           2       0.70      0.77      0.73        43
           3       0.66      0.64      0.65        33
           4       0.89      0.94      0.92        36

    accuracy                           0.77       132
   macro avg       0.78      0.79      0.77       132
weighted avg       0.77      0.77      0.76       132

Specificity for each class in stacking model: [0.9922480620155039, 0.991304347826087, 0.8426966292134831, 0.8888888888888888, 0.9583333333333334]

