# Import Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import chi2_contingency
from sklearn.feature_selection import chi2

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Mental Health Research/Fisher-PCA-Rfe/'
data = pd.read_csv(file_path + 'main_dataset_mental_health.csv')
data = data.drop(columns=["Depression Level", 'score'])
data.columns

Index(['Academic Pressure', 'Syllabus Size', 'Cooperation Teachers/Peers',
       'Satisfaction on Result', 'Hopefulness about Future',
       'Suicidal Thoughts (Academic Stress)', 'Surrounding Condition',
       'Food Quality', 'Relationship with Parents',
       'Relationships with Siblings', 'Financial Support to Family',
       'Receiving Financial Support', 'Family Pressure', 'Family Expectations',
       'Self Confidence Level', 'Communication Skill', 'Concentration Level',
       'Loneliness Level', 'Frequency of Crying', 'Time Spent on Social Media',
       'Number of Friends', 'Work-Personal Life Balance',
       'Overthinking Frequency', 'Getting Upset Easily',
       'Desire to Escape Reality', 'Frequency of Death Thoughts',
       'Responsibility for Negative Events', 'Label'],
      dtype='object')

In [4]:
data.shape[1]

28

# Feature Selection and Extraction

## Fisher

In [5]:
# Separate the features and class label
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [6]:
# Check for NaN values in the dataset
print(np.isnan(X_normalized).sum())
print(y.isnull().sum())

7
0


In [7]:
# Impute missing values (e.g., fill with 0)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_normalized = imputer.fit_transform(X_normalized)

In [8]:
# Check for NaN values in the dataset
print(np.isnan(X_normalized).sum())
print(y.isnull().sum())

0
0


In [9]:
# Calculate the Fisher's Score test statistic and p-values for each feature
f_scores, p_values = chi2(X_normalized, y)

# Rank the features based on their F-scores
sorted_indices = sorted(range(len(f_scores)), key=lambda i: f_scores[i], reverse=True)
feature_ranks = X.columns[sorted_indices]

# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks, 'F-Score': f_scores[sorted_indices], 'p-value': p_values[sorted_indices]})
ranked_features_df.to_csv('feature_ranks_fisher.csv', index=False)


# Rearrange the dataset based on the feature ranks
rearranged_data = data[feature_ranks.tolist() + ['Label']]

rearranged_data

Unnamed: 0,Suicidal Thoughts (Academic Stress),Desire to Escape Reality,Hopefulness about Future,Self Confidence Level,Frequency of Crying,Loneliness Level,Getting Upset Easily,Frequency of Death Thoughts,Relationships with Siblings,Responsibility for Negative Events,...,Communication Skill,Satisfaction on Result,Time Spent on Social Media,Academic Pressure,Receiving Financial Support,Food Quality,Syllabus Size,Financial Support to Family,Family Expectations,Label
0,1,5.0,5,4,3,4,2,1.0,1,4,...,3,5,2,5,1,1,4,1,5,1
1,1,1.0,1,1,1,4,2,1.0,2,1,...,1,5,4,3,1,4,4,1,5,0
2,3,1.0,2,2,3,3,4,3.0,1,4,...,2,3,3,5,3,5,5,1,5,1
3,1,4.0,3,3,1,2,2,1.0,1,3,...,5,4,5,4,1,1,5,1,5,1
4,1,4.0,2,3,4,4,5,4.0,2,5,...,2,5,4,3,1,5,3,1,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,1,3.0,2,2,2,5,5,1.0,1,3,...,3,5,4,5,1,1,5,3,1,1
310,1,1.0,3,2,1,1,4,2.0,2,3,...,3,3,3,4,2,1,4,5,5,1
311,2,3.0,5,3,5,5,3,3.0,1,5,...,3,5,3,5,1,3,5,3,3,1
312,1,4.0,2,2,4,4,5,5.0,1,3,...,3,1,4,3,1,3,3,1,5,1


In [10]:
ranked_features_df

Unnamed: 0,Feature,F-Score,p-value
0,Suicidal Thoughts (Academic Stress),43.639544,3.340361e-10
1,Desire to Escape Reality,33.077547,6.566015e-08
2,Hopefulness about Future,26.519763,1.743037e-06
3,Self Confidence Level,26.437422,1.816297e-06
4,Frequency of Crying,24.369551,5.107629e-06
5,Loneliness Level,23.457646,8.058177e-06
6,Getting Upset Easily,23.359572,8.463175e-06
7,Frequency of Death Thoughts,22.186157,1.521728e-05
8,Relationships with Siblings,20.258394,3.989749e-05
9,Responsibility for Negative Events,19.818779,4.970577e-05


In [11]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_fisher.csv', index=False)

## PCA

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [13]:
# Load the dataset
data_PCA = pd.read_csv('rearranged_data_fisher.csv')
data_PCA

Unnamed: 0,Suicidal Thoughts (Academic Stress),Desire to Escape Reality,Hopefulness about Future,Self Confidence Level,Frequency of Crying,Loneliness Level,Getting Upset Easily,Frequency of Death Thoughts,Relationships with Siblings,Responsibility for Negative Events,...,Communication Skill,Satisfaction on Result,Time Spent on Social Media,Academic Pressure,Receiving Financial Support,Food Quality,Syllabus Size,Financial Support to Family,Family Expectations,Label
0,1,5.0,5,4,3,4,2,1.0,1,4,...,3,5,2,5,1,1,4,1,5,1
1,1,1.0,1,1,1,4,2,1.0,2,1,...,1,5,4,3,1,4,4,1,5,0
2,3,1.0,2,2,3,3,4,3.0,1,4,...,2,3,3,5,3,5,5,1,5,1
3,1,4.0,3,3,1,2,2,1.0,1,3,...,5,4,5,4,1,1,5,1,5,1
4,1,4.0,2,3,4,4,5,4.0,2,5,...,2,5,4,3,1,5,3,1,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,1,3.0,2,2,2,5,5,1.0,1,3,...,3,5,4,5,1,1,5,3,1,1
310,1,1.0,3,2,1,1,4,2.0,2,3,...,3,3,3,4,2,1,4,5,5,1
311,2,3.0,5,3,5,5,3,3.0,1,5,...,3,5,3,5,1,3,5,3,3,1
312,1,4.0,2,2,4,4,5,5.0,1,3,...,3,1,4,3,1,3,3,1,5,1


In [14]:
# Separate the features and class label
X = data_PCA.iloc[:, :27]
y = data_PCA.iloc[:, -1]

# Normalize the features using StandardScaler
scaler = StandardScaler()

X_normalized = scaler.fit_transform(X)
X_normalized = X
X_normalized = X_normalized.fillna(0)

In [15]:
# Apply PCA to the normalized features
pca = PCA()
pca.fit(X_normalized)

In [16]:
# Calculate the variance of all principal components
variance = pca.explained_variance_ratio_

# Rank the principal components based on their variance
sorted_indices = sorted(range(len(variance)), key=lambda i: variance[i], reverse=True)

# Save the variance values of all principal components to a CSV file
variance_df = pd.DataFrame({'Principal Component': range(1, len(variance)+1), 'Variance': variance[sorted_indices]})
variance_df.to_csv('pca_variance.csv', index=False)

variance_df

Unnamed: 0,Principal Component,Variance
0,1,0.27179
1,2,0.070923
2,3,0.059331
3,4,0.048349
4,5,0.046607
5,6,0.044841
6,7,0.0413
7,8,0.038421
8,9,0.035808
9,10,0.032028


In [17]:
# Rearrange the dataset based on the variance of the principal components
X_transformed = pca.transform(X_normalized)[:, sorted_indices]
rearranged_data = pd.DataFrame(X_transformed, columns=['PCA ' + str(i+1) for i in range(X_transformed.shape[1])])
rearranged_data['Class'] = y

rearranged_data

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 19,PCA 20,PCA 21,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,Class
0,0.858683,-0.180556,0.581471,-2.968810,-0.711392,-2.281555,0.325962,1.217227,-1.873681,0.660209,...,-1.348916,-0.009683,-0.484969,-0.161070,-0.968913,-0.405107,1.355269,0.596170,-0.814424,1
1,-4.775670,0.951902,1.238174,0.132458,0.821940,-0.897392,-1.622403,1.784514,-0.375979,0.967598,...,1.207526,-0.818173,0.852012,-0.588487,0.338856,-0.334611,-0.310629,-0.260074,0.611944,0
2,-0.797512,2.506132,2.395035,1.265972,0.454961,-0.294006,-0.635710,-2.171679,-1.529983,0.424539,...,-0.301262,-0.523121,0.091659,0.103445,-0.564585,0.590561,-0.230430,-0.140762,0.258528,1
3,-1.695993,-2.355874,-1.858814,-1.730071,0.535378,-0.353564,2.689364,0.319821,-1.547758,-0.171142,...,1.848237,0.154009,1.560637,-0.241720,-0.776525,-0.638540,-0.964050,1.585483,-0.247960,1
4,3.090131,0.210064,-0.051429,2.042090,1.377784,1.102413,-1.300999,1.711703,-0.248604,-0.883575,...,0.091706,-0.661090,0.947121,0.373800,1.800076,1.055791,-0.427987,0.188862,-0.164317,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,-0.716034,0.156499,-2.099905,0.389739,-1.491896,-2.966669,0.936042,1.292133,-0.274947,-1.971603,...,0.375575,1.160426,-1.859343,0.564737,-1.774809,0.797940,-0.249875,0.960234,0.088656,1
310,-2.574612,0.681876,-0.776889,-1.743165,1.167514,-0.921519,1.992501,-2.283291,0.568167,-0.187915,...,-1.397232,1.222241,-0.603970,-0.485554,0.621489,-1.003151,0.148079,-0.741559,-0.127989,1
311,2.387283,0.925110,0.618235,0.568452,-2.080363,-0.847816,0.931340,0.342121,-1.948360,0.908450,...,0.302545,2.210988,-0.111399,-0.038817,-0.674023,-0.292276,0.385969,-0.025436,0.250600,1
312,-0.321714,2.630680,-2.756261,0.910438,0.278941,-0.159928,-1.879983,-0.451965,0.709125,-0.439537,...,0.367760,0.625096,-0.077731,-0.773670,0.309566,0.215507,-0.468401,-0.338167,-0.001359,1


In [18]:
# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_pca.csv', index=False)

## RFE

In [19]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [20]:
# Load the dataset
data_RFE = pd.read_csv('rearranged_data_pca.csv')
data_RFE

Unnamed: 0,PCA 1,PCA 2,PCA 3,PCA 4,PCA 5,PCA 6,PCA 7,PCA 8,PCA 9,PCA 10,...,PCA 19,PCA 20,PCA 21,PCA 22,PCA 23,PCA 24,PCA 25,PCA 26,PCA 27,Class
0,0.858683,-0.180556,0.581471,-2.968810,-0.711392,-2.281555,0.325962,1.217227,-1.873681,0.660209,...,-1.348916,-0.009683,-0.484969,-0.161070,-0.968913,-0.405107,1.355269,0.596170,-0.814424,1
1,-4.775670,0.951902,1.238174,0.132458,0.821940,-0.897392,-1.622403,1.784514,-0.375979,0.967598,...,1.207526,-0.818173,0.852012,-0.588487,0.338856,-0.334611,-0.310629,-0.260074,0.611944,0
2,-0.797512,2.506132,2.395035,1.265972,0.454961,-0.294006,-0.635710,-2.171679,-1.529983,0.424539,...,-0.301262,-0.523121,0.091659,0.103445,-0.564585,0.590561,-0.230430,-0.140762,0.258528,1
3,-1.695993,-2.355874,-1.858814,-1.730071,0.535378,-0.353564,2.689364,0.319821,-1.547758,-0.171142,...,1.848237,0.154009,1.560637,-0.241720,-0.776525,-0.638540,-0.964050,1.585483,-0.247960,1
4,3.090131,0.210064,-0.051429,2.042090,1.377784,1.102413,-1.300999,1.711703,-0.248604,-0.883575,...,0.091706,-0.661090,0.947121,0.373800,1.800076,1.055791,-0.427987,0.188862,-0.164317,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,-0.716034,0.156499,-2.099905,0.389739,-1.491896,-2.966669,0.936042,1.292133,-0.274947,-1.971603,...,0.375575,1.160426,-1.859343,0.564737,-1.774809,0.797940,-0.249875,0.960234,0.088656,1
310,-2.574612,0.681876,-0.776889,-1.743165,1.167514,-0.921519,1.992501,-2.283291,0.568167,-0.187915,...,-1.397232,1.222241,-0.603970,-0.485554,0.621489,-1.003151,0.148079,-0.741559,-0.127989,1
311,2.387283,0.925110,0.618235,0.568452,-2.080363,-0.847816,0.931340,0.342121,-1.948360,0.908450,...,0.302545,2.210988,-0.111399,-0.038817,-0.674023,-0.292276,0.385969,-0.025436,0.250600,1
312,-0.321714,2.630680,-2.756261,0.910438,0.278941,-0.159928,-1.879983,-0.451965,0.709125,-0.439537,...,0.367760,0.625096,-0.077731,-0.773670,0.309566,0.215507,-0.468401,-0.338167,-0.001359,1


In [21]:
# Separate the features and class label
X = data_RFE.iloc[:, :-1]
y = data_RFE.iloc[:, -1]

# Normalize the features using MinMaxScaler
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Define the random forest classifier
rf = RandomForestClassifier(n_estimators=100)

# Apply RFE with cross-validation to rank the features
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X_normalized, y)

# Rank the features based on their importance scores
feature_ranks = X.columns[rfe.ranking_ - 1]

# Save the feature ranks to a CSV file
ranked_features_df = pd.DataFrame({'Feature': feature_ranks})
ranked_features_df.to_csv('feature_ranks_rfe_rf.csv', index=False)

ranked_features_df

Unnamed: 0,Feature
0,PCA 1
1,PCA 12
2,PCA 2
3,PCA 21
4,PCA 5
5,PCA 3
6,PCA 25
7,PCA 18
8,PCA 23
9,PCA 14


In [22]:
# Rearrange the dataset based on the feature ranks
rearranged_data = data_RFE[feature_ranks.tolist() + ['Class']]

# Save the rearranged dataset to a CSV file
rearranged_data.to_csv('rearranged_data_rfe_rf.csv', index=False)

In [23]:
rearranged_data

Unnamed: 0,PCA 1,PCA 12,PCA 2,PCA 21,PCA 5,PCA 3,PCA 25,PCA 18,PCA 23,PCA 14,...,PCA 10,PCA 26,PCA 16,PCA 13,PCA 15,PCA 4,PCA 20,PCA 6,PCA 19,Class
0,0.858683,-0.540591,-0.180556,-0.484969,-0.711392,0.581471,1.355269,-0.612970,-0.968913,1.429104,...,0.660209,0.596170,1.755501,1.696234,0.831008,-2.968810,-0.009683,-2.281555,-1.348916,1
1,-4.775670,-0.577889,0.951902,0.852012,0.821940,1.238174,-0.310629,0.303161,0.338856,0.509728,...,0.967598,-0.260074,-0.393626,0.249147,0.202571,0.132458,-0.818173,-0.897392,1.207526,0
2,-0.797512,-0.276855,2.506132,0.091659,0.454961,2.395035,-0.230430,0.557836,-0.564585,-0.045123,...,0.424539,-0.140762,0.552634,-0.784931,-0.266889,1.265972,-0.523121,-0.294006,-0.301262,1
3,-1.695993,0.538111,-2.355874,1.560637,0.535378,-1.858814,-0.964050,0.040170,-0.776525,-0.961901,...,-0.171142,1.585483,1.334039,-0.549059,-0.147673,-1.730071,0.154009,-0.353564,1.848237,1
4,3.090131,-1.732576,0.210064,0.947121,1.377784,-0.051429,-0.427987,-0.423814,1.800076,0.472942,...,-0.883575,0.188862,0.400845,-0.295621,-0.867347,2.042090,-0.661090,1.102413,0.091706,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309,-0.716034,-1.107172,0.156499,-1.859343,-1.491896,-2.099905,-0.249875,-0.522400,-1.774809,0.006990,...,-1.971603,0.960234,0.753402,2.503661,0.711056,0.389739,1.160426,-2.966669,0.375575,1
310,-2.574612,-0.617771,0.681876,-0.603970,1.167514,-0.776889,0.148079,-0.692268,0.621489,1.632151,...,-0.187915,-0.741559,0.656745,-0.606389,-2.092557,-1.743165,1.222241,-0.921519,-1.397232,1
311,2.387283,-1.719347,0.925110,-0.111399,-2.080363,0.618235,0.385969,0.096829,-0.674023,0.970884,...,0.908450,-0.025436,1.661419,1.732353,-0.994154,0.568452,2.210988,-0.847816,0.302545,1
312,-0.321714,-0.233579,2.630680,-0.077731,0.278941,-2.756261,-0.468401,-0.920761,0.309566,-0.468675,...,-0.439537,-0.338167,-0.284552,-0.607802,1.722291,0.910438,0.625096,-0.159928,0.367760,1


# Save Data

In [24]:
rearranged_data.to_csv(file_path + 'rearranged_data_rfe_rf.csv', index=False)