In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/drive/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /temp/, but they won't be saved outside of the current session

## Student Mental Health Analysis

![](https://images.unsplash.com/photo-1534330207526-8e81f10ec6fc?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=2070&q=80)

## Introduction
The importance of mental health in college students cannot be emphasized enough. As students leave everything familiar to them and enter university, they may experience emotional and mental strain. Students from different courses, ages and years may experience a mental breakdown at some point in their lives. A few aspects of this are:
##### Anxiety
According to Wikipedia, Anxiety is an emotion which is characterized by an unpleasant state of inner turmoil and includes feelings of dread over anticipated events. It is often accompanied by nervous behavior such as pacing back and forth, somatic complaints, and rumination.
Anxiety often has the following symptoms:

- Disproportionate feelings of nervousness, restlessness, or tension
- An impending sense of doom, danger, or panic without any cause
- Hyperventilating
- Trembling or sweating
- Weakness and fatigue
- Insomnia or difficulty falling asleep
- Problems with appetite (not eating enough or binge eating)
- Nausea or migraines



##### Depression
According to Wikipedia, Classified medically as a mental and behavioral disorder, the experience of depression affects a person's thoughts, behavior, motivation, feelings, and sense of well-being. Depression symptoms include:
- Consistent feelings of hopelessness and sadness
- Mood swings 
- Changes in sleep and/or appetite
- Withdrawal from social circles, a tendency to self-isolate
- Increased pessimism
- Feeling lethargic
- Difficulty concentrating and completing tasks
- Lack of enjoyment in activities one previously found pleasurable



##### Panic Attack
According to Wikipedia, Panic attacks are sudden periods of intense fear and discomfort that may include palpitations, sweating, chest pain or chest discomfort, shortness of breath, trembling, dizziness, numbness, confusion, or a feeling of impending doom or of losing control. Typically, symptoms reach a peak within ten minutes of onset, and last for roughly 30 minutes, but the duration can vary from seconds to hours.
<br>

This particular dataset comes from a survey conducted at a university. Let's have a close look at it!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
import warnings
warnings.filterwarnings("ignore")

## Loading the Data

In [None]:
# Read the survey responses into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/student-mental-health/Student Mental health.csv'
data = pd.read_csv(DATA_PATH)
data.head()

## Data Cleaning
- Renaming the columns of the data to a more standard convention
- Handling missing values
- Clean duplicates
- Correctly formatting the Year column (year 1 = Year 1)

In [None]:
# Information on features: summary of the columns, used below to spot the
# missing Age value and the columns stored as object dtype.
data.info()

#### From the above info, it can be deduced that:
- Age column has a value missing.
- Timestamp is not parsed as a DateTime object, but we can work with that.
- Course and GPA are object data types and need further exploration.
- All other object data types presumably have Yes/No values which can be preprocessed later.


In [None]:
# Drop the rows whose Age is missing.
# Take an explicit copy: `data` is modified column by column further down, and
# assigning into a boolean-filtered selection triggers pandas' SettingWithCopy
# ambiguity (hidden here because warnings are globally silenced).
data = data[data['Age'].notna()].copy()
data.shape

In [None]:
# Show a single row to inspect the current column names before renaming them.
data.head(1)

In [None]:
# Rename columns: replace the existing headers with short, consistent names.
# The names are assigned by position, so their order must match the column order.
standard_names = [
    'Timestamp',
    'Gender',
    'Age',
    'Course',
    'Year',
    'CGPA',
    'Marital_Status',
    'Depression',
    'Anxiety',
    'Panic_Attack',
    'Treatment',
]
data.columns = standard_names
data.head(1)

In [None]:
# Formatting the Year column: list the distinct raw labels first.
data['Year'].unique()
# Output recorded from a previous run (note the mixed 'year'/'Year' casing):
# array(['year 1', 'year 2', 'Year 1', 'year 3', 'year 4', 'Year 2', 'Year 3'], dtype=object)

##### Few findings
- The maximum duration of any particular course is 4 years as per the data. The minimum duration cannot be determined.
- Year 1  and year 1 mean the same thing (and same with other values) yet are interpreted as different.
- No need of the word 'Year' or 'year', we can work with just the number.

In [None]:
def Clean(Text):
    """Extract the year number from a label such as ``'year 1'`` or ``'Year 3'``.

    The trailing run of digits is parsed, so surrounding whitespace
    (``'year 4 '``) and multi-digit years are handled. Values that are already
    numeric are passed through, which keeps the cell safe to re-run.

    Args:
        Text: Year label (string) or an already-converted integer.

    Returns:
        The year as an ``int``.

    Raises:
        ValueError: If ``Text`` does not end with a number.
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    found = re.search(r"(\d+)\s*$", str(Text))
    if found is None:
        raise ValueError(f"No year number found in {Text!r}")
    return int(found.group(1))
# Convert every Year label to its numeric form, then show a small sample.
year_numbers = data["Year"].apply(Clean)
data["Year"] = year_numbers
print("First three values of Year after cleaning text:")
print(data["Year"][:3], "\n")

In [None]:
# List the distinct CGPA labels to check how the grades are recorded.
data['CGPA'].unique()

##### Few findings
- The CGPA column has ranges rather than an absolute value. 
- The range '3.50 - 4.00' is same as '3.50 - 4.00 ', so we need to trim the trailing whitespace.
- The ranges can be converted to their mean values, but I will keep them as it is for further exploration.

In [None]:
def remove_space(string):
    """Return ``string`` without its leading and trailing whitespace."""
    return string.strip()
# Trim the whitespace around every CGPA label so identical ranges compare equal.
trimmed_cgpa = data["CGPA"].apply(remove_space)
data["CGPA"] = trimmed_cgpa
print("First three values of CGPA after cleaning text:")
print(data["CGPA"][:3], "\n")
# Show the distinct labels that remain after trimming.
print(data['CGPA'].unique())

In [None]:
# Number of distinct course labels before the spelling variants are merged.
len(data['Course'].unique())

We can observe that a lot of courses are interpreted differently though they mean the same, so we need to take care of that.

In [None]:
# Let's replace redundant course names with the standard course name.
course_aliases = {
    'engin': 'Engineering',
    'Engine': 'Engineering',
    'Islamic education': 'Islamic Education',
    'Pendidikan islam': 'Pendidikan Islam',
    'BIT': 'IT',
    'psychology': 'Psychology',
    'koe': 'Koe',
    'Kirkhs': 'Irkhs',
    'KIRKHS': 'Irkhs',
    'Benl': 'BENL',
    'Fiqh fatwa ': 'Fiqh',
    'Laws': 'Law',
}
# Assign the result back rather than calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object, which
# pandas deprecates and which leaves `data` unchanged under Copy-on-Write.
data['Course'] = data['Course'].replace(course_aliases)
# Number of distinct course labels after merging.
len(data['Course'].unique())

The data is quite clean and the new feature has been included. Let's proceed to the next step. That is, exploring the data.

## Data Exploration

In this section,

Exploring data for outliers

- Exploring the Course attribute
- Exploring the Evaluation attributes(Depression, Anxiety, Panic_Attack)
<br>

Let us start with pairplot

In [None]:
# Grid of pairwise plots used as a quick visual check for outliers.
# NOTE(review): presumably only the numeric columns (Age, Year) are drawn at this
# point, since the other columns are still text — confirm.
sns.pairplot(data)

No outliers, Awesome!

##### Let's have a year-wise analysis of students from different courses:

In [None]:
# Number of Year 1 respondents per course.
first_year = data[data['Year'] == 1]
plt.figure(figsize=(15, 7))
chart = sns.countplot(x='Course', data=first_year)
# Tilt the course names so that long labels do not overlap.
tick_labels = chart.get_xticklabels()
chart.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

In [None]:
# Number of Year 2 respondents per course.
second_year = data[data['Year'] == 2]
plt.figure(figsize=(15, 7))
chart = sns.countplot(x='Course', data=second_year)
# Tilt the course names so that long labels do not overlap.
tick_labels = chart.get_xticklabels()
chart.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

In [None]:
# Number of Year 3 respondents per course.
third_year = data[data['Year'] == 3]
plt.figure(figsize=(15, 7))
chart = sns.countplot(x='Course', data=third_year)
# Tilt the course names so that long labels do not overlap.
tick_labels = chart.get_xticklabels()
chart.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

In [None]:
# Number of Year 4 respondents per course.
fourth_year = data[data['Year'] == 4]
plt.figure(figsize=(15, 7))
chart = sns.countplot(x='Course', data=fourth_year)
# Tilt the course names so that long labels do not overlap.
tick_labels = chart.get_xticklabels()
chart.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')

### Year-wise analysis of students from different courses
##### Here are some of the stats:
- Maximum students from Year 4 are from Engineering.
- Missing Courses from the Year 4 (x-axis) must have a course duration of 3 years.
- Engineering, BCS and IT students rule the survey with maximum number of responses.

In [None]:
# One point per respondent: Anxiety answer against course, coloured by gender.
anxiety_palette = ['#546D64', '#50FFB1']
plt.figure(figsize=(10, 10))
sns.stripplot(
    data=data,
    x='Anxiety',
    y='Course',
    hue='Gender',
    palette=anxiety_palette,
)
plt.show()

### Anxiety vs Course
##### Here are some of the stats: 
- Students enrolled in IT experience the maximum anxiety.
- Students enrolled in fields related to Islam(Islamic Education, Pendidikan Islam, Fiqh, Usuluddin, etc.) and Biology(Human Sciences, Nursing, Biomedical Sciences) are less prone to anxiety.
- Computer Science (BCS) has almost an equal number of students who experience anxiety and those who do not.

In [None]:
# One point per respondent: Depression answer against course, coloured by gender.
depression_palette = ['#546D64', '#50FFB1']
plt.figure(figsize=(10, 10))
sns.stripplot(
    data=data,
    x='Depression',
    y='Course',
    hue='Gender',
    palette=depression_palette,
)
plt.show()

### Depression vs Course
##### Here are some of the stats: 
- Males are less prone to experiencing depression as compared to females.
- 2/3 females in Psychology experience depression,... Strange!
- Around 50% of the Students in IT experience depression.

In [None]:
# One point per respondent: Panic_Attack answer against course, coloured by gender.
panic_palette = ['red', 'black']
plt.figure(figsize=(10, 10))
sns.stripplot(
    data=data,
    x='Panic_Attack',
    y='Course',
    hue='Gender',
    palette=panic_palette,
)
plt.show()

### Panic Attack vs Course
##### Here are some of the stats: 
- Males are less prone to experiencing panic attacks as compared to females.
- Approximately 37.5% of Engineering students experience panic attacks.
- About 62.5% of IT students experience panic attacks.
- About 18% of BCS students experience panic attacks.


##### Let's have a look at the age distribution of people

In [None]:
# Age distribution: normalised histogram with a density curve drawn on top.
ages = data["Age"]
plt.figure(figsize=(14, 7))
ax = ages.hist(
    bins=15,
    density=True,
    stacked=True,
    color='red',
    alpha=0.6,
)
ages.plot(kind='density', color='black')
plt.show()

#### Few Findings
- We have students from ages between 18 - 24.
- We do not have many responses from students aged 20-23.

**Let's analyse various parameters(Depression, Anxiety, Panic_Attack) yearwise.**

In [None]:
# Age distribution for each study year, split by the Depression answer.
depression_colors = ['#84BCDA', '#ECC30B']
plt.figure(figsize=(14, 7))
sns.violinplot(
    data=data,
    x='Year',
    y='Age',
    hue='Depression',
    palette=depression_colors,
)
plt.show()

In [None]:
# Age distribution for each study year, split by the Anxiety answer.
anxiety_colors = ['#72A98F', '#CBEF43']
plt.figure(figsize=(14, 7))
sns.violinplot(
    data=data,
    x='Year',
    y='Age',
    hue='Anxiety',
    palette=anxiety_colors,
)
plt.show()

In [None]:
# Age distribution for each study year, split by the Panic_Attack answer.
panic_colors = ['#FEC0AA', '#EC4E20']
plt.figure(figsize=(14, 7))
sns.violinplot(
    data=data,
    x='Year',
    y='Age',
    hue='Panic_Attack',
    palette=panic_colors,
)
plt.show()

### Key Findings:
- Year 4 students do not experience Depression, Anxiety or Panic Attacks except for those who are aged 24
- Year 3 has a versatile distribution of students. Mixed reviews.
- Year 1 students aged between 18 - 20 experience the most amongst depression, anxiety, panic attacks.
- Year 3 students are more anxiety prone.

**Does CGPA affect mental health?** Let's find out!

In [None]:
# Age distribution for each study year, with one violin per CGPA range.
sns.catplot(
    data=data,
    x="Year",
    y="Age",
    hue="CGPA",
    kind="violin",
    palette="Pastel1",
    height=10,
    aspect=2,
)

### Few Findings:
- Year 3 and 4 students perform well academically and therefore have slight or no mental health problems.
- Many Year 3 students have their GPAs under 2.0.
- Students from Year 1 and Year 2 perform academically better having their GPAs above 2.5.
- Year 1 students aged 18- 20 despite having decent GPAs experience mental breakdowns. How is that possible? Maybe self-doubt, imposter syndrome, etc. (Just an assumption) 

Up next I will plot the classic correlation matrix with a few significant columns.

In [None]:
# Correlation matrix of the columns that are already numeric.
# Most columns are still text at this point; selecting the numeric ones
# explicitly avoids the error that DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, and matches the older behaviour of skipping them.
corrmat = data.select_dtypes(include='number').corr()
plt.figure(figsize=(5, 5))
sns.heatmap(corrmat, annot=True, cmap=None)

##### No way this helps!

## Data Preprocessing

In this section,

##### The following steps are involved:

- We will perform label encoding on the columns (CGPA, Depression, Anxiety, Panic_Attack) to give each category a unique numerical value.
- I think we do not require Timestamp, so we'll drop it out.
- Assigning labels and targets.
- Splitting the test and training sets.

In [None]:
# The submission timestamp is not used in the analysis, so remove that column.
data.drop(columns='Timestamp', inplace=True)

In [None]:
# Preview the first 20 rows after dropping the Timestamp column.
data.head(20)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column with integer codes.
# A single encoder instance is refitted for each column in turn.
encoder = LabelEncoder()
categorical_columns = data.columns[data.dtypes == 'object'].tolist()
for column in categorical_columns:
    encoded_values = encoder.fit_transform(data[column])
    data[column] = encoded_values
data.head()

In [None]:
# Correlation matrix over all columns, now that they are label-encoded.
plt.figure(figsize=(10, 10))
corrmat = data.corr()
sns.heatmap(data=corrmat, annot=True, cmap=None)

#### Few Findings:
- Marital_Status shows a close association with Depression.
- Depression, Anxiety and Panic_Attack show a significant correlation.
- Medical assistance(Treatment) shows a slight correlation with Marital_Status 

## Model Selection

For model selection, I will build pipelines for four different classifiers and select the one with the best fit results.

In this section:
- Split data into training and testing sets
- Assigning targets and features
- The model pipelines with preprocessing:
- Fitting the training set to the various models.
- Getting the confusion matrix and accuracy scores.
- Picking the best classifier.

In [None]:
# Target: the encoded CGPA range. Features: every remaining column.
y = data["CGPA"]
X = data.drop(columns=["CGPA"])

In [None]:
# Split into training and test sets: 30% is held out, with a fixed seed so the
# same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# A quick model selection process.
# Each classifier is wrapped in a single-step pipeline so that all of them can be
# fitted and scored through the same interface.
pipeline_lr = Pipeline([('lr_classifier', LogisticRegression(random_state=42))])

pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier(random_state=42))])

# Seeded like the other models: an unseeded forest gives different scores on
# every run, which makes the comparison below non-reproducible.
pipeline_rf = Pipeline([('rf_classifier', RandomForestClassifier(random_state=42))])

pipeline_svc = Pipeline([('sv_classifier', SVC())])

# List of all the pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_svc]

# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest', 3: "SVC"}


# Fit the pipelines on the training split; pipeline_rf is reused later to
# predict on the test set.
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# 10-fold cross-validation on the training split; print the mean score per model.
cv_results_accuracy = []
for i, model in enumerate(pipelines):
    cv_score = cross_val_score(model, X_train, y_train, cv=10)
    cv_results_accuracy.append(cv_score)
    print(f"{pipe_dict[i]}: {cv_score.mean():f} ")

So Random Forest is the most accurate of the models. Let us build a better random forest with grid search CV and find out how it performs on the test set.

## Model Evaluation

##### The following steps are involved:
- Build a Random Forest model.
- Create classification report.
- Visualize the confusion matrix

In [None]:
# Score the already-fitted random-forest pipeline on the held-out test set.
pred_rfc = pipeline_rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred_rfc)
print(accuracy)

In [None]:
# Train a standalone random forest. The fixed seed keeps the reported accuracy
# reproducible between runs; an unseeded forest gives a different model each time.
RF_model = RandomForestClassifier(random_state=42)
RF_model.fit(X_train, y_train)
# Testing the model on the test set
predictions = RF_model.predict(X_test)
acccuracy = accuracy_score(y_test, predictions)
acccuracy

In [None]:
# Summary metrics for the random-forest predictions on the test set.
acccuracy = accuracy_score(y_test, predictions)
recall = recall_score(y_test, predictions, average="weighted")
precision = precision_score(y_test, predictions, average="weighted")
# Keep the result under its own name: assigning it to `f1_score` would overwrite
# the imported sklearn function and make this cell fail when it is re-run.
# Weighted averaging matches the precision and recall above (micro-averaged F1
# would simply repeat the accuracy for single-label predictions).
f1 = f1_score(y_test, predictions, average="weighted")

print("********* Random Forest Results *********")
print("Accuracy    : ", acccuracy)
print("Recall      : ", recall)
print("Precision   : ", precision)
print("F1 Score    : ", f1)

In [None]:
# Print the classification report for the random-forest predictions on the test set.
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix, shown as each cell's share of all test predictions.
cf_matrix = confusion_matrix(y_test, predictions)
cell_shares = cf_matrix / cf_matrix.sum()
plt.subplots(figsize=(12, 8))
sns.heatmap(
    cell_shares,
    cmap=None,
    annot=True,
    annot_kws={'size': 15},
)

## Conclusion

#### If you found it helpful, do upvote!
#### Feel free to comment!
#### I would love to have suggestions.
#### Cheers y'all❤️