# **This notebook is for building a classification model**

In [1]:
#Call libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read dataset
df= pd.read_csv('Anxiety.csv')
df.head()

Unnamed: 0,age,gender,bmi,who_bmi,phq_score,depression_severity,depressiveness,suicidal,depression_diagnosis,depression_treatment,gad_score,anxiety_severity,anxiousness,anxiety_diagnosis,anxiety_treatment,epworth_score,sleepiness
0,19,male,33.333333,Class I Obesity,9,Mild,False,False,False,False,11,Moderate,True,False,False,7.0,False
1,18,male,19.84127,Normal,8,Mild,False,False,False,False,5,Mild,False,False,False,14.0,True
2,19,male,25.102391,Overweight,8,Mild,False,False,False,False,6,Mild,False,False,False,6.0,False
3,18,female,23.738662,Normal,19,Moderately severe,True,True,False,False,15,Severe,True,False,False,11.0,True
4,18,male,25.617284,Overweight,6,Mild,False,False,False,False,14,Moderate,True,False,False,3.0,False


In [3]:
#How many rows and columns are there?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1566 non-null   int64  
 1   gender                1566 non-null   object 
 2   bmi                   1566 non-null   float64
 3   who_bmi               1566 non-null   object 
 4   phq_score             1566 non-null   int64  
 5   depression_severity   1558 non-null   object 
 6   depressiveness        1560 non-null   object 
 7   suicidal              1564 non-null   object 
 8   depression_diagnosis  1564 non-null   object 
 9   depression_treatment  1558 non-null   object 
 10  gad_score             1566 non-null   int64  
 11  anxiety_severity      1566 non-null   object 
 12  anxiousness           1554 non-null   object 
 13  anxiety_diagnosis     1558 non-null   object 
 14  anxiety_treatment     1562 non-null   object 
 15  epworth_score        

In [4]:
# Get the dimensions of the Dataset
df.shape

(1566, 17)

In [5]:
#Find out if there are missing values ​​or not
df.isna().any()

age                     False
gender                  False
bmi                     False
who_bmi                 False
phq_score               False
depression_severity      True
depressiveness           True
suicidal                 True
depression_diagnosis     True
depression_treatment     True
gad_score               False
anxiety_severity        False
anxiousness              True
anxiety_diagnosis        True
anxiety_treatment        True
epworth_score            True
sleepiness               True
dtype: bool

In [6]:
#Delete rows with missing data, which are only 12
df = df.dropna(axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1530 entries, 0 to 1565
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   1530 non-null   int64  
 1   gender                1530 non-null   object 
 2   bmi                   1530 non-null   float64
 3   who_bmi               1530 non-null   object 
 4   phq_score             1530 non-null   int64  
 5   depression_severity   1530 non-null   object 
 6   depressiveness        1530 non-null   object 
 7   suicidal              1530 non-null   object 
 8   depression_diagnosis  1530 non-null   object 
 9   depression_treatment  1530 non-null   object 
 10  gad_score             1530 non-null   int64  
 11  anxiety_severity      1530 non-null   object 
 12  anxiousness           1530 non-null   object 
 13  anxiety_diagnosis     1530 non-null   object 
 14  anxiety_treatment     1530 non-null   object 
 15  epworth_score         1530

In [7]:
# Get the dimensions of the Dataset After deletion
df.shape

(1530, 17)

In [8]:
#Finding object data locatobjections
df.select_dtypes('object').describe().transpose()


Unnamed: 0,count,unique,top,freq
gender,1530,2,female,788
who_bmi,1530,7,Normal,994
depression_severity,1530,6,Mild,678
depressiveness,1530,2,False,1120
suicidal,1530,2,False,1400
depression_diagnosis,1530,2,False,1398
depression_treatment,1530,2,False,1416
anxiety_severity,1530,4,Mild,586
anxiousness,1530,2,False,1150
anxiety_diagnosis,1530,2,False,1408


In [9]:
#Categorical column encoding
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])
df['who_bmi'] = label_encoder.fit_transform(df['who_bmi'])
df['depression_severity'] = label_encoder.fit_transform(df['depression_severity'])
df['anxiety_severity'] = label_encoder.fit_transform(df['anxiety_severity'])
df['anxiety_diagnosis'] = label_encoder.fit_transform(df['anxiety_diagnosis'])  


In [10]:
# drop useless columns
df.drop(columns=['depression_treatment', 'suicidal', 'epworth_score'], inplace=True)

In [11]:
#Overview of the data after tuning
df.describe()

Unnamed: 0,age,gender,bmi,who_bmi,phq_score,depression_severity,gad_score,anxiety_severity,anxiety_diagnosis
count,1530.0,1530.0,1530.0,1530.0,1530.0,1530.0,1530.0,1530.0,1530.0
mean,20.257516,0.484967,23.410511,3.517647,7.16732,1.315033,6.882353,1.15817,0.079739
std,1.771908,0.499937,4.588703,1.221677,4.418965,1.417944,4.728524,1.035575,0.270977
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,20.957274,3.0,4.0,0.0,3.0,0.0,0.0
50%,20.0,0.0,23.148148,3.0,6.0,1.0,6.0,1.0,0.0
75%,21.0,1.0,25.510204,5.0,9.0,3.0,9.0,2.0,0.0
max,31.0,1.0,54.552668,6.0,24.0,5.0,21.0,3.0,1.0


In [12]:
#Separate features from target variables
X = df.drop(columns=['gad_score', 'anxiety_severity'])
y = df[['gad_score', 'anxiety_severity']]

In [13]:
#Splitting data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
#Feature Scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
#Setting up GridSearch
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}


In [16]:
#create a RandomForestClassifier model
model = RandomForestClassifier(random_state=42)

In [17]:
#Setting up GridSearch
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')


In [18]:
#GridSearch Training
grid_search.fit(X_train, y_train)

In [19]:
#get the best model
best_model = grid_search.best_estimator_

In [20]:
#predicting using the best model
y_pred = best_model.predict(X_test)

In [21]:
#model Evaluation
accuracy_gad_score = accuracy_score(y_test['gad_score'], y_pred[:, 0])
accuracy_anxiety_severity = accuracy_score(y_test['anxiety_severity'], y_pred[:, 1])

In [22]:
#Best parameters 
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [23]:
#accuracy for gad_score
print("Accuracy for gad_score:", accuracy_gad_score)

Accuracy for gad_score: 0.8006535947712419


In [24]:
#accuracy for anxiety_severity(this is the level of anxiety)
print("Accuracy for anxiety_severity:", accuracy_anxiety_severity)

Accuracy for anxiety_severity: 0.934640522875817


In [25]:
print("Classification report for gad_score:")
print(classification_report(y_test['gad_score'], y_pred[:, 0]))

Classification report for gad_score:
              precision    recall  f1-score   support

           0       0.75      0.67      0.71        18
           1       0.88      0.65      0.75        23
           2       0.83      0.88      0.86        17
           3       0.63      0.85      0.72        20
           4       0.83      0.83      0.83        35
           5       0.80      0.86      0.83        28
           6       1.00      0.90      0.95        20
           7       0.78      0.78      0.78        27
           8       0.87      0.76      0.81        34
           9       0.75      0.90      0.82        20
          10       0.75      0.75      0.75         8
          11       0.60      1.00      0.75         9
          12       1.00      1.00      1.00         5
          13       1.00      0.25      0.40         8
          14       0.75      1.00      0.86         6
          15       0.67      0.67      0.67         6
          16       1.00      0.75      0.86 

In [26]:
print("Classification report for anxiety_severity:")
print(classification_report(y_test['anxiety_severity'], y_pred[:, 1]))

Classification report for anxiety_severity:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       129
           1       0.95      1.00      0.97        36
           2       0.91      0.93      0.92       113
           3       1.00      0.93      0.96        28

    accuracy                           0.93       306
   macro avg       0.95      0.95      0.95       306
weighted avg       0.94      0.93      0.93       306



In [27]:
import pickle

In [28]:
pickle.dump(best_model,open('model.pkl','wb'))

In [29]:
model=pickle.load(open('model.pkl','rb'))