In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pickle

In [3]:
# Reading the depression screening data from the Excel workbook.
# read_excel already returns a frame with a default RangeIndex, and the
# earlier bare `dataset.reset_index()` call threw its result away, so it
# changed nothing and has been removed.
dataset = pd.read_excel('Depression Screening.xlsx')
# Summary of column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 11 columns):
 #   Column                                                                                                                                                                 Non-Null Count  Dtype 
---  ------                                                                                                                                                                 --------------  ----- 
 0   Little interest or pleasure in doing things                                                                                                                            432 non-null    int64 
 1   Feeling down, depressed, or hopeless                                                                                                                                   432 non-null    int64 
 2   Trouble falling or staying asleep, or sleeping too much                                                       

In [68]:
# Preview the first five rows (five is the default for head())
dataset.head()

Unnamed: 0,Little interest or pleasure in doing things,"Feeling down, depressed, or hopeless","Trouble falling or staying asleep, or sleeping too much",Feeling tired or having little energy,Poor appetite or overeating,Feeling bad about yourself or that you are a failure or have let yourself or your family down,"Trouble concentrating on things, such as reading the newspaper or watching television",Moving or speaking so slowly that other people could have noticed. Or the opposite being so figety or restless that you have been moving around a lot more than usual,"Thoughts that you would be better off dead, or of hurting yourself",Total_score,Diagnosis
0,0,0,0,0,0,0,0,0,0,0,Minimal Depression
1,0,0,0,0,0,0,1,0,0,1,Minimal Depression
2,0,0,0,0,0,0,0,0,0,0,Minimal Depression
3,0,0,0,1,0,0,0,0,0,1,Minimal Depression
4,0,0,1,1,0,0,0,0,0,2,Minimal Depression


In [4]:
def change(arg):
    """Changes the arg into numeric value.

    PARAMETERS
    -----------
    arg
        either Minimal Depression, Mild Depression, Moderate Depression, or Severe Depression

    RETURNS
    ----------
    number
        either 0,1,2,3 in the order of the labels listed above;
        None when arg is not one of those four labels

    AUTHOR
    ----------
    Shreeja Dahal

    DATE
    ----------
    09/29/2021 8:30 AM

    """
    # Exact-match lookup table; matching is case- and whitespace-sensitive.
    severity_codes = {
        "Minimal Depression": 0,
        'Mild Depression': 1,
        'Moderate Depression': 2,
        'Severe Depression': 3,
    }
    # .get() keeps the original fall-through: unknown labels yield None.
    return severity_codes.get(arg)

# Remove the Total_score column so it is not part of the data used later.
# The axis is given through the `columns=` keyword: passing it positionally
# (drop('Total_score', 1)) is deprecated in pandas 1.3 and rejected in pandas 2.0.
dataset = dataset.drop(columns='Total_score')
# Replace the textual diagnosis labels with their numeric codes via change().
# NOTE: this cell is not idempotent - running it a second time fails because
# Total_score is already gone.
dataset['Diagnosis'] = dataset['Diagnosis'].apply(change)



(432, 9)


Unnamed: 0,Little interest or pleasure in doing things,"Feeling down, depressed, or hopeless","Trouble falling or staying asleep, or sleeping too much",Feeling tired or having little energy,Poor appetite or overeating,Feeling bad about yourself or that you are a failure or have let yourself or your family down,"Trouble concentrating on things, such as reading the newspaper or watching television",Moving or speaking so slowly that other people could have noticed. Or the opposite being so figety or restless that you have been moving around a lot more than usual,"Thoughts that you would be better off dead, or of hurting yourself",Diagnosis
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,1,1,0,0,0,0,0,0


In [70]:
# X contains all the variables except the Diagnosis.
# `columns=` is used because the positional axis argument (drop('Diagnosis', 1))
# is deprecated in pandas 1.3 and rejected in pandas 2.0.
X = dataset.drop(columns='Diagnosis')
# y contains the Diagnosis
y = dataset['Diagnosis']
# splitting the data into training set and testing set,
# 75% into training data and 25% into testing data;
# random_state fixes the shuffle so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47, test_size=0.25)

In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import tree

# Entropy is the split criterion: a node whose samples all belong to one
# class has entropy zero, and an evenly divided two-class node has entropy one.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split, so an unseeded tree is not guaranteed to be the same between runs.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=47)

# fitting the training data
clf = clf.fit(X_train, y_train)
# y_pred holds the labels predicted for the held-out test set
y_pred = clf.predict(X_test)
# calculating the decision tree accuracy score
decisionTree_score = accuracy_score(y_test, y_pred)
# Accuracy, Precision, Recall, F-1 score, and support
# are factors that measure the performance of a model
print("Accuracy score: ", decisionTree_score)
print(classification_report(y_test, y_pred))

Accuracy score:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        28
           3       1.00      1.00      1.00        38

    accuracy                           1.00       108
   macro avg       1.00      1.00      1.00       108
weighted avg       1.00      1.00      1.00       108



In [72]:
import pickle
# Serialize the fitted classifier with pickle and write it,
# in binary mode, to model_depression.pkl
with open('model_depression.pkl', mode='wb') as file:
    pickle.dump(clf, file)