In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the anxiety screening responses from the Excel workbook
dataset = pd.read_excel(io='Anxiety_Screening.xlsx')
# Print a summary of the columns, non-null counts and dtypes
dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53 entries, 0 to 52
Data columns (total 9 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   Feeling nervous, anxious or on edge                53 non-null     int64 
 1   Not being able to stop or control worrying         53 non-null     int64 
 2   Worrying too much about different things           53 non-null     int64 
 3   Trouble relaxing                                   53 non-null     int64 
 4   Being so restless that it is hard to sit stil      53 non-null     int64 
 5   Becoming easily annoyed or irritable               53 non-null     int64 
 6   Feeling afraid as if something awful might happen  53 non-null     int64 
 7   Total_score                                        53 non-null     int64 
 8   Diagnosis                                          53 non-null     object
dtypes: int64(8), object(1)
memory usage: 3.9+ KB

In [3]:
def change(arg):
    """Converts a diagnosis label into its numeric code.
    
    Leading and trailing whitespace around the label is ignored, so a
    value such as ' Mild Anxiety ' is still recognised.
    
    PARAMETERS
    -----------
    arg
        either Minimal Anxiety, Mild Anxiety, Moderate Anxiety, or Severe Anxiety
    
    RETURNS
    ----------
    number
        either 0,1,2,3 for the four labels above (in that order),
        or None when arg is not a string or is not one of those labels
    
    AUTHOR
    ----------
    Shreeja Dahal
    
    DATE
    ----------
    09/29/2021 8:30 AM
    
    """
    codes = {
        'Minimal Anxiety': 0,
        'Mild Anxiety': 1,
        'Moderate Anxiety': 2,
        'Severe Anxiety': 3,
    }
    # Non-string values (for example NaN from an empty cell) carry no label.
    if not isinstance(arg, str):
        return None
    # Unknown labels map to None explicitly instead of falling off the end.
    return codes.get(arg.strip())


# Encode the Diagnosis labels as numbers. The dtype guard keeps this cell
# safe to re-run: once the column is numeric, applying change() again
# would turn every value into None.
if not pd.api.types.is_numeric_dtype(dataset['Diagnosis']):
    dataset['Diagnosis'] = dataset['Diagnosis'].apply(change)



In [4]:
# Remove Total_score so it is not used as a feature. The columns= keyword
# replaces the positional axis argument, which newer pandas versions no
# longer accept; errors='ignore' keeps the cell re-runnable once the
# column is already gone.
dataset = dataset.drop(columns='Total_score', errors='ignore')
# X contains all the remaining variables except the Diagnosis
X = dataset.drop(columns='Diagnosis')
# y contains the Diagnosis
y = dataset['Diagnosis']
# splitting the data into training set and testing set,
# 75% into training data and 25% into testing data
# (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=47, test_size=0.25)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import tree


# Entropy is zero when a sample is completely homogeneous and is largest
# when the classes are equally represented (1 for two classes, log2(k)
# for k classes).
# random_state is fixed so that the fitted tree, and therefore the scores
# printed below, are the same on every run.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=47)

# fitting the training data
clf = clf.fit(X_train, y_train)

# y_pred is the predicted data
y_pred = clf.predict(X_test)

# calculating the decision tree accuracy score
decisionTree_score = accuracy_score(y_test, y_pred)

# Accuracy, Precision, Recall, F-1 score, and support
# are factors that measure the performance of a model
print("Accuracy score: ", decisionTree_score)
print(classification_report(y_test, y_pred))

Accuracy score:  0.7142857142857143
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       0.67      0.50      0.57         4
           2       0.50      0.50      0.50         4
           3       0.80      1.00      0.89         4

    accuracy                           0.71        14
   macro avg       0.74      0.75      0.74        14
weighted avg       0.70      0.71      0.70        14



In [6]:
# Persist the fitted classifier to disk as a pickle file
# called model_anxiety.pkl
with open('model_anxiety.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)