In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pickle

In [4]:
# Reading the PMDD data from the Excel workbook.
# read_excel already yields a default RangeIndex, so no index reset is needed.
dataset = pd.read_excel('PMDD_SCREENING.xlsx')
# Print column names, non-null counts and dtypes as a quick sanity check.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 16 columns):
 #   Column                                                                           Non-Null Count  Dtype 
---  ------                                                                           --------------  ----- 
 0   feelings of sadness or hopelessness                                              98 non-null     int64 
 1   feelings of anxiety or tension                                                   98 non-null     int64 
 2   mood changes or increased sensitivity                                            98 non-null     int64 
 3   feelings of anger or irritability                                                98 non-null     int64 
 4   total_1                                                                          98 non-null     int64 
 5   apathy to routine activities, which may be associated with social withdrawal     98 non-null     int64 
 6   difficulty concentra

In [5]:
# Preview the first five rows (the default for head()) of the loaded data.
dataset.head()

Unnamed: 0,feelings of sadness or hopelessness,feelings of anxiety or tension,mood changes or increased sensitivity,feelings of anger or irritability,total_1,"apathy to routine activities, which may be associated with social withdrawal",difficulty concentrating,fatigue,changes in appetite,"sleeping problems, whether excessive sleeping (hypersomnia) or insomnia",feeling overwhelmed or having a sense of a lack of control,Total_2,Are these symptoms present a week before the onset of menses,Do these symptoms resolve after the start and within the first few days of flow,Do these symptoms interfere with normal daily living,Diagnosis
0,1,1,1,1,4,1,1,1,1,1,1,10,1,1,1,Yes
1,1,1,1,0,3,1,1,1,1,1,1,9,1,1,0,Yes
2,1,1,0,1,3,1,1,1,1,1,0,8,1,1,1,Yes
3,1,0,0,0,1,1,1,1,1,0,0,5,1,1,1,Yes
4,0,1,1,1,3,1,1,1,0,0,0,6,1,1,1,Yes


In [9]:
def change(arg):
    """Changes the arg into numeric value.

    String labels are compared after stripping surrounding whitespace and
    ignoring letter case, so "Yes", "yes" and " YES " all map to 1.

    PARAMETERS
    -----------
    arg
        Yes or No Diagnosis

    RETURNS
    ----------
    number
        either 0 or 1

    RAISES
    ----------
    ValueError
        if arg is not a recognised Yes/No label

    AUTHOR
    ----------
    Shreeja Dahal

    DATE
    ----------
    09/29/2021 8:30 AM

    """
    # Normalise only real strings; anything else (None, NaN, numbers) is
    # left untouched so that it falls through to the error below.
    label = arg.strip().lower() if isinstance(arg, str) else arg

    if label == "yes":
        return 1
    if label == "no":
        return 0

    # Fail loudly instead of implicitly returning None, which would put
    # missing values into the target column without any warning.
    raise ValueError(
        "Diagnosis must be 'Yes' or 'No', got {!r}".format(arg)
    )

    
# Drop the two total columns; in the previewed rows they appear to be running
# sums of the preceding symptom columns, so they duplicate those features.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is no longer accepted by pandas 2.x.
dataset = dataset.drop(columns=['total_1', 'Total_2'])
# Convert the Yes/No Diagnosis labels into 1/0 with change().
dataset['Diagnosis'] = dataset['Diagnosis'].apply(change)


In [13]:
# X contains all the variables except the Diagnosis.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is no longer accepted by pandas 2.x.
X = dataset.drop(columns='Diagnosis')
# y contains the Diagnosis
y = dataset['Diagnosis']
# Splitting the data into a training set and a testing set:
# 75% into training data and 25% into testing data. The fixed random_state
# makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=47
)


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn import tree

# Splits are chosen with the entropy criterion: a completely homogeneous
# sample has an entropy of zero, and a sample divided equally between the
# two classes has an entropy of one.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the reported metrics, are the same on every run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=47)

# Fitting the training data (fit() trains the estimator in place).
clf.fit(X_train, y_train)

# y_pred is the predicted data for the held-out test set
y_pred = clf.predict(X_test)

# Calculating the decision tree accuracy score
decisionTree_score = accuracy_score(y_test, y_pred)

# Accuracy, Precision, Recall, F-1 score, and support
# are factors that measure the performance of a model
print("Accuracy score: ", decisionTree_score)
print(classification_report(y_test, y_pred))

Accuracy score:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00        16

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



In [13]:
import pickle
# Persist the trained classifier as a serialized pickle object
# in the file model_pmdd.pkl
with open('model_pmdd.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)