In [10]:
# Import libraries
import pandas as pd # needed for most operation
import numpy as np # needed for some array operations
from matplotlib import pyplot as plt #used for plots
import seaborn as sns
import datawig
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from importlib import reload

In [106]:
# Fetch Data
data = pd.read_csv('../data/RA_data.csv')
data.describe()
data.head()

Unnamed: 0,Treatment,Fever,Duration_of_pain,Sick_leave,Earlier_hospitalization,Workoverload,Familiy_history,Depression,Extremely_nervous,Stress,...,Failure_symptoms,Incoordination,neck_pain_intensity,low_back_pain_intensity,arm_left_pain_intensity,arm_right_pain_intensity,leg_left_pain_intensity,leg_right_pain_intensity,working_ability,Paidwork
0,1,1.0,10.0,0,1,,0,0,0.0,0,...,1,,0,8,8,7,6,6,,0
1,3,1.0,10.0,0,1,0.0,1,0,0.0,1,...,1,1.0,7,8,7,4,0,0,7.0,1
2,1,1.0,10.0,0,1,,0,0,6.0,1,...,1,0.0,3,5,0,2,0,3,,0
3,5,1.0,10.0,0,1,,0,0,1.0,1,...,0,,2,9,0,0,9,5,,0
4,1,1.0,10.0,0,1,,0,0,0.0,1,...,1,,0,7,5,0,0,0,,0


In [107]:
# Drop columns
finalData = data.copy()
finalData.drop(labels='Workoverload',axis=1, inplace=True)
finalData.drop(labels='Relationship_with_colleagues',axis=1, inplace=True)
finalData.drop(labels='Trauma',axis=1, inplace=True)
finalData.drop(labels='working_ability',axis=1, inplace=True)

In [108]:
# Remove rows that have Treatment 4
finalData = finalData[finalData['Treatment'] != 4]

In [109]:
# Turn age into numerical value
ageMap = {'0-19': 1, '20-29': 2, '30-39': 3, '40-49': 4, '50-59': 5, '60-69': 6, '70-79': 7, '>=80': 8}
finalData['Age'] = finalData['Age'].map(ageMap)

In [69]:
dfCount = finalData.describe().loc['count']
numRows = finalData.shape[0]

outputs = dfCount.loc[dfCount.values < numRows]
outputs = list(outputs.index)

inputs = dfCount.loc[dfCount.values == numRows]
inputs = list(inputs.index)

In [110]:
finalData = finalData.fillna('')
finalData[outputs[:]] = finalData[outputs[:]].astype(str)

In [None]:
df_train, df_test = train_test_split(finalData, test_size=0.1, random_state=0)
imputedData = finalData
for output in outputs:
    imputer = datawig.SimpleImputer(
        input_columns= inputs,
        output_column= output,
        output_path= "./imputerModels/simpleModel_"+output,
    )

    imputer.fit(train_df=df_train)

In [159]:
# Test imputation accuracy
for output in outputs:
    outputModel = datawig.SimpleImputer.load("./imputerModels/simpleModel_"+output)
    predictions = outputModel.predict(df_test)
    print(output+" scores:")
    #Calculate f1 score
    f1 = metrics.f1_score(predictions[output], predictions[output+'_imputed'], average='weighted')
    #print(f1)

    #Print overall classification report
    print(metrics.classification_report(predictions[output], predictions[output+'_imputed'], zero_division=0))

[14:33:50] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Fever scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         1
         0.0       0.96      0.97      0.97       115
         1.0       0.92      0.89      0.91        38

    accuracy                           0.95       154
   macro avg       0.63      0.62      0.62       154
weighted avg       0.94      0.95      0.94       154



[14:33:51] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Duration_of_pain scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         3
        10.0       0.75      0.95      0.84       114
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         1
         5.0       0.00      0.00      0.00         2
         6.0       0.00      0.00      0.00         2
         7.0       0.17      0.12      0.14         8
         8.0       1.00      0.07      0.13        14
         9.0       0.00      0.00      0.00        10

    accuracy                           0.71       154
   macro avg       0.21      0.13      0.12       154
weighted avg       0.65      0.71      0.64       154



[14:33:51] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Extremely_nervous scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         3
         0.0       0.34      0.96      0.51        48
         1.0       0.00      0.00      0.00        24
        10.0       0.00      0.00      0.00         3
         2.0       0.00      0.00      0.00         8
         3.0       0.00      0.00      0.00         6
         4.0       0.00      0.00      0.00         8
         5.0       0.00      0.00      0.00        11
         6.0       0.00      0.00      0.00         9
         7.0       0.57      0.29      0.38        14
         8.0       0.00      0.00      0.00        18
         9.0       0.00      0.00      0.00         2

    accuracy                           0.32       154
   macro avg       0.08      0.10      0.07       154
weighted avg       0.16      0.32      0.19       154



[14:33:51] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Irrational_thoughts_risk_lasting scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         5
         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00         3
        10.0       0.42      0.62      0.50        24
         2.0       0.00      0.00      0.00         3
         3.0       1.00      0.33      0.50         6
         4.0       0.33      0.12      0.18         8
         5.0       0.36      0.81      0.49        26
         6.0       0.33      0.06      0.10        17
         7.0       0.50      0.50      0.50        20
         8.0       0.58      0.42      0.49        26
         9.0       0.27      0.19      0.22        16

    accuracy                           0.42       154
   macro avg       0.32      0.26      0.25       154
weighted avg       0.41      0.42      0.37       154



[14:33:51] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Irrational_thoughts_work scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00        12
         0.0       0.12      0.33      0.18         9
         1.0       0.50      0.09      0.15        11
        10.0       0.64      0.61      0.62        38
         2.0       0.08      0.20      0.12         5
         3.0       0.00      0.00      0.00        11
         4.0       0.00      0.00      0.00         1
         5.0       0.27      0.27      0.27        15
         6.0       0.05      0.12      0.07         8
         7.0       0.36      0.42      0.38        12
         8.0       0.42      0.50      0.46        16
         9.0       0.38      0.31      0.34        16

    accuracy                           0.33       154
   macro avg       0.24      0.24      0.22       154
weighted avg       0.34      0.33      0.32       154



[14:33:52] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Coping_strategy scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         5
         0.0       0.34      0.42      0.38        24
         1.0       0.00      0.00      0.00         4
        10.0       0.00      0.00      0.00         5
         2.0       0.40      0.71      0.51        24
         3.0       0.11      0.06      0.08        16
         4.0       0.31      0.29      0.30        14
         5.0       0.28      0.45      0.35        20
         6.0       0.60      0.18      0.27        17
         7.0       0.45      0.45      0.45        11
         8.0       0.00      0.00      0.00         8
         9.0       0.00      0.00      0.00         6

    accuracy                           0.32       154
   macro avg       0.21      0.21      0.19       154
weighted avg       0.29      0.32      0.28       154



[14:33:52] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Kinesiophobia_physical_exercise scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         2
         0.0       0.00      0.00      0.00         1
         1.0       0.00      0.00      0.00         1
        10.0       0.68      0.87      0.76        53
         2.0       0.00      0.00      0.00         3
         3.0       0.33      0.25      0.29         4
         4.0       0.00      0.00      0.00         2
         5.0       0.00      0.00      0.00        10
         6.0       0.20      0.17      0.18         6
         7.0       0.43      0.43      0.43        14
         8.0       0.50      0.47      0.48        34
         9.0       0.50      0.58      0.54        24

    accuracy                           0.55       154
   macro avg       0.22      0.23      0.22       154
weighted avg       0.48      0.55      0.51       154



[14:33:52] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Kinesiophobia_pain_stop scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         2
         0.0       0.00      0.00      0.00         4
         1.0       0.00      0.00      0.00         2
        10.0       0.52      0.96      0.68        47
         2.0       0.00      0.00      0.00         4
         3.0       0.00      0.00      0.00         6
         4.0       0.00      0.00      0.00         3
         5.0       0.29      0.22      0.25         9
         6.0       0.00      0.00      0.00         6
         7.0       0.50      0.12      0.19        17
         8.0       0.62      0.74      0.68        34
         9.0       0.46      0.30      0.36        20

    accuracy                           0.52       154
   macro avg       0.20      0.19      0.18       154
weighted avg       0.43      0.52      0.44       154



[14:33:53] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Uses_corticosteroids scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         3
         0.0       0.89      0.96      0.93       105
         1.0       0.85      0.76      0.80        46

    accuracy                           0.88       154
   macro avg       0.58      0.57      0.58       154
weighted avg       0.86      0.88      0.87       154



[14:33:53] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Serious_disease scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         2
         0.0       0.90      0.95      0.93       108
         1.0       0.85      0.77      0.81        44

    accuracy                           0.89       154
   macro avg       0.58      0.58      0.58       154
weighted avg       0.88      0.89      0.88       154



[14:33:53] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Weightloss_per_year scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         6
         1.0       0.21      0.17      0.19        24
         2.0       0.67      0.42      0.52        33
         3.0       0.61      0.90      0.72        67
         4.0       0.53      0.33      0.41        24

    accuracy                           0.56       154
   macro avg       0.40      0.36      0.37       154
weighted avg       0.52      0.56      0.52       154



[14:33:53] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


Loss_muscle_strength scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00         3
         0.0       0.80      0.81      0.80        68
         1.0       0.82      0.84      0.83        83

    accuracy                           0.81       154
   macro avg       0.54      0.55      0.55       154
weighted avg       0.80      0.81      0.80       154

Incoordination scores:
              precision    recall  f1-score   support

                   0.00      0.00      0.00        26
         0.0       0.74      0.88      0.80        85
         1.0       0.62      0.74      0.67        43

    accuracy                           0.69       154
   macro avg       0.45      0.54      0.49       154
weighted avg       0.58      0.69      0.63       154



[14:33:54] ../src/executor/graph_executor.cc:1995: Subgraph backend MKLDNN is activated.


In [None]:
# Fill a new data set with the imputed models
imputedData = finalData.copy()
for output in outputs:
    outputModel = datawig.SimpleImputer.load("./imputerModels/simpleModel_"+output)
    predictions = outputModel.predict(finalData)
    imputedData.loc[imputedData[output] == "",output] = predictions.loc[predictions[output] == "",output+"_imputed"]

In [149]:
# Save the data into a file
imputedData.to_pickle('../data/imputedData_1')