In [None]:
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
!python3 -m pip install pyreadstat

In [None]:
!python3 -m pip install xlrd

In [None]:
# Load the value-label table with the columns Name (variable), Value (code) and Text (label).
# NOTE(review): presumably the sheet states the variable name only on the first row
# of each group of codes - confirm against the spreadsheet.
text_to_val_df = pd.read_excel("../../data/Variablenwerte.xls", names=["Name", "Value", "Text"], header=1)
# Forward-fill the names: a row without a name inherits the name of the row directly above it.
# NOTE(review): a missing name in row 0 would look up label -1 - confirm this cannot happen.
for i in range(len(text_to_val_df)):
    if pd.isna(text_to_val_df.loc[i, "Name"]):
        text_to_val_df.loc[i, "Name"] = text_to_val_df.loc[i - 1, "Name"]
# Convert the codes to ints. Codes written with a decimal comma (e.g. "1,00") keep only
# the part in front of the comma.
for i in range(len(text_to_val_df)):
    val = text_to_val_df.loc[i, "Value"]
    if val == ",00":
        val = 0
    elif val == "1,00":
        val = 1
    else:
        # The trailing .replace() is a no-op: nothing after split(",")[0] contains a comma.
        val = str(val).split(",")[0].replace(",", "")
        if val == "":
            # Nothing in front of the comma: report the row index and store -1 as a sentinel.
            print(i)
            val = -1
        else:
            val = int(val)
    text_to_val_df.loc[i, "Value"] = val
text_to_val_df

In [None]:
# Load the survey responses from the SPSS file.
df = pd.read_spss("../../data/f20.0251z_290620.sav")
# Replace label strings by their numeric codes using the table read from Variablenwerte.xls.
# replace_dict maps each variable name to {label text: code}, built from text_to_val_df.
replace_dict = {name: {row["Text"]: row["Value"] for _, row in text_to_val_df[text_to_val_df["Name"] == name].iterrows()} for name in text_to_val_df["Name"].unique()}
df = df.replace(replace_dict)
# Treat empty strings and single-space strings as missing values.
df = df.replace({"": np.nan, " ": np.nan})
# Keep only the columns whose name does not contain "offen".
df = df[[col for col in df if "offen" not in col]]
df

# Preprocessing functions

In [None]:
sys.path.insert(0, '..') 

import constants

## Replace "don't know"

In [None]:
def dontknow_to_mean(df, columns):
    """Replace the "don't know" code of each column by the rounded mean answer.

    The largest value found in a column is taken to be its "don't know" code.
    The mean is computed over the remaining answers only, so the placeholder
    code itself does not bias the value it is replaced with.

    Args:
        df: DataFrame holding the answers; the listed columns are overwritten.
        columns: Iterable of names of numeric columns to process.

    Returns:
        The same DataFrame with the "don't know" codes replaced.
    """
    for c in columns:
        # Series.max() skips NaN; the builtin max() gives order-dependent results with NaN.
        dontknow_code = df[c].max()
        is_dontknow = df[c] == dontknow_code
        print(c, is_dontknow.sum())
        mean_answer = df.loc[~is_dontknow, c].mean()
        # Constant, empty or all-NaN columns have no other answers to average: leave them as is.
        if pd.notna(mean_answer):
            df[c] = df[c].replace({dontknow_code: round(mean_answer)})
        print()
    return df

In [None]:
def dontknow_to_lowest(df, columns):
    """Replace the largest value of each listed column by its smallest value.

    The largest value is treated as the "don't know" code. For every column
    the column name and the number of entries equal to that code are printed,
    followed by an empty line.

    Args:
        df: DataFrame whose listed columns are overwritten.
        columns: Iterable of column names to process.

    Returns:
        The same DataFrame after the replacement.
    """
    for name in columns:
        highest = max(df[name])
        n_highest = sum(df[name] == highest)
        print(name, n_highest)
        lowest = min(df[name])
        df[name] = df[name].replace({highest: lowest})
        print()
    return df

In [None]:
df = dontknow_to_mean(df, constants.ordinal_questions)

In [None]:
df = dontknow_to_lowest(df, constants.preconditions_when)

In [None]:
# One titled figure per column; without plt.figure() every histogram is drawn onto the same axes.
for c in constants.preconditions_when:
    plt.figure()
    df[c].hist()
    plt.title(c)

## Replace numeric placeholders in interval-scale questions (99, 999, etc.)

In [None]:
def replace_nan_placeholder(df, text_to_val_df, columns):
    """Replace each column's missing-value placeholder code (99, 999, ...) by NaN.

    The placeholder of a column is the first ``Value`` listed for that column
    name in ``text_to_val_df``.

    Args:
        df: DataFrame whose listed columns are overwritten.
        text_to_val_df: Table with at least the columns ``Name`` and ``Value``.
        columns: Iterable of column names to process. Names missing from
            ``df`` are skipped; names without an entry in ``text_to_val_df``
            are reported and left unchanged.

    Returns:
        The same DataFrame after the replacement.
    """
    for c in columns:
        # Check the frame first so that an absent column never triggers a lookup.
        if c not in df.columns:
            continue
        codes = text_to_val_df.loc[text_to_val_df['Name'] == c, 'Value']
        if codes.empty:
            # No value label means no placeholder is defined; indexing [0] would raise here.
            print(f"{c} has no placeholder value")
            continue
        df[c] = df[c].replace(codes.iloc[0], np.nan)
    return df

df = replace_nan_placeholder(df, text_to_val_df, constants.interval_questions)

## Apply one-hot encoding where necessary

In [None]:
def one_hot(df, columns):
    """One-hot encode the given columns and drop the originals.

    Each column ``c`` is replaced by dummy columns named ``{c}_{value}``,
    ordered by ascending value and appended at the end of the frame. Rows in
    which ``c`` is missing get a 1 in the last dummy (the highest value,
    i.e. the "don't know / no answer" category).

    Args:
        df: Input DataFrame.
        columns: Iterable of column names to encode. Names missing from
            ``df`` are reported and skipped.

    Returns:
        A new DataFrame with the encoded columns.
    """
    for c in columns:
        if c not in df.columns:
            print(f"{c} not in columns")
            # Skip only this column; the remaining ones still have to be encoded.
            continue
        dummies = pd.get_dummies(df[c])
        # Reorder the dummies by value before renaming them. For a categorical column the
        # dummies follow the category order, so renaming with sorted labels alone would
        # attach the wrong value to a column.
        values = list(dummies.columns)
        order = sorted(range(len(values)), key=values.__getitem__)
        dummies = dummies.iloc[:, order].copy()
        dummies.columns = [f"{c}_{int(values[i])}" for i in order]
        # Missing answers count as the last category ("don't know / no answer").
        is_missing = df[c].isna()
        if is_missing.any() and len(dummies.columns) > 0:
            last = dummies.columns[-1]
            # Write the flag in the dummy's own dtype (bool or uint8) to avoid an upcast.
            dummies.loc[is_missing, last] = dummies[last].dtype.type(1)
        df = df.drop(c, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df
    
# One-hot encode each group of columns listed in constants, one group per call.
for l in [constants.to_one_hot, constants.expect_change, constants.reduced_income, constants.age_kids, constants.not_always_applicable]:
    df = one_hot(df, l)

In [None]:
#df.select_dtypes(include=['category'])

In [None]:
import matplotlib.pyplot as plt
# For every ordinal question: draw a titled histogram on its own figure and print summary statistics.
for c in constants.ordinal_questions:
    plt.figure()
    df[c].hist()
    plt.title(c)
    print(df[c].describe())



## Cast "category" variables to float

In [None]:
def convert_to_float(df, columns):
    """Cast the listed columns of ``df`` to float, one column at a time.

    Args:
        df: DataFrame whose listed columns are overwritten.
        columns: Iterable of column names to cast.

    Returns:
        The same DataFrame with the listed columns as float.
    """
    for name in columns:
        as_float = df[name].astype(float)
        df[name] = as_float
    return df

#df = convert_to_int(df, constants.to_float_variables)

df = convert_to_float(df, df.select_dtypes(include=['category']).columns)

## Remove variables with very small variance

In [None]:
def handle_low_std_variables(df, threshold=0.02):
    """Clean up numeric columns whose standard deviation is below ``threshold``.

    A low-variance column with exactly one distinct non-missing value is kept
    and its missing entries are filled with 0. Any other low-variance column
    is dropped, after its value counts have been printed.

    Args:
        df: Input DataFrame.
        threshold: Columns with a standard deviation strictly below this
            value are treated as low-variance.

    Returns:
        The DataFrame after filling or dropping the low-variance columns.
    """
    # Only numeric columns have a standard deviation; .std() raises on strings and categoricals.
    low_var_found = [
        c for c in df.columns
        if pd.api.types.is_numeric_dtype(df[c]) and df[c].std() < threshold
    ]
    for c in low_var_found:
        # nunique() ignores NaN, so this matches "a single value plus missing entries".
        if df[c].nunique() == 1:
            df[c] = df[c].fillna(0)
        else:
            print(f"Removing {c} with value counts: \n{df[c].value_counts()}")
            df = df.drop(c, axis=1)
    return df

In [None]:
#df = handle_low_std_variables(df)

In [None]:
# detect all non-int and non-float cols
"""
for col in df:
    if df[col].dtype not in ("float64", "int64"):
        print(col, df[col].dtype)
        non_na = df[col][~df[col].isna()]
        print("Len: ", len(non_na))
        print(non_na.head(3))
        if len(non_na.unique()) < 10:
            print("unique distrs:")
            for val in non_na.unique():
                print(val)
                print((non_na == val).mean())
        print()
"""

### Make sure that the only remaining NaNs are in the interval questions (variables starting with 'n')

In [None]:
# List every column that still has missing values, together with its NaN count.
has_nan = []
for c in df.columns:
    n_missing = df[c].isna().sum()
    if n_missing > 0:
        print(c, n_missing)
        has_nan.append(c)

counter = len(has_nan)
print(counter, "variables with nans")

In [None]:
# Draw a titled histogram for every interval question, each on its own figure.
for c in constants.interval_questions:
    plt.figure()
    df[c].hist()
    plt.title(c)

## Investigate possible surrogate measures

In [None]:
df[constants.family_positive]

In [None]:
# family member in household tested positive (f4a_1), but no contact to positively tested person (f1a_1)
df.loc[(df['f4a_1'] == 1) & (df['f1a_1'] != 1), 'f4a_1']

In [None]:
# family member in household tested positive (f4a_1), and also contact to positively tested person (f1a_1)
df.loc[(df['f4a_1'] == 1) & (df['f1a_1'] == 1), 'f4a_1']

In [None]:
df.loc[:, constants.compound_label_cols_incl_diagnosed].describe()

In [None]:
def create_compound_label(df, columns):
    """Add a float ``target`` column flagging rows with any truthy label column.

    ``target`` is 1.0 for every row in which at least one of ``columns`` is
    truthy and 0.0 otherwise. The number of positive rows is printed.

    Args:
        df: DataFrame that receives (or has overwritten) the ``target`` column.
        columns: Names of the columns combined into the label.

    Returns:
        The same DataFrame including the ``target`` column.
    """
    is_positive = df[columns].any(axis=1)
    positive_labels = df.index[is_positive].tolist()
    n_rows = len(df)
    df["target"] = np.zeros(n_rows)
    df.loc[positive_labels, "target"] = 1.0
    n_positive = len(positive_labels)
    print(f"Number of cases with positive target {n_positive}")
    return df


In [None]:
df = create_compound_label(df, constants.compound_label_cols_incl_diagnosed)


In [None]:
df = create_compound_label(df, constants.compound_label_cols_only_tested )


In [None]:
df["f1a_1"].sum() 

In [None]:
df["f1a_2"].sum()

In [None]:
df["f1a_4"].sum()

In [None]:
(df["f1a_1"] + df["f1a_2"] + df["f1a_3"] + df["f1a_4"]).astype(bool).sum()

In [None]:
(df["f1b_1"] + df["f1b_2"] + df["f1b_3"] + df["f1b_4"]).astype(bool).sum()

In [None]:
(df["f1a_1"] + df["f1b_1"]).astype(bool).sum()

In [None]:
((df["f2a"] == 1).astype(int) + (df["f2b"] == 1).astype(int) + df["f1a_1"] + df["f1b_1"]).astype(bool).sum()

In [None]:
df.describe()
    

In [None]:
len(df) - (28080 + 1906)

In [None]:
df

In [None]:
len(df)

In [None]:
pred_df = df.copy()

In [None]:
# Overwrite the target: True where (f2a == 1) + (f2b == 1) + f1a_1 + f1b_1 is nonzero.
pred_df["target"] = ((df["f2a"] == 1).astype(int) + (df["f2b"] == 1).astype(int) + df["f1a_1"] + df["f1b_1"]).astype(bool)
# Drop every column whose name contains f1a, f1b, f2a or f2b; this includes the columns the target is built from.
drop_cols = [col for col in pred_df if "f1a" in col or "f2a" in col or "f2b" in col or "f1b" in col]
pred_df = pred_df.drop(columns=drop_cols)

In [None]:
# Fill the remaining missing values with the per-column median. numeric_only=True stops
# median() from raising on non-numeric columns (pandas >= 2.0); those columns are left unchanged.
pred_df = pred_df.fillna(pred_df.median(numeric_only=True))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the fitted forest and its feature importances are the same on every run.
clf = RandomForestClassifier(random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
# Features: everything except the label and the "sernr" column.
x = pred_df.drop(columns=["target", "sernr"])
labels = x.columns
y = pred_df["target"]
# Stratified 75/25 split with a fixed seed so the reported accuracy is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=42)

In [None]:
clf.fit(x_train, y_train)

In [None]:
preds_test = clf.predict(x_test)

In [None]:
(preds_test == y_test).mean()

In [None]:
1 - y_test.mean()

In [None]:
df_imps = pd.DataFrame({"imp": clf.feature_importances_, "feature": labels})

In [None]:
df_imps.sort_values("imp")

In [None]:
# Inspect a single column: first value, head and tail, summary statistics and, for columns
# with fewer than 10 distinct values, the share of rows holding each value.
col_name = "f1a_5"
# Index by col_name; `col` is not defined before this line, so df[col] raised a NameError.
col = df[col_name]
print(col.iloc[0])
print(col.head())
print(col.tail())
print()
print(col.describe())
print()
if len(col.unique()) < 10:
    print("unique distrs:")
    for val in col.unique():
        print(val)
        print((col == val).mean())