In [1]:
import pickle
from json import load
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from data_cleaning import Preparation

In [2]:
# Load the survey export and run it through the two Preparation steps
# (clean, then selection_alteration), in that order.
prep = Preparation()
df = (
    pd.read_csv("COVIDiSTRESS_April_27_clean.csv", encoding="unicode_escape")
    .pipe(prep.clean)
    .pipe(prep.selection_alteration)
)
# Relabel the rows 0..n-1: the one-hot frames built in later cells are merged
# back onto `df` by index, so the labels have to be positional.
df = df.set_index(np.arange(len(df)))

In [3]:
# Replace the categorical gender column with two 0/1 indicator columns.
# Rows whose gender is neither "Male" nor "Female" get 0 in both indicators.
gender = df.pop("Dem_gender")  # removes the column from df in place and returns it
df["Dem_male"] = gender.eq("Male").astype("int")
df["Dem_female"] = gender.eq("Female").astype("int")

In [4]:
# Education levels listed in ascending order; the position in this list is the
# ordinal code assigned to the level (0 .. 6).
edu_levels = [
    "None",
    "Up to 6 years of school",
    "Up to 9 years of school",
    "Up to 12 years of school",
    "Some College, short continuing education or equivalent",
    "College degree, bachelor, master",
    "PhD/Doctorate",
]
edu = {level: rank for rank, level in enumerate(edu_levels)}
# Strict lookup: a value that is not a key of `edu` raises KeyError rather than
# silently turning into NaN.
df["Dem_edu"] = df["Dem_edu"].apply(lambda level: edu[level])

In [5]:
# One-hot encode the employment status. The fitted encoder stays in `ohe_empl`
# because it is pickled to var/ohe_empl.pkl at the end of the notebook.
# NOTE: `sparse=False` is accepted by scikit-learn < 1.4 only (the parameter was
# renamed to `sparse_output` in 1.2); it is left as-is so older releases keep working.
ohe_empl = OneHotEncoder(sparse=False)
empl = ohe_empl.fit_transform(df["Dem_employment"].to_numpy().reshape(-1, 1))
# Label each indicator column with its category value. `categories_` exists on
# every scikit-learn release, whereas `get_feature_names()` was removed in 1.2;
# str(category) is exactly what remained after stripping the "x0_" prefix.
# Reusing df.index keeps the index-based merge row-aligned whatever df's index is.
empl = pd.DataFrame(
    empl,
    index=df.index,
    columns=[str(category) for category in ohe_empl.categories_[0]],
)
df = df.merge(empl, left_index=True, right_index=True)
df.drop("Dem_employment", axis="columns", inplace=True)

In [6]:
# Binary expat flag: 1 where the answer is exactly "yes" (lower-case), 0 for anything else.
df["Dem_Expat"] = df["Dem_Expat"].eq("yes").astype("int")

In [7]:
# One-hot encode the marital status. The fitted encoder stays in `ohe_marital`
# because it is pickled to var/ohe_marital.pkl at the end of the notebook.
# NOTE: `sparse=False` is accepted by scikit-learn < 1.4 only (the parameter was
# renamed to `sparse_output` in 1.2); it is left as-is so older releases keep working.
ohe_marital = OneHotEncoder(sparse=False)
marit = ohe_marital.fit_transform(df["Dem_maritalstatus"].to_numpy().reshape(-1, 1))
# Label each indicator column with its category value. `categories_` exists on
# every scikit-learn release, whereas `get_feature_names()` was removed in 1.2;
# str(category) is exactly what remained after stripping the "x0_" prefix.
# Reusing df.index keeps the index-based merge row-aligned whatever df's index is.
marit = pd.DataFrame(
    marit,
    index=df.index,
    columns=[str(category) for category in ohe_marital.categories_[0]],
)
df = df.merge(marit, left_index=True, right_index=True)
# Besides the raw column, the "Other or would rather not say" indicator is dropped
# (presumably so it acts as the implicit baseline category - confirm with the author).
df.drop(["Dem_maritalstatus", "Other or would rather not say"], axis="columns", inplace=True)

In [8]:
# Risk-group score: "Yes" -> 1.0, "Not sure" -> 0.5, any other answer -> 0.0.
risk_answers = df["Dem_riskgroup"]
risk_temp = risk_answers.eq("Yes").astype("float")
df["Dem_riskgroup"] = risk_temp + 0.5 * risk_answers.eq("Not sure").astype("float")

In [9]:
# One-hot encode the isolation status ("Dem_islolation" is the column's actual
# spelling in this frame). The fitted encoder stays in `ohe_isol` because it is
# pickled to var/ohe_isol.pkl at the end of the notebook.
# NOTE: `sparse=False` is accepted by scikit-learn < 1.4 only (the parameter was
# renamed to `sparse_output` in 1.2); it is left as-is so older releases keep working.
ohe_isol = OneHotEncoder(sparse=False)
isol = ohe_isol.fit_transform(df["Dem_islolation"].to_numpy().reshape(-1, 1))
# Label each indicator column with its category value. `categories_` exists on
# every scikit-learn release, whereas `get_feature_names()` was removed in 1.2;
# str(category) is exactly what remained after stripping the "x0_" prefix.
# Reusing df.index keeps the index-based merge row-aligned whatever df's index is.
isol = pd.DataFrame(
    isol,
    index=df.index,
    columns=[str(category) for category in ohe_isol.categories_[0]],
)
df = df.merge(isol, left_index=True, right_index=True)
df.drop("Dem_islolation", axis="columns", inplace=True)

In [10]:
# Cast every column to float in a single pass, so all features share one numeric
# dtype before the scaling step.
df = df.astype("float")

In [11]:
# Rescale individual columns by dividing each one by a fixed constant.
divisors = {
    "PSS10_avg": 5.0,
    "latitude": 90.0,
    "longitude": 180.0,
    "Lon_avg": 5.0,
    "Trust_countrymeasure": 10,
    "Compliance": 6.0,
    "Dem_edu": 6.0,
}
for name, divisor in divisors.items():
    df[name] = df[name] / divisor

# Age is standardised instead of divided; the fitted scaler stays in `age_scaler`
# because it is pickled to var/age_scaler.pkl at the end of the notebook.
age_scaler = StandardScaler()
age_column = df[["Dem_age"]].to_numpy()  # shape (n_rows, 1), as the scaler expects
df["Dem_age"] = age_scaler.fit_transform(age_column).ravel()

# Dependents count: divide by 3, then squash with tanh.
df["Dem_dependents"] = np.tanh(df["Dem_dependents"] / 3)

# Every column whose name contains one of these markers is divided by 6.
for marker in ("Corona", "Expl_Distress"):
    for col in df.columns:
        if marker in col:
            df[col] = df[col] / 6.0

In [12]:
# Persist the fitted one-hot encoders and the age scaler as pickle files under var/.
# NOTE: only unpickle these files from a trusted location - pickle.load can
# execute arbitrary code.
artifacts_dir = Path("var")
# Create the output directory up front: without it the first open() raises
# FileNotFoundError, and only after the whole pipeline above has already run.
artifacts_dir.mkdir(parents=True, exist_ok=True)

fitted_transformers = {
    "ohe_empl": ohe_empl,
    "ohe_marital": ohe_marital,
    "ohe_isol": ohe_isol,
    "age_scaler": age_scaler,
}
# One file per object, named var/<variable name>.pkl.
for name, transformer in fitted_transformers.items():
    with open(artifacts_dir / f"{name}.pkl", "wb") as f:
        pickle.dump(transformer, f)