In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as sc
import io
import requests
import statsmodels.api as sm

url1 = "https://raw.githubusercontent.com/SzMatej/IAU_2020-2021/main/65/other_train.csv"
url2 = "https://raw.githubusercontent.com/SzMatej/IAU_2020-2021/main/65/personal_train.csv"

# Download both CSV files. A timeout stops the cell from hanging forever, and
# raise_for_status() fails fast on 4xx/5xx so an HTML error page is never
# parsed as if it were CSV data.
response1 = requests.get(url1, timeout=30)
response1.raise_for_status()
response2 = requests.get(url2, timeout=30)
response2.raise_for_status()

db1 = response1.content
db2 = response2.content

# NOTE(review): `train` is loaded from other_train.csv and `other` from
# personal_train.csv, so the names look swapped relative to the file names.
# They are kept as-is because later cells reference `train` and `other`.
train = pd.read_csv(io.StringIO(db1.decode('utf-8')))
other = pd.read_csv(io.StringIO(db2.decode('utf-8')))

In [68]:
# Outer join on (name, address): rows present in only one of the two source
# frames are kept, with the columns from the other frame left missing.
data = pd.merge(train, other, on=['name', 'address'], how='outer')

# Take an independent copy. A plain `data_bf = data` only creates a second name
# for the same object, so every in-place cleaning step applied to `data` would
# also change `data_bf` (presumably meant as the "before" snapshot).
data_bf = data.copy()

print(len(data))

3983


In [69]:
# Print per-column dtypes and non-null counts to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3983 entries, 0 to 3982
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0_x    3983 non-null   int64  
 1   name            3983 non-null   object 
 2   address         3983 non-null   object 
 3   race            3960 non-null   object 
 4   marital-status  3967 non-null   object 
 5   occupation      3962 non-null   object 
 6   pregnant        3969 non-null   object 
 7   education-num   3569 non-null   float64
 8   relationship    3967 non-null   object 
 9   capital-gain    3963 non-null   float64
 10  education       3968 non-null   object 
 11  fnlwgt          3965 non-null   float64
 12  class           3964 non-null   float64
 13  income          3968 non-null   object 
 14  medical_info    3965 non-null   object 
 15  native-country  3969 non-null   object 
 16  hours-per-week  3961 non-null   float64
 17  capital-loss    3968 non-null   f

### atribút pregnant

Upravíme boolean hodnoty v datasete tak, že ich nahradíme číselnou reprezentáciou.
Taktiež nastavíme všetky hodnoty u mužov na 0.

In [49]:
def sanitize_boolean(boolean):
    """Translate a textual boolean flag into a number.

    Surrounding whitespace is ignored. The spellings 'f', 'F', 'FALSE',
    'false' and 'False' give 0; 't', 'T', 'TRUE', 'true' and 'True' give 1.
    Any other text, and any value that has no ``strip`` method (for example a
    float NaN or None), gives ``np.nan``.
    """
    false_tokens = ('f', 'F', 'FALSE', 'false', 'False')
    true_tokens = ('t', 'T', 'TRUE', 'true', 'True')

    try:
        token = boolean.strip()
    except AttributeError:
        # Not a string-like value: treat it as missing.
        return np.nan

    if token in false_tokens:
        return 0
    if token in true_tokens:
        return 1
    return np.nan

In [50]:
# Replace the textual flags in `pregnant` with 0 / 1 / NaN.
data['pregnant'] = data['pregnant'].map(sanitize_boolean)

In [51]:
def sanitize_pregnancy(data):
    """Set ``pregnant`` to 0 for every male row and return the same frame.

    A row counts as male when ``sex`` is the encoded value 1 or the raw text
    'Male' (surrounding whitespace ignored), so the result does not depend on
    whether the ``sex`` column has already been converted to numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``sex`` and ``pregnant`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in (not a copy).
    """
    def _is_male(value):
        # Raw labels look like ' Male'; encoded labels are the integer 1.
        if isinstance(value, str):
            return value.strip() == 'Male'
        return value == 1

    male_rows = data['sex'].map(_is_male).astype(bool)
    data.loc[male_rows, 'pregnant'] = 0
    # Return the argument itself; returning any other frame would make
    # `data = sanitize_pregnancy(data)` replace the caller's dataset.
    return data

# Zero out `pregnant` for male rows; `data` is rebound to the function's return value.
# NOTE(review): confirm sanitize_pregnancy returns the frame it was given, otherwise this rebinding replaces `data`.
data = sanitize_pregnancy(data)

### atribút sex

Tento atribút neobsahuje žiadne prázdne hodnoty. Pri tomto atribúte treba len dané hodnoty prekonvertovať na numerické hodnoty.

In [53]:
# List the distinct raw labels of `sex` to see what has to be encoded.
data['sex'].unique()

array([' Female', ' Male'], dtype=object)

In [55]:
def sanitize_sex(sex):
    """Encode a sex label as a number: 1 for male, 0 otherwise.

    Text input is stripped of surrounding whitespace; 'Male' gives 1 and any
    other text gives 0. A value that is already 0 or 1 is returned unchanged
    (as an int), so re-running the encoding cell on an already-encoded column
    no longer raises AttributeError. Anything else (for example NaN) gives
    ``np.nan``.
    """
    if isinstance(sex, str):
        return 1 if sex.strip() == 'Male' else 0
    if sex in (0, 1):
        # Already encoded by a previous run of the cell.
        return int(sex)
    # Missing or unrecognised non-text value.
    return np.nan

In [56]:
# Encode `sex` numerically; the function is passed directly, no wrapper lambda needed.
data['sex'] = data['sex'].map(sanitize_sex)

In [57]:
# Check which distinct values `sex` holds after encoding.
data['sex'].unique()

array([0, 1], dtype=int64)

### atribút age

Pri tomto atribúte nahradíme záporné hodnoty hodnotou NaN. Nachádzajú sa tu aj neznáme hodnoty, ktoré sú reprezentované reťazcom '??'. Tieto hodnoty taktiež nahradíme hodnotou NaN.

In [58]:
# List the distinct raw values of `age` to find entries that need cleaning.
data['age'].unique()

array(['52', '46', '41', '57', '63', '54', '53', '??', '49', nan, '30',
       '55', '87', '42', '50', '33', '59', '65', '58', '51', '93', '40',
       '68', '43', '34', '78', '71', '23', '67', '39', '81', '38', '64',
       '44', '31', '48', '60', '35', '45', '12', '61', '62', '36', '70',
       '37', '56', '69', '86', '47', '26', '73', '21', '27', '19', '82',
       '66', '16', '99', '113', '25', '76', '29', '17', '32', '74', '80',
       '77', '28', '14', '22', '75', '79', '72', '18', '20', '24', '83',
       '85', '84', '3', '9', '15', '90', '7', '-1'], dtype=object)

### atribút medical info

Pri tomto atribúte treba vyparsovať hodnoty jedného riadku a následne vytvoriť nové stĺpce, do ktorých sa tieto vyparsované hodnoty doplnia.

In [71]:
# Look at the raw `medical_info` strings to see the structure that has to be parsed.
data['medical_info'].unique()

array(["{'mean_glucose':'111.8125','std_glucose':'44.88174566','kurtosis_glucose':'0.423867091','skewness_glucose':'0.239944007','mean_oxygen':'2.465719064','std_oxygen':'17.28981747','kurtosis_oxygen':'8.636118173','skewness_oxygen':'85.60462065'}",
       "{'mean_glucose':'71.3984375','std_glucose':'47.29517349','kurtosis_glucose':'1.317459045','skewness_glucose':'2.340411994','mean_oxygen':'17.114548499999998','std_oxygen':'46.86283003','kurtosis_oxygen':'3.070346153','skewness_oxygen':'9.405187883'}",
       "{'mean_glucose':'102.796875','std_glucose':'37.5346422','kurtosis_glucose':'0.382097319','skewness_glucose':'1.3556516','mean_oxygen':'2.706521739','std_oxygen':'19.87410184','kurtosis_oxygen':'7.955209532','skewness_oxygen':'67.64902246'}",
       ...,
       "{'mean_glucose':'129.90625','std_glucose':'48.62217818','kurtosis_glucose':'0.038773097','skewness_glucose':'-0.17251288','mean_oxygen':'3.319397993','std_oxygen':'22.72505413','kurtosis_oxygen':'8.750496392','skewness_

### atribút date of birth

Tento atribút nemá jednotný tvar, čiže je nutné ho ošetriť.

In [74]:
# List the distinct raw `date_of_birth` strings to see which formats occur.
data['date_of_birth'].unique()

array(['1966-05-16', '1964-06-29', '54-01-25', ..., '1988-09-29',
       '1958/12/17', '1959-09-29'], dtype=object)

In [75]:
def sanitize_date(date):
    """Normalise a date-of-birth value to the text form 'YYYY-MM-DD'.

    Handled input shapes ('/' is accepted as a separator, and a trailing time
    component is discarded):

    * ``YYYY-MM-DD``  -> returned unchanged
    * ``DD-MM-YYYY``  -> reordered to year first
    * ``YY-MM-DD``    -> chosen when the first field is > 31; century 19
    * ``DD-MM-YY``    -> chosen when the first field is <= 31; century 19 when
      YY >= 31, otherwise century 20

    A missing value (whose text form is 'nan') is returned as the string 'nan'.

    Parameters
    ----------
    date : object
        Raw value; it is converted with ``str()`` before parsing.

    Returns
    -------
    str
        The normalised date text.

    Raises
    ------
    IndexError
        If a value without a four-character first field has fewer than three
        '-'-separated fields.
    ValueError
        If a field that must be compared numerically is not an integer.
    """
    text = str(date).strip().replace('/', '-')
    # Drop a time component first, then cap at 10 characters. Cutting at 10
    # characters alone would leave time residue on two-digit-year dates such as
    # '54-01-25 00:00:00'.
    text = text.split(' ', 1)[0][:10]
    parts = text.split('-')

    # Missing value, or the year is already in front with four digits.
    if parts[0] == 'nan' or len(parts[0]) == 4:
        return '-'.join(parts)

    first, month, last = parts[0], parts[1], parts[2]

    if len(last) != 2:
        # DD-MM-YYYY: only the field order has to change.
        return last + '-' + month + '-' + first

    # Both outer fields are short. A day of month cannot exceed 31, so a larger
    # first field must be a two-digit year.
    if int(first) > 31:
        return '19' + first + '-' + month + '-' + last

    # DD-MM-YY. The boundaries are inclusive: previously a value of exactly 31
    # matched neither '< 31' nor '> 31' and the year stayed at two digits.
    century = '19' if int(last) >= 31 else '20'
    return century + last + '-' + month + '-' + first

In [76]:
# Bring every `date_of_birth` value into one common text format, then preview the first rows.
data['date_of_birth'] = data['date_of_birth'].map(sanitize_date)
data['date_of_birth'].head(5)

0    1966-05-16
1    1964-06-29
2    1954-01-25
3    1952-10-30
4    1982-01-13
Name: date_of_birth, dtype: object

In [77]:
# Re-inspect the distinct `date_of_birth` values after normalisation.
data['date_of_birth'].unique()

array(['1966-05-16', '1964-06-29', '1954-01-25', ..., '1988-09-29',
       '1958-12-17', '1959-09-29'], dtype=object)

Teraz má tento atribút jednotný tvar hodnôt. Teraz môžeme upraviť atribút age tak, že pomocou dátumu narodenia vypočítame chýbajúce hodnoty.