# Logit Regression Laboratory

Un estudio sobre condiciones dermatológicas con datos del UCI Machine Learning Repository.

In [2]:
# The magics below enable automatic reloading of imported modules

%load_ext autoreload
%autoreload 2

# Import system libraries

import importlib
import numpy
import pandas as pd
import plotly.express as px

# Import custom libraries

import src.cleaning.OutlinerUtils as OutlinerUtils
import src.cleaning.MissingValues as MissingValues
import src.learning.Preprocess as Preprocess

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Here we store all global variables

class Store:
    """Container for the notebook's shared state.

    Attributes:
        data_raw: the data as loaded from disk (None until loaded).
        data_tmp: working copy derived from ``data_raw`` (None until built).
        missing_indexes: indexes of rows with missing values.
        outliners_indexes: indexes of rows flagged as outliers.
    """

    def __init__(self):
        self.data_raw = None
        self.data_tmp = None
        # Lists are created per instance: a mutable list declared at class
        # level would be one shared object mutated by every Store instance.
        self.missing_indexes = []
        self.outliners_indexes = []

store = Store()

In [4]:
# Read the labelled dermatology data file into the store, then display it

store.data_raw = pd.read_csv(
    filepath_or_buffer='./data/dermatology/dermatology-with-labels.data'
)
store.data_raw

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C25,C26,C27,C28,C29,C30,C31,C32,C33,Class
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,2,0,25,4
362,3,2,1,0,1,0,0,0,0,0,...,1,0,1,0,0,0,2,0,36,4
363,3,2,2,2,3,2,0,2,0,0,...,0,3,0,3,0,0,2,3,28,3
364,2,1,3,1,2,3,0,2,0,0,...,0,2,0,1,0,0,2,3,50,3


In [5]:
# Locate the rows of the raw data that contain missing values

store.missing_indexes = MissingValues.get_missing_values_indexes(store.data_raw)

# Report how many rows are affected, followed by the indexes themselves
print(
    f'Filas con valores faltantes: {len(store.missing_indexes)}',
    store.missing_indexes,
    sep='\n',
)

Filas con valores faltantes: 8
[33, 34, 35, 36, 262, 263, 264, 265]


In [6]:
# Remove rows with missing values. DataFrame.drop (without inplace) returns a
# new frame, so store.data_raw is left untouched and no explicit copy is needed.

store.data_tmp = store.data_raw.drop(labels=store.missing_indexes, axis=0)

# Make sure all columns are numeric: cast column C33 to int.
# NOTE(review): presumably C33 is parsed as text because of the missing-value
# markers in the rows dropped above — confirm against the data file.

store.data_tmp = store.data_tmp.astype({'C33': int})

# Get outliers and keep them in the store field reserved for them, mirroring
# how store.missing_indexes is handled. `outliners` is kept as an alias so
# any later cell that uses the name still works.

store.outliners_indexes = OutlinerUtils.get_outliners_indexes(store.data_tmp)
outliners = store.outliners_indexes
print (f'Valores atípicos: {len(outliners)}')

Valores atípicos: 315


In [7]:
# Separate the cleaned data by the "Class" column, using tags 1 through 6.
# The result is indexed by tag; each entry exposes an `.index` whose length
# is reported below as the number of records for that class.

tags = list(range(1, 7))
data_groups = Preprocess.separate_data_by_tags(store.data_tmp, tags, "Class")

for tag in tags:
    print(f"Clase {tag} contiene {len(data_groups[tag].index)} registros")

Clase 1 contiene 111 registros
Clase 2 contiene 60 registros
Clase 3 contiene 71 registros
Clase 4 contiene 48 registros
Clase 5 contiene 48 registros
Clase 6 contiene 20 registros


In [None]:
# Balance the classes, then report the per-class record counts again.
# NOTE(review): the return value of balance_data_by_dropping_rows is discarded,
# so the counts below only change if it mutates data_groups in place — confirm.
Preprocess.balance_data_by_dropping_rows(store.data_tmp, tags, data_groups)

for tag in tags:
    print(f"Clase {tag} contiene {len(data_groups[tag].index)} registros")

In [None]:
# NOTE(review): looks like a leftover scratch cell (never executed); it only
# prints the literal 1 — consider removing it from the finished notebook.
print(1)