# Logit Regression Laboratory

Un estudio sobre condiciones dermatológicas con datos del UCI Machine Learning Repository.

In [119]:
# The two magics below enable automatic reloading of imported modules

%load_ext autoreload
%autoreload 2

# Import system libraries

from typing import List
import importlib
import numpy
import pandas as pd
import plotly.express as px

# Import custom libraries

import sys
sys.path.append('./src/cleaning')
sys.path.append('./src/learning')

import OutlinerUtils
import MissingValues
import Preprocess
import Distances
import Categorization

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [133]:
# Here we store all global variables

class Store:
    """Single container for the state shared between notebook cells.

    Scalar defaults live on the class. The list attributes are created
    per instance in ``__init__`` so that two ``Store`` objects never share
    (and accidentally mutate) the same list.
    """

    # Name of the column that holds the class label of each row.
    tag_column: str = "Class"

    # Frame as read from disk, and the working copy derived from it.
    data_raw: pd.DataFrame = None
    data_tmp: pd.DataFrame = None

    # Train / test partitions produced by the preprocessing section.
    data_train: pd.DataFrame = None
    data_test: pd.DataFrame = None

    # Declared for readers and type checkers; values are set in __init__.
    missing_indexes: List[int]
    outliners_indexes: List[int]

    # Class labels; the notebook assigns integers (1..6), hence List[int].
    tags: List[int]

    def __init__(self) -> None:
        """Give each instance its own, initially empty, lists."""
        self.missing_indexes = []
        self.outliners_indexes = []
        self.tags = []

store = Store()

In [134]:
# Read the labelled dermatology dataset from disk and keep it as the raw frame.

dataset_path = './data/dermatology/dermatology-with-labels.data'
store.data_raw = pd.read_csv(dataset_path)

# Last expression of the cell: renders the frame as the cell output.
store.data_raw

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,C25,C26,C27,C28,C29,C30,C31,C32,C33,Class
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,2,0,25,4
362,3,2,1,0,1,0,0,0,0,0,...,1,0,1,0,0,0,2,0,36,4
363,3,2,2,2,3,2,0,2,0,0,...,0,3,0,3,0,0,2,3,28,3
364,2,1,3,1,2,3,0,2,0,0,...,0,2,0,1,0,0,2,3,50,3


## Data Cleaning

In [135]:
# Collect the row indexes reported by MissingValues as having missing values,
# then print how many there are followed by the indexes themselves.

store.missing_indexes = MissingValues.get_missing_values_indexes(store.data_raw)
missing_count = len(store.missing_indexes)
print (f'Filas con valores faltantes: {missing_count}', store.missing_indexes, sep='\n')

Filas con valores faltantes: 8
[33, 34, 35, 36, 262, 263, 264, 265]


In [136]:
# Build the working frame: a deep copy of the raw data (so store.data_raw is
# left untouched) with the rows flagged as having missing values removed.

store.data_tmp = (
    store.data_raw
    .copy(deep=True)
    .drop(labels=store.missing_indexes, axis=0)
)

# Cast column C33 to int (the only column converted here).
# NOTE(review): presumably C33 is the column that carried the missing-value
# markers and was therefore not parsed as numeric -- confirm.

store.data_tmp = store.data_tmp.astype({'C33': int})

# Find outliers in the working frame and report how many were found.

store.outliners_indexes = OutlinerUtils.get_outliners_indexes(store.data_tmp)
outlier_count = len(store.outliners_indexes)
print (f'Valores atípicos: {outlier_count}')

Valores atípicos: 315


## Data Preprocessing

In [138]:
# Split the working frame into one group per class label.

store.tags = [1, 2, 3, 4, 5, 6]
data_groups = Preprocess.separate_data_by_tags(store.data_tmp, store.tags, store.tag_column)

# Report the number of rows that ended up in each group.
for tag in store.tags:
    tag_rows = data_groups[tag].index
    print(f"Clase {tag} contiene {len(tag_rows)} registros")

Clase 1 contiene 111 registros
Clase 2 contiene 60 registros
Clase 3 contiene 71 registros
Clase 4 contiene 48 registros
Clase 5 contiene 48 registros
Clase 6 contiene 20 registros


In [140]:
# Balance the per-class groups. The return value is not used; the counts
# printed below are read back from data_groups.
# NOTE(review): which rows are dropped is decided inside Preprocess; if the
# choice is random, a fixed seed is needed for reproducible runs -- confirm.
Preprocess.balance_data_by_dropping_rows(store.data_tmp, store.tags, data_groups)

# Per-class row counts after balancing, plus the overall total.
count = 0
for tag in store.tags:
    group_size = len(data_groups[tag].index)
    print (f"Clase {tag} contiene {group_size} registros")
    count += group_size

print (f"{count} registros en total")

Clase 1 contiene 20 registros
Clase 2 contiene 20 registros
Clase 3 contiene 20 registros
Clase 4 contiene 20 registros
Clase 5 contiene 20 registros
Clase 6 contiene 20 registros
120 registros en total


In [141]:
# Split the balanced groups into training and testing frames; 0.7 is the
# ratio passed to Preprocess.get_training_and_testing_groups.
store.data_train, store.data_test = Preprocess.get_training_and_testing_groups(
    0.7,
    store.tags,
    data_groups
)

# Use store.tags rather than a bare global `tags`: no cell defines `tags`,
# so the original only ran when that name was left over in the kernel.
print (f"data_train {len(store.data_train.index)} filas. Aprox ~{int(len(store.data_train.index) / len(store.tags))} filas por clase.")
print (f"data_test {len(store.data_test.index)} filas. Aprox ~{int(len(store.data_test.index) / len(store.tags))} filas por clase.")
print (f"{len(store.data_train.index) + len(store.data_test.index)} registros en total")

data_train 84 filas. Aprox ~14 filas por clase.
data_test 36 filas. Aprox ~6 filas por clase.
120 registros en total


  return bound(*args, **kwds)


## Classification Model

In [142]:
# Reference point: the row at position 15 of the training frame, wrapped as
# a one-row DataFrame.
point = pd.DataFrame([store.data_train.iloc[15]])

# Distances from the reference point to the training rows, then sorted.
# NOTE(review): the meaning of the empty list passed as fourth argument is
# defined in Distances and not visible here -- confirm.
unsorted_distances = Distances.point_distances_against_others(
    point,
    store.data_train,
    store.tag_column,
    []
)
pointDistances = Distances.sort_distances(unsorted_distances)

# Print the first 11 entries (positions 0 through 10).
for i, dist in enumerate(pointDistances):
    if i >= 11:
        break
    print (f"Distancia a punto {dist.index}: {dist.distance}")

Distancia a punto 15: 0.0
Distancia a punto 24: 4.123105625617661
Distancia a punto 52: 4.795831523312719
Distancia a punto 47: 4.898979485566356
Distancia a punto 21: 5.0990195135927845
Distancia a punto 53: 5.196152422706632
Distancia a punto 48: 5.916079783099616
Distancia a punto 27: 6.164414002968976
Distancia a punto 42: 6.557438524302
Distancia a punto 28: 7.483314773547883
Distancia a punto 34: 7.810249675906654


In [115]:
# Group the sorted distances by class label and print the groups side by side.
# store.tags replaces the bare global `tags`, which no cell defines; the
# original only ran when that name was left over in the kernel.
# NOTE(review): 10 is passed as the third argument; the output shows 10 rows,
# so it looks like a row limit -- confirm against Distances.
dist_groups = Distances.group_point_distances_by_tags(pointDistances, store.tags)
Distances.print_contestants_side_by_side (dist_groups, store.tags, 10)

t: 1.000  2.000  3.000  4.000  5.000  6.000 
0  8.832  0.000  7.483  4.796  8.246  18.385 
1  8.888  4.123  7.810  4.899  8.307  19.079 
2  9.165  5.099  8.307  5.196  9.000  19.157 
3  9.747  6.164  8.485  5.916  9.055  19.442 
4  9.899  8.367  8.775  6.557  9.539  20.952 
5  18.083  10.247  9.381  8.888  11.705  21.024 
6  19.053  11.269  9.849  12.450  12.923  21.307 
7  22.738  12.961  9.899  16.492  13.038  22.825 
8  22.738  14.595  12.490  18.385  13.491  23.324 
9  24.515  17.205  16.031  18.547  21.190  23.345 


In [147]:
# Take the first row of the test frame as a one-row DataFrame and show it.
first_test_row = store.data_test.iloc[0]
point = pd.DataFrame([first_test_row])
print (point)

# Categorize the point against the training frame; the third argument is k.
# The call is the last expression, so its result is the cell output.
Categorization.categorize_point(store.data_train, point, 10, store.tags, store.tag_column)

     C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  ...  C25  C26  C27  C28  C29  \
352   3   2   2   3   2   0   0   0   2   3  ...    2    0    0    0    0   

     C30  C31  C32  C33  Class  
352    0    1    0   55      1  

[1 rows x 35 columns]


1