In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf

# Regresión Logística

In [5]:
# Load seaborn's "penguins" example dataset and drop every row that contains
# at least one missing value, so the cells below only see complete rows.
penguins_df = sns.load_dataset("penguins").dropna()
penguins_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


## Creamos un modelo para determinar el sexo de un pingüino

In [None]:
# Add `numeric_sex`: the `sex` label encoded as an integer (Female -> 0,
# Male -> 1) so it can serve as the binary response of a logistic regression.
#
# An explicit mapping is used instead of `Series.replace([...], [0, 1])`:
# replacing strings with integers depends on pandas' implicit downcasting of
# the object column, which is deprecated and emits a FutureWarning on recent
# pandas releases. `map` turns any label missing from the mapping into NaN,
# and the `astype("int64")` cast then raises instead of letting an unexpected
# label slip silently into the model.
penguins_df = penguins_df.assign(
    numeric_sex=lambda df: df.sex.map({"Female": 0, "Male": 1}).astype("int64")
)
penguins_df

## Regresión logística

In [None]:
# Fit a logit model for the 0/1 `numeric_sex` response using the island
# (wrapped in C(...) so it is treated as categorical) and three numeric
# measurements as predictors, then display the fitted coefficients.
model_sex = smf.logit(
    formula="numeric_sex ~ C(island) +	bill_length_mm + bill_depth_mm	+ flipper_length_mm",
    data=penguins_df,
).fit()
model_sex.params

In [25]:
# Put the observed sex code next to the model's prediction for the same rows.
# `predict()` is called without new data, and its output is rounded so each
# prediction becomes a 0.0 / 1.0 label comparable with `actual_value`.
is_male_predictions = pd.DataFrame(
    {
        "actual_value": penguins_df.numeric_sex,
        "predicted_values": model_sex.predict().round(),
    }
)
is_male_predictions

Unnamed: 0,actual_value,predicted_values
0,1,0.0
1,0,0.0
2,0,0.0
4,0,1.0
5,1,1.0
...,...,...
338,0,0.0
340,0,0.0
341,1,1.0
342,0,0.0


## Regresión Logística para clasificar Especies

In [None]:
# Count how many penguins fall into each (island, sex, species) combination
# and return the tally as a flat table.
(
    penguins_df
    .value_counts(subset=["island", "sex", "species"])
    .reset_index()
)

In [19]:
# List the distinct species labels present in the cleaned data.
penguins_df["species"].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [30]:
# Add `is_adelie`: 1 when the row's species is "Adelie", 0 otherwise. This is
# the binary response for the species classifier fitted below.
#
# A direct comparison replaces the original
# `Series.replace(["Adelie", "Chinstrap", "Gentoo"], [1, 0, 0])`, which relies
# on pandas' deprecated implicit downcasting and triggers a FutureWarning.
# For the three species present in this dataset the values are identical.
# Any other label would now become 0 instead of being left as a string.
penguins_df = penguins_df.assign(
    is_adelie=lambda df: df.species.eq("Adelie").astype("int64")
)
penguins_df

  is_adelie= lambda df: df.species.replace(["Adelie", "Chinstrap", "Gentoo"],[1,0,0])


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,numeric_sex,is_adelie
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,1,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,0,1
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,0,1
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,0,1
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,1,1
...,...,...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female,0,0
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female,0,0
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male,1,0
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female,0,0


In [31]:
# Fit a logit model for the 0/1 `is_adelie` response using flipper length and
# sex (wrapped in C(...) so it is treated as categorical) as predictors.
model_is_adelie = smf.logit(
    formula="is_adelie ~ flipper_length_mm + C(sex)",
    data=penguins_df,
).fit()

Optimization terminated successfully.
         Current function value: 0.355225
         Iterations 8


In [32]:
# Put the observed `is_adelie` flag next to the model's prediction for the
# same rows. `predict()` is called without new data, and its output is rounded
# so each prediction becomes a 0.0 / 1.0 label.
is_adelie_predictions = pd.DataFrame(
    {
        "actual_value": penguins_df.is_adelie,
        "predicted_value": model_is_adelie.predict().round(),
    }
)
is_adelie_predictions

Unnamed: 0,actual_value,predicted_value
0,1,1.0
1,1,1.0
2,1,0.0
4,1,1.0
5,1,1.0
...,...,...
338,0,0.0
340,0,0.0
341,0,0.0
342,0,0.0


In [33]:
# Tally every (actual, predicted) pair: matching pairs are correct
# classifications, mismatched pairs are errors.
is_adelie_predictions.value_counts(
    subset=["actual_value", "predicted_value"]
).reset_index()

Unnamed: 0,actual_value,predicted_value,count
0,0,0.0,151
1,1,1.0,129
2,0,1.0,36
3,1,0.0,17
