<a href="https://colab.research.google.com/github/Sergiodjish/DiplomadoML2023/blob/main/RegresionLog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Regresión Logística Binomial

#### Paso 1: Librerías

In [1]:
##Importar librerias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

## Funciones

In [20]:
### Load a dataset stored as .csv or .xlsx
def importDatos(ruta, archivo, ext):
    """Read the file ``ruta + archivo + ext`` into a DataFrame.

    Parameters
    ----------
    ruta : str
        Directory prefix; it is concatenated as-is, so it must already end
        with the path separator.
    archivo : str
        File name without extension.
    ext : str
        ``'.csv'`` (read with ``pd.read_csv``) or ``'.xlsx'`` (read with
        ``pd.read_excel``).

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. For any other extension a message is printed and
        ``None`` is returned.
    """
    if ext not in ('.csv', '.xlsx'):
        print('Extensión Diferente')
        return None

    lector = pd.read_csv if ext == '.csv' else pd.read_excel
    return lector(ruta + archivo + ext)

# Fill missing numeric values with the column mean or median
def imputacionNa(df, nomCol, param):
    """Replace NaN in ``df[nomCol]`` with the column mean or median.

    The column is overwritten inside ``df`` and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the numeric column to impute.
    param : str
        ``'media'`` selects the mean; any other value selects the median.

    Returns
    -------
    pandas.Series
        The imputed column.
    """
    columna = df[nomCol]
    relleno = columna.mean() if param == 'media' else columna.median()
    df[nomCol] = columna.fillna(relleno)
    return df[nomCol]

# Fill missing categorical values with the most frequent category
def imputacionCatNa(df, nomCol):
    """Replace NaN in ``df[nomCol]`` with the column mode.

    The column is overwritten inside ``df`` and also returned. When several
    values tie for the mode, the first one reported by ``Series.mode`` is
    used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the categorical column to impute.

    Returns
    -------
    pandas.Series
        The imputed column. If the column has no observed value at all
        there is nothing to impute with, and it is returned unchanged.
    """
    modas = df[nomCol].mode()
    if modas.empty:
        # All values are missing: mode() is empty, so indexing its first
        # element would raise instead of leaving the column as it is.
        return df[nomCol]
    df[nomCol] = df[nomCol].fillna(modas.iloc[0])
    return df[nomCol]

# Replace one categorical value with another
def imputacionCat(df, nomCol, busqueda, reempl):
    """Replace every occurrence of ``busqueda`` in ``df[nomCol]`` by ``reempl``.

    The column is overwritten inside ``df`` and also returned; all other
    values are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the column to edit.
    busqueda : object
        Value to look for (compared with ``==``).
    reempl : object
        Value written where the comparison is true.

    Returns
    -------
    pandas.Series
        The updated column.
    """
    actual = df[nomCol]
    coincide = actual == busqueda
    df[nomCol] = np.where(coincide, reempl, actual)
    return df[nomCol]

# One-hot encode a column
def convDummies(df, nomCol):
    """Return a new frame where ``nomCol`` is replaced by its dummy columns.

    The dummy columns come from ``pd.get_dummies(df[nomCol])`` and are
    appended to the right of the existing columns. ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    nomCol : str
        Name of the column to encode and drop.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the dummy columns, without ``nomCol``.
    """
    indicadoras = pd.get_dummies(df[nomCol])
    ampliado = pd.concat([df, indicadoras], axis=1)
    return ampliado.drop(columns=[nomCol])

# Inspect the categories of a categorical / text column
def revCategoricos(df, nomCol):
    """Return the frequency of each distinct value in ``df[nomCol]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    nomCol : str
        Name of the column to count.

    Returns
    -------
    pandas.Series
        Result of ``Series.value_counts()`` for that column.
    """
    return df[nomCol].value_counts()
    
## Print a per-column report of missing values (NaN)
def datos_NAN(df):
    """Print the shape of ``df``, its total NaN count and a per-column table.

    The table lists, for each column, its name, dtype, number of unique
    values and number of NaN values. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. For any other type a message with the received
        type is printed instead.
    """
    if not isinstance(df, pd.DataFrame):
        print("Se esperaba un dataframe: %15s" % (type(df)))
        return

    na_por_columna = df.isna().sum()
    print("Dimesiones: %d filas, %d columnas" % (df.shape[0], df.shape[1]))
    print("Total de Valores NAN: %d" % (na_por_columna.sum()))
    print("%40s %10s  %10s %10s" % ("Nombre Columna", "Tipo Dato", "#Unicos", "Cant Valores NAN"))

    # Walk the four aligned sequences together instead of indexing the
    # label-indexed Series with integers: positional `serie[i]` is deprecated
    # in recent pandas and fails outright when the column labels are integers.
    filas = zip(df.columns, df.dtypes, df.nunique(), na_por_columna)
    for nombre, tipo, unicos, nulos in filas:
        print("%40s %10s  %10s %10s" % (nombre, tipo, unicos, nulos))

# Cast a column to another dtype (e.g. text to float64)
def cambioTextNum(df, nomCol, tipo):
    """Cast ``df[nomCol]`` to ``tipo`` with ``Series.astype``.

    The column is overwritten inside ``df`` and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the column to convert.
    tipo : str or type
        Target dtype, passed straight to ``astype``.

    Returns
    -------
    pandas.Series
        The converted column.
    """
    convertida = df[nomCol].astype(tipo)
    df[nomCol] = convertida
    return df[nomCol]

# Turn a categorical column into two values (typically 1 / 0)
def camTextBi(df, nomCol, busqueda, camuno, camdos):
    """Binarise ``df[nomCol]``.

    Rows equal to ``busqueda`` receive ``camuno``; every other row receives
    ``camdos``. The column is overwritten inside ``df`` and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the column to binarise.
    busqueda : object
        Value to look for (compared with ``==``).
    camuno : object
        Value written where the column equals ``busqueda``.
    camdos : object
        Value written everywhere else.

    Returns
    -------
    pandas.Series
        The binarised column.
    """
    coincide = df[nomCol] == busqueda
    df[nomCol] = np.where(coincide, camuno, camdos)
    return df[nomCol]

## Label-encode a column
def Codificacion(df, nomCol):
    """Replace ``df[nomCol]`` by the codes from a fresh ``LabelEncoder``.

    The column is overwritten inside ``df`` and also returned. The fitted
    encoder is not kept, so the code-to-label mapping is not available to
    the caller afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column; modified in place.
    nomCol : str
        Name of the column to encode.

    Returns
    -------
    pandas.Series
        The encoded column.
    """
    df[nomCol] = LabelEncoder().fit_transform(df[nomCol])
    return df[nomCol]

## Small-multiples histogram
def multigraf(df, coluno, coldos):
    """Draw one histogram of ``coldos`` per distinct value of ``coluno``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    coluno : str
        Column whose values define the facet columns.
    coldos : str
        Column passed to ``sns.histplot`` in every facet.

    Returns
    -------
    seaborn.FacetGrid
        The grid holding the plots.
    """
    rejilla = sns.FacetGrid(df, col=coluno)
    rejilla.map(sns.histplot, coldos)
    return rejilla

## Small-multiples bar plot for three variables
def multigrafTres(df, coluno, coldos, coltres, orden=None):
    """Draw one bar plot of ``coltres`` by ``coldos`` per value of ``coluno``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    coluno : str
        Column whose values define the facet columns.
    coldos : str
        Column placed on the x axis of every bar plot.
    coltres : str
        Column placed on the y axis of every bar plot.
    orden : list, optional
        Order of the ``coldos`` categories, forwarded to ``sns.barplot`` as
        ``order``. Defaults to ``['M', 'F']``, the value that used to be
        hard-coded, so existing calls behave as before.

    Returns
    -------
    seaborn.FacetGrid
        The grid holding the plots.
    """
    if orden is None:
        orden = ['M', 'F']
    h = sns.FacetGrid(df, col=coluno, height=4, aspect=.5)
    h.map(sns.barplot, coldos, coltres, order=orden)
    return h

## Bar-plot and scatter-plot helpers
def barras(df, colx, coly, cruce):
    """Draw a grouped bar plot with ``sns.catplot(kind='bar')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    colx : str
        Column for the x axis.
    coly : str
        Column for the y axis.
    cruce : str
        Column used as ``hue``.

    Returns
    -------
    None
        The grid created by seaborn is not returned.
    """
    sns.catplot(data=df, x=colx, y=coly, hue=cruce, kind='bar')

def dispersion(df, cruce, num, numdos):
    """Draw a scatter plot of ``numdos`` against ``num`` coloured by ``cruce``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    cruce : str
        Column used as ``hue`` (palette ``'flare'``).
    num : str
        Column for the x axis.
    numdos : str
        Column for the y axis.

    Returns
    -------
    seaborn.FacetGrid
        The grid holding the plot, with its legend added.
    """
    rejilla = sns.FacetGrid(df, hue=cruce, palette='flare', height=5)
    rejilla.map(sns.scatterplot, num, numdos, s=100, alpha=0.5)
    rejilla.add_legend()
    return rejilla

from sklearn.preprocessing import MinMaxScaler
def normaData(df):
    """Min-max scale every column of ``df`` to the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``.values`` are passed to ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled values and the same index and columns as
        ``df``. ``df`` itself is not modified.
    """
    valores = df.values
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(valores)

    # The previous version also built a Min/Max summary DataFrame here and
    # discarded it immediately; that dead expression has been removed.
    normalizados = scaler.transform(valores)
    return pd.DataFrame(normalizados, index=df.index, columns=df.columns)
    
## Reshape a correlation matrix into tidy (long) format
def tidy_corr_matrix(corr_mat):
    '''
    Convert a pandas correlation matrix into tidy (long) format.

    Each off-diagonal cell becomes one row with the columns ``variable_1``,
    ``variable_2``, ``r`` and ``abs_r`` (absolute value of ``r``). Rows where
    both variable names are equal are dropped, and the result is sorted by
    ``abs_r`` in descending order.
    '''
    tidy = corr_mat.stack().reset_index()
    tidy.columns = ['variable_1', 'variable_2', 'r']

    # Take an explicit copy of the filtered rows before adding a column;
    # assigning on the filtered slice directly triggers pandas'
    # SettingWithCopy warning.
    fuera_diagonal = tidy['variable_1'] != tidy['variable_2']
    tidy = tidy.loc[fuera_diagonal, :].copy()
    tidy['abs_r'] = np.abs(tidy['r'])

    return tidy.sort_values('abs_r', ascending=False)

## Paso 2: Importar Dataset

## Análisis inicial

In [6]:
### Load the data
# NOTE(review): '/content/' is a hard-coded absolute directory, presumably the
# Colab working directory — confirm before running anywhere else.
data = importDatos('/content/','weatherAUS','.csv')

data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [10]:
## Missing-value (NaN) report for the freshly loaded data
# NOTE(review): this cell's execution count (10) is higher than that of the
# numeric-imputation cell that follows (9), so the saved output reflects the
# data after that imputation already ran; re-run top to bottom to refresh it.
datos_NAN(data)

Dimesiones: 145460 filas, 23 columnas
Total de Valores NAN: 31648
                          Nombre Columna  Tipo Dato     #Unicos Cant Valores NAN
                                    Date     object        3436          0
                                Location     object          49          0
                                 MinTemp    float64         389          0
                                 MaxTemp    float64         505          0
                                Rainfall    float64         681          0
                             Evaporation    float64         358          0
                                Sunshine    float64         145          0
                             WindGustDir     object          16      10326
                           WindGustSpeed    float64          67          0
                              WindDir9am     object          16      10566
                              WindDir3pm     object          16       4228
                            

In [9]:
## Fill NaN in every numeric weather column with the column median
# imputacionNa uses the median for any `param` other than 'media', which is
# why the 'Mediana' argument selects the median here.
columnas_numericas = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
]
for columna in columnas_numericas:
    data[columna] = imputacionNa(data, columna, 'Mediana')


In [14]:
## Fill NaN in the categorical columns with each column's most frequent value
columnas_categoricas = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
for columna in columnas_categoricas:
    data[columna] = imputacionCatNa(data, columna)

In [15]:
## Run the report again to confirm that no NaN values remain
datos_NAN(data)

Dimesiones: 145460 filas, 23 columnas
Total de Valores NAN: 0
                          Nombre Columna  Tipo Dato     #Unicos Cant Valores NAN
                                    Date     object        3436          0
                                Location     object          49          0
                                 MinTemp    float64         389          0
                                 MaxTemp    float64         505          0
                                Rainfall    float64         681          0
                             Evaporation    float64         358          0
                                Sunshine    float64         145          0
                             WindGustDir     object          16          0
                           WindGustSpeed    float64          67          0
                              WindDir9am     object          16          0
                              WindDir3pm     object          16          0
                            Wind

### Recategorización de los valores

In [35]:
## One-hot encode the categorical 'Location' column
# NOTE(review): this cell sits above the cell that first assigns `dataN`, and
# its execution count (35) is higher than that cell's (16), so a top-to-bottom
# run on a fresh kernel hits an undefined `dataN` here — consider moving this
# cell after the wind-direction / rain encoding cells.
# NOTE(review): not re-runnable as written: convDummies drops 'Location', so a
# second run looks up a column that no longer exists.
dataN = convDummies(dataN, 'Location')

dataN.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Townsville,Tuggeranong,Uluru,WaggaWagga,Walpole,Watsonia,Williamtown,Witchcliffe,Wollongong,Woomera
0,2008-12-01,13.4,22.9,0.6,4.8,8.4,7,44.0,7,3,...,0,0,0,0,0,0,0,0,0,0
1,2008-12-02,7.4,25.1,0.0,4.8,8.4,3,44.0,3,6,...,0,0,0,0,0,0,0,0,0,0
2,2008-12-03,12.9,25.7,0.0,4.8,8.4,6,46.0,7,6,...,0,0,0,0,0,0,0,0,0,0
3,2008-12-04,9.2,28.0,0.0,4.8,8.4,2,24.0,5,0,...,0,0,0,0,0,0,0,0,0,0
4,2008-12-05,17.5,32.3,1.0,4.8,8.4,7,41.0,2,3,...,0,0,0,0,0,0,0,0,0,0


In [16]:
## Work on a copy of the cleaned data
# A plain `dataN = data` only binds a second name to the same DataFrame, so
# every in-place change made through `dataN` would also alter `data`.
# `.copy()` gives `dataN` its own data and leaves `data` untouched.
dataN = data.copy()

In [21]:
## Collapse the three-letter wind directions into NW / NE / SE / SW, then label-encode
reduccion_rumbos = {
    'SSE': 'SE', 'WSW': 'SW', 'SSW': 'SW', 'WNW': 'NW',
    'ENE': 'NE', 'ESE': 'SE', 'NNE': 'NE', 'NNW': 'NW',
}
for rumbo, principal in reduccion_rumbos.items():
    dataN.WindGustDir = imputacionCat(dataN, 'WindGustDir', rumbo, principal)

## Label encoder
from sklearn import preprocessing

label_e = preprocessing.LabelEncoder()
dataN.WindGustDir = label_e.fit_transform(dataN.WindGustDir)

revCategoricos(dataN, 'WindGustDir')

6    26772
5    26006
3    22994
2    21785
7    20241
1     9313
0     9181
4     9168
Name: WindGustDir, dtype: int64

In [33]:
## Collapse the three-letter wind directions into NW / NE / SE / SW, then label-encode
reduccion_rumbos = {
    'SSE': 'SE', 'WSW': 'SW', 'SSW': 'SW', 'WNW': 'NW',
    'ENE': 'NE', 'ESE': 'SE', 'NNE': 'NE', 'NNW': 'NW',
}
for rumbo, principal in reduccion_rumbos.items():
    dataN.WindDir9am = imputacionCat(dataN, 'WindDir9am', rumbo, principal)

## Label encoder
from sklearn import preprocessing

label_e = preprocessing.LabelEncoder()
dataN.WindDir9am = label_e.fit_transform(dataN.WindDir9am)

revCategoricos(dataN, 'WindDir9am')

5    26029
3    24143
2    23636
6    23034
1    22324
0     9176
4     8659
7     8459
Name: WindDir9am, dtype: int64

In [28]:
## Collapse the three-letter wind directions into NW / NE / SE / SW, then label-encode
reduccion_rumbos = {
    'SSE': 'SE', 'WSW': 'SW', 'SSW': 'SW', 'WNW': 'NW',
    'ENE': 'NE', 'ESE': 'SE', 'NNE': 'NE', 'NNW': 'NW',
}
for rumbo, principal in reduccion_rumbos.items():
    dataN.WindDir3pm = imputacionCat(dataN, 'WindDir3pm', rumbo, principal)

# Codificacion fits a fresh LabelEncoder on the column and writes the codes back
dataN.WindDir3pm = Codificacion(dataN, 'WindDir3pm')

revCategoricos(dataN, 'WindDir3pm')

5    32970
6    27028
3    25354
2    22710
7    10110
4     9926
1     8890
0     8472
Name: WindDir3pm, dtype: int64

In [29]:
## Binarise RainToday: 'No' -> 0, any other value -> 1
# NOTE(review): not re-runnable — once the column holds 0/1, no row equals
# 'No' any more, so a second run sets every row to 1.
dataN['RainToday'] = camTextBi(dataN, 'RainToday', 'No', 0, 1)
revCategoricos(dataN, 'RainToday')

0    113580
1     31880
Name: RainToday, dtype: int64

In [30]:
## Impute missing RainTomorrow with the mode, then binarise: 'No' -> 0, any other value -> 1
# NOTE(review): RainTomorrow is presumably the target of the model; filling a
# missing target with the mode invents labels — confirm this is intended
# rather than dropping those rows.
# NOTE(review): not re-runnable — after binarisation no row equals 'No', so a
# second run sets every row to 1.
dataN['RainTomorrow'] = imputacionCatNa(dataN, 'RainTomorrow')
dataN['RainTomorrow'] = camTextBi(dataN, 'RainTomorrow', 'No', 0, 1)

revCategoricos(dataN, 'RainTomorrow')

0    113583
1     31877
Name: RainTomorrow, dtype: int64

In [36]:
# Column names, non-null counts and dtypes of the encoded frame
dataN.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 71 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Date              145460 non-null  object 
 1   MinTemp           145460 non-null  float64
 2   MaxTemp           145460 non-null  float64
 3   Rainfall          145460 non-null  float64
 4   Evaporation       145460 non-null  float64
 5   Sunshine          145460 non-null  float64
 6   WindGustDir       145460 non-null  int64  
 7   WindGustSpeed     145460 non-null  float64
 8   WindDir9am        145460 non-null  int64  
 9   WindDir3pm        145460 non-null  int64  
 10  WindSpeed9am      145460 non-null  float64
 11  WindSpeed3pm      145460 non-null  float64
 12  Humidity9am       145460 non-null  float64
 13  Humidity3pm       145460 non-null  float64
 14  Pressure9am       145460 non-null  float64
 15  Pressure3pm       145460 non-null  float64
 16  Cloud9am          14