In [554]:
import pydot
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Dataframe:

### Seaborn Iris:

In [555]:
df_iris = sns.load_dataset("iris")

In [556]:
display(df_iris.head())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Golf:

In [557]:
path = r'../../../../data/golf_df.xlsx'
df_golf = pd.read_excel(
    io=path
)

In [558]:
display(df_golf.head())

Unnamed: 0,clima,temp,umidade,vento,jogar
0,sunny,hot,high,no,no
1,sunny,hot,high,yes,no
2,overcast,hot,high,no,yes
3,rainy,mild,high,no,yes
4,rainy,cool,normal,no,yes


## Código:

### Data Understanding:

In [559]:
class DataUnderstanding():
    def verificando_as_dimensões_do_dataframe(self, dataframe):
        dataframe = dataframe.shape
        print(f'Linhas: {dataframe[0]} \nColunas: {dataframe[1]}')

    def descrição(self, dataframe):
        dataframe = dataframe.describe().round(decimals=2)
        display(dataframe)

    def verificando_tipos(self, dataframe):
        dataframe = (
            dataframe.dtypes
            .to_frame('Tipos')
        )
        display(dataframe)

    def verificando_valores_nulos(self, dataframe):
        valores_nulos = dataframe.isnull().sum()
        percent_missing = (valores_nulos / len(dataframe))
        dataframe = pd.DataFrame(
            {
            'Quantidade': valores_nulos,
            'Porcentagem': percent_missing
            }
        )
        dataframe = dataframe.style.format('{:.2%}', subset=['Porcentagem'])
        display(dataframe)
    
    def verificando_valores_duplicados(self, dataframe):
        valores_repetidos = dataframe.apply(lambda x: x.duplicated()).sum()
        percent_missing = (valores_repetidos / len(dataframe))
        dataframe = pd.DataFrame(
            {
                'Quantidade': valores_repetidos,
                'Porcentagem': percent_missing
            },
            index=dataframe.columns
        )

        dataframe = dataframe.style.format('{:.2%}', subset=['Porcentagem'])
        display(dataframe)

    def frequencia_da_repetição_dos_valores(self, dataframe, coluna):
        dataframe = dataframe[coluna].value_counts().reset_index()
        return dataframe

    def verificando_correlação(self, dataframe):
        dataframe = dataframe.corr()
        return dataframe

### Data Preparetion:

In [560]:
class DataPreparetion(DataUnderstanding):
    def removendo_nulos(self, dataframe):
        return dataframe.dropna()

    def substituindo_valores(self, dataframe, colunas, valores):      
        return dataframe[colunas].replace(valores)
    
    def convertendo_para_inteiro(self, dataframe, colunas):
        return dataframe[colunas].astype(int)

    def dummy(self, dataframe, colunas=[]):
        dataframe = pd.get_dummies(dataframe)#, prefix=colunas)
        return dataframe
    
    def label_endcode(self, dataframe):
        training = LabelEncoder().fit(dataframe)
        dataframe = training.transform(dataframe)
        return dataframe
    
    def normalizando_os_dados(self, dataframe):
        return scale(dataframe)

### Data Visualization:

In [561]:
class DataVisualization(DataPreparetion):
    def gráfico_de_barras(self, dataframe, x, y):
        sns.barplot(
            data=dataframe,
            x=x,
            y=y
        )
        plt.show()

    def gráfico_de_linha_desempenho_do_modelo(self, y_true, y_pred):
        index = [i for i in range(1, len(y_pred)+1, 1)]

        fig = plt.figure(figsize=(12, 8))
        plt.plot(index, y_true, color='blue')
        plt.plot(index, y_pred, color='red')
        plt.xlabel('index')
        plt.ylabel('valores')
        plt.show()

    def gráfico_de_regressão_lmplot(self, dataframe, x, y, titulo):
        sns.lmplot(
            data=dataframe, 
            x=x,
            y=y
        )
        ax = plt.gca()
        ax.set_title(titulo)
        plt.show()

    def gráfico_de_correlação_pairplot(self, dataframe):
        sns.pairplot(dataframe)
        plt.show()

    def gráfico_de_correlação_heatmap(self, dataframe):
        sns.set(style='white')
        corr = dataframe.corr()
        plt.figure(figsize=(16, 10))
        sns.heatmap(corr, annot=True, cmap='RdBu_r', fmt='.2f', annot_kws={'size': 12})
        plt.show()

    def gráfico_outliers_boxplot(self, dataframe, figsize):
        dataframe = dataframe.select_dtypes(include='number')
        dataframe = dataframe.apply(scale)
        fig = plt.figure(figsize=figsize)
        sns.boxplot(data=dataframe)
        plt.show()

    def gráfico_de_distribuição_histplot(self, dataframe, coluna):
        sns.histplot(data=dataframe[coluna])

        plt.show()

# Regressão Logística:

## Código:

In [562]:
class RegressãoLogística(DataVisualization):
    def training(self, dataframe, x=[], y=[]):
        x_train, x_test, y_train, y_test = train_test_split(
            dataframe[x], 
            dataframe[y], 
            test_size=0.2, 
            random_state=42
        )

        training = (
            LogisticRegression().fit(x_train, y_train)
        )

        y_pred = training.predict(x_test)

        return training, y_pred
    
    def predictions(self, training, input):
        return training.predict(input)
        

if __name__ == '__main__':
    regressão_logística = RegressãoLogística()

## Output:

### Data Understanding:

In [563]:
display(df_iris.head())

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [564]:
df_iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

### Modelo:

In [565]:
x = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
y = ['species']

In [566]:
training, y_pred = regressão_logística.training(
    dataframe=df_iris,
    x=x,
    y=y
)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [567]:
df_iris['predictions'] = regressão_logística.predictions(
    training=training,
    input=df_iris[x]
)

In [568]:
display(df_iris[df_iris['species'] != df_iris['predictions']])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,predictions
70,5.9,3.2,4.8,1.8,versicolor,virginica
77,6.7,3.0,5.0,1.7,versicolor,virginica
83,6.0,2.7,5.1,1.6,versicolor,virginica


## Exercise:

### Código:

In [569]:
class exercise(DataVisualization):
    def training(self, dataframe, x=[], y=[]):
        x_train, x_test, y_train, y_test = train_test_split(
            dataframe[x], 
            dataframe[y], 
            test_size=0.2, 
            random_state=42
        )

        training = (
            LogisticRegression().fit(x_train, y_train)
        )

        y_pred = training.predict(x_test)

        return training, y_pred
    
    def predictions(self, training, input):
        return training.predict(input)
        

if __name__ == '__main__':
    exercise = exercise()

### Output:

### Data Understanding:

In [570]:
display(df_golf.head())

Unnamed: 0,clima,temp,umidade,vento,jogar
0,sunny,hot,high,no,no
1,sunny,hot,high,yes,no
2,overcast,hot,high,no,yes
3,rainy,mild,high,no,yes
4,rainy,cool,normal,no,yes


In [571]:
df_golf.columns

Index(['clima', 'temp', 'umidade', 'vento', 'jogar'], dtype='object')

In [572]:
for coluna in df_golf.columns:
    print(f'{coluna}:')
    display(exercise.frequencia_da_repetição_dos_valores(
        dataframe=df_golf,
        coluna=coluna
    ))

clima:


Unnamed: 0,index,clima
0,sunny,5
1,rainy,5
2,overcast,4


temp:


Unnamed: 0,index,temp
0,mild,6
1,hot,4
2,cool,4


umidade:


Unnamed: 0,index,umidade
0,high,7
1,normal,7


vento:


Unnamed: 0,index,vento
0,no,8
1,yes,6


jogar:


Unnamed: 0,index,jogar
0,yes,9
1,no,5


### Data Preparetion:

In [573]:
colunas = ['clima', 'temp', 'umidade', 'vento']

In [574]:
for coluna in colunas:
    df_golf[coluna] = exercise.label_endcode(
        dataframe=df_golf[coluna],
    )

### Modelo:

In [575]:
df_golf.columns

Index(['clima', 'temp', 'umidade', 'vento', 'jogar'], dtype='object')

In [576]:
x = ['clima', 'temp', 'umidade', 'vento']
y = ['jogar']

In [577]:
training, y_pred = exercise.training(
    dataframe=df_golf,
    x=x,
    y=y
)

  y = column_or_1d(y, warn=True)


In [578]:
df_golf['predictions'] = exercise.predictions(
    training=training,
    input=df_golf[x]
)

In [579]:
display(df_golf.head())

Unnamed: 0,clima,temp,umidade,vento,jogar,predictions
0,2,1,0,0,no,yes
1,2,1,0,1,no,no
2,0,1,0,0,yes,yes
3,1,2,0,0,yes,yes
4,1,0,1,0,yes,yes


In [580]:
display(df_golf[df_golf['jogar'] != df_golf['predictions']])

Unnamed: 0,clima,temp,umidade,vento,jogar,predictions
0,2,1,0,0,no,yes
5,1,0,1,1,no,yes
10,2,2,1,1,yes,no
