In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import roc_curve, roc_auc_score





In [2]:
# Load the three datasets used in this notebook.
# Forward slashes are used as the path separator because they are accepted on
# Windows as well as on POSIX systems, whereas a backslash only works on Windows.
df1 = pd.read_csv('dados/diabetes_numeric.csv')  # regression
df2 = pd.read_csv('dados/bloodtransf.csv')       # classification
df3 = pd.read_csv('dados/wine.csv')              # clustering


In [3]:
# Question 1: report the size of the regression dataset and preview its first rows.

print("Quantidade de linhas: ", len(df1.index), "Quantidade de colunas: ", len(df1.columns))
print('-' * 30)
df1.head(n=5)

Quantidade de linhas:  43 Quantidade de colunas:  3
------------------------------


Unnamed: 0,age,deficit,c_peptide
0,5.2,-8.1,4.8
1,8.8,-16.1,4.1
2,10.5,-0.9,5.2
3,10.6,-7.8,5.5
4,10.4,-29.0,5.0


In [4]:
# Question 2: report the size of the classification dataset and preview its first rows.

print("Quantidade de linhas: ", len(df2.index), "Quantidade de colunas: ", len(df2.columns))
print('-' * 30)
df2.head(n=5)

Quantidade de linhas:  748 Quantidade de colunas:  5
------------------------------


Unnamed: 0,V1,V2,V3,V4,Class
0,2,50,12500,98,2
1,0,13,3250,28,2
2,1,16,4000,35,2
3,2,20,5000,45,2
4,1,24,6000,77,1


In [5]:
# Question 3: report the size of the clustering dataset, how many distinct
# values the 'class' column holds, and preview the first rows.

print("Quantidade de linhas: ", len(df3.index), "Quantidade de colunas: ", len(df3.columns))
print('-' * 30)

# Number of distinct class labels (missing values, if any, would count as one).
size = df3['class'].nunique(dropna=False)
print(f'Quantidade de classes = {size}')

# Preview of rows and columns.
df3.head(n=5)


Quantidade de linhas:  178 Quantidade de colunas:  14
------------------------------
Quantidade de classes = 3


Unnamed: 0,class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280%2FOD315_of_diluted_wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [6]:
# Question 4: per-column count of missing values for each of the three frames.

for _name, _frame in (("df1", df1), ("df2", df2), ("df3", df3)):
    print(f"dados faltantes {_name}: \n", _frame.isna().sum())
    print("--" * 30)


dados faltantes df1: 
 age          0
deficit      0
c_peptide    0
dtype: int64
------------------------------------------------------------
dados faltantes df2: 
 V1       0
V2       0
V3       0
V4       0
Class    0
dtype: int64
------------------------------------------------------------
dados faltantes df3: 
 class                             0
Alcohol                           0
Malic_acid                        0
Ash                               0
Alcalinity_of_ash                 0
Magnesium                         0
Total_phenols                     0
Flavanoids                        0
Nonflavanoid_phenols              0
Proanthocyanins                   0
Color_intensity                   0
Hue                               0
OD280%2FOD315_of_diluted_wines    0
Proline                           0
dtype: int64
------------------------------------------------------------


## Regressão Linear

In [7]:
# Regression target is 'c_peptide'; every remaining column is a predictor.
y_df1 = df1['c_peptide']
X_df1 = df1.drop(columns=['c_peptide'])


In [8]:
# Standardise the regression predictors; the scaler statistics are learned
# from every row of X_df1.
sc = StandardScaler()
sc.fit(X_df1)
X = sc.transform(X_df1)

In [9]:
# Split into train and test sets first, on the unscaled predictors.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_df1,
                                                    y_df1,
                                                    test_size=0.37,
                                                    random_state=5762)

# Fit the scaler on the training rows only and apply that same transform to
# the test rows, so that test-set statistics never leak into the training
# features. Both splits end up as NumPy arrays.
sc = StandardScaler()
X_train1 = sc.fit_transform(X_train1)
X_test1 = sc.transform(X_test1)


In [10]:
# Question 5: fit ordinary least squares on the training split and predict
# the target for the held-out rows.
reg_lin = LinearRegression().fit(X_train1, y_train1)
df1_y_pred = reg_lin.predict(X_test1)


In [11]:
# Questions 5, 6 and 7: R2, mean absolute error and mean squared error on the
# test split, each rounded to two decimals.
for _label, _metric in (
    ('R2', metrics.r2_score),
    ('MAE', metrics.mean_absolute_error),
    ('MSE', metrics.mean_squared_error),
):
    print(f'{_label}: {round(_metric(y_test1, df1_y_pred),2)}')

R2: 0.02
MAE: 0.53
MSE: 0.44


## Classificação

In [12]:
# Distinct label values present in the classification dataset, in order of appearance.
pd.unique(df2['Class'])

array([2, 1], dtype=int64)

In [13]:
# Recode the labels 1/2 as 0/1.
name_to_class = {original: original - 1 for original in (1, 2)}

# Replace each label by its mapped value.
# NOTE(review): this overwrites df2['Class'] in place, so running the cell a
# second time remaps the already-recoded values (0 is not a key of the mapping
# and would become NaN). Re-run the load cell before re-running this one.
df2['Class'] = df2['Class'].map(name_to_class)

In [14]:
# Distinct label values after the recoding, in order of appearance.
pd.unique(df2['Class'])

array([1, 0], dtype=int64)

In [15]:
# Classification labels as an independent NumPy array.
y_df2 = df2['Class'].to_numpy(copy=True)


In [16]:
# Predictor frame: everything in df2 except the label column.
X_df2 = df2.drop(columns=['Class'])

# Sanity check of the remaining column names.
X_df2.columns

Index(['V1', 'V2', 'V3', 'V4'], dtype='object')

In [17]:
# Convert the predictor frame into a NumPy array (the name X_df2 is rebound to the array).
X_df2 = np.array(X_df2, copy=True)

In [18]:
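# NOTE(review): feature scaling is disabled for the classifier; the SVC in the
# next cell is trained on the raw X_df2 values.
#sc = StandardScaler()
#X = sc.fit_transform(X_df2)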

In [19]:
# Hold out 37% of the rows for testing, with a fixed seed for a repeatable split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_df2, y_df2, test_size=0.37, random_state=5762
)

# RBF-kernel support vector classifier trained on the (unscaled) training split.
clf = SVC(kernel='rbf').fit(X_train2, y_train2)

# Hard 0/1 label predictions for the held-out rows.
df2_y_pred = clf.predict(X_test2)

In [20]:
# Test-set accuracy, rounded to two decimals.
round(metrics.accuracy_score(y_true=y_test2, y_pred=df2_y_pred), ndigits=2)

0.79

In [21]:
# Per-class precision/recall/F1 table followed by the confusion matrix.
for _report in (metrics.classification_report, metrics.confusion_matrix):
    print(_report(y_test2, df2_y_pred))

              precision    recall  f1-score   support

           0       0.80      0.98      0.88       218
           1       0.50      0.07      0.12        59

    accuracy                           0.79       277
   macro avg       0.65      0.52      0.50       277
weighted avg       0.73      0.79      0.72       277

[[214   4]
 [ 55   4]]


In [22]:
# Print each classification metric under its own heading, all computed from
# the same test labels and hard predictions.
for _title, _score in (
    ('Matriz de Confusão\n', metrics.confusion_matrix),
    ('\nAcurácia\n', metrics.accuracy_score),
    ('\nAcurácia Balanceada por classe\n', metrics.balanced_accuracy_score),
    ('\nPrecision\n', metrics.precision_score),
    ('\nRecall\n', metrics.recall_score),
    ('\nF1\n', metrics.f1_score),
):
    print(_title, _score(y_test2, df2_y_pred))



Matriz de Confusão
 [[214   4]
 [ 55   4]]

Acurácia
 0.7870036101083032

Acurácia Balanceada por classe
 0.5247239931581402

Precision
 0.5

Recall
 0.06779661016949153

F1
 0.11940298507462686


In [23]:
# Random-guess baseline: one 0/1 label per test sample.
# A dedicated generator with a fixed seed (the same value used for the
# train/test splits) makes the baseline, and every score computed from it,
# reproducible across kernel restarts instead of changing on each run.
baseline_preds = np.random.default_rng(5762).choice([0, 1], size=len(y_test2))

print(baseline_preds)

[1 1 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1
 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0
 1 1 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 1 1
 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1
 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1
 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 1
 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 0]


In [24]:
# Question 11: area under the ROC curve for the random baseline and for the SVC.
# The first heading previously read '\Baseline nAUCROC\n': the backslash and
# the 'n' of the leading newline were split apart, which produced an invalid
# '\B' escape and a garbled label.
# NOTE(review): both scores are computed from hard 0/1 labels rather than from
# continuous decision scores; confirm that this is the intended comparison.

print('\nBaseline AUCROC\n', round(metrics.roc_auc_score(y_test2, baseline_preds),2))
print('\nAUCROC\n', round(metrics.roc_auc_score(y_test2, df2_y_pred),2))


\Baseline nAUCROC
 0.57

AUCROC
 0.52


## Cluster

In [25]:
# Question 12: number of distinct values in the 'class' column of the wine data
# (missing values, if any, would count as one).
size = df3['class'].nunique(dropna=False)
print(f'Quantidade de classes = {size}')

Quantidade de classes = 3


In [26]:
# Recode the labels 1/2/3 as 0/1/2.
name_to_class = {original: original - 1 for original in (1, 2, 3)}

# Replace each label by its mapped value.
# NOTE(review): this overwrites df3['class'] in place, so running the cell a
# second time remaps the already-recoded values (0 is not a key of the mapping
# and would become NaN). Re-run the load cell before re-running this one.
df3['class'] = df3['class'].map(name_to_class)

In [27]:
# Labels as an independent NumPy array.
y_df3 = df3['class'].to_numpy(copy=True)

# Column order of df3 at this point.
# NOTE(review): 'class' has not been dropped yet, so this list also contains
# the label column, not only the features.
feature_list = df3.columns.tolist()

In [28]:
# Feature frame: everything in df3 except the label column.
X_df3 = df3.drop(columns=['class'])

# Sanity check of the remaining column names.
X_df3.columns

Index(['Alcohol', 'Malic_acid', 'Ash', 'Alcalinity_of_ash', 'Magnesium',
       'Total_phenols', 'Flavanoids', 'Nonflavanoid_phenols',
       'Proanthocyanins', 'Color_intensity', 'Hue',
       'OD280%2FOD315_of_diluted_wines', 'Proline'],
      dtype='object')

In [29]:
# Whole wine frame (label column included) as an independent NumPy array.
data = df3.to_numpy(copy=True)

In [30]:
# Hold out 37% of the wine rows; the labels are split alongside the features
# with the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_df3, y_df3, test_size=0.37, random_state=5762
)

In [31]:
# Fit a 3-cluster k-means model on the training features, with a fixed seed.
clustering = KMeans(n_clusters=3, random_state=5762).fit(X_train)



In [32]:
# Use the fitted k-means model to assign a cluster id to every row of the test split.
df3_y_pred = clustering.predict(X=X_test)

In [33]:
# Evaluate the clustering on the test split.

# Question 13: silhouette coefficient, from the test features and assigned clusters.
silhouette = metrics.silhouette_score(X_test, df3_y_pred)
print('Coeficiente de Silhueta\n', round(silhouette, 2))

# Question 14: Davies-Bouldin score, from the same inputs.
davies_bouldin = metrics.davies_bouldin_score(X_test, df3_y_pred)
print('\nDavies-Bouldin Score\n', round(davies_bouldin, 2))

# Cross-tabulation of the true classes against the assigned clusters.
contingency = metrics.cluster.contingency_matrix(y_test, df3_y_pred)
print('\nMatriz de Contingência\n', contingency)

# Question 15: mutual information between the true classes and the assigned clusters.
mutual_info = metrics.mutual_info_score(y_test, df3_y_pred)
print('\nMutual information\n', round(mutual_info, 2))

Coeficiente de Silhueta
 0.6

Davies-Bouldin Score
 0.5

Matriz de Contingência
 [[ 0 12  5]
 [23  0  7]
 [ 4  0 15]]

Mutual information
 0.49
