In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
import matplotlib.pyplot as plt


In [35]:
#📌 1. Importar os dados

import pandas as pd

# Carregar o CSV
df = pd.read_csv('winequality-red.csv')

# Ver as primeiras linhas
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [36]:
#📌 2. Verificar informações e dados ausentes

# Informações básicas
print(df.info())

print()

# Verificar valores ausentes
print(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB
None

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides       

In [37]:
#📌 3. Estatísticas descritivas

print(df.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [38]:

# Aplicando Label Encoding (caso existam colunas categóricas)
colunas_categoricas = df.select_dtypes(include=['object']).columns

if len(colunas_categoricas) > 0:
    le = LabelEncoder()
    for coluna in colunas_categoricas:
        df[coluna] = le.fit_transform(df[coluna].astype(str))
    print("Label Encoding aplicado nas colunas:", list(colunas_categoricas))
else:
    print("Nenhuma coluna categórica encontrada.")
        

Nenhuma coluna categórica encontrada.


In [39]:
#📌 4. Tratar os dados (normalização, remoção de outliers etc.)

from sklearn.preprocessing import StandardScaler

# Separar variáveis independentes (X) e a dependente (y)
X = df.drop('density', axis=1)
y = df['density']

# Normalizar os dados
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# 4. Dividir os dados
x_treino, x_teste, y_treino, y_teste = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# 5. Criar o modelo
modelo = LinearRegression()

# 6. K-Fold com semente
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# 7. Scorers
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)

# 8. Cross-validation
mse_scores = cross_val_score(modelo, x_treino, y_treino, cv=kfold, scoring=mse_scorer)
mae_scores = cross_val_score(modelo, x_treino, y_treino, cv=kfold, scoring=mae_scorer)
r2_scores = cross_val_score(modelo, x_treino, y_treino, cv=kfold, scoring=r2_scorer)

# 9. Resultados
print("MSE por fold:", -mse_scores)
print("MSE médio:", -np.mean(mse_scores))
print()

print("MAE por fold:", -mae_scores)
print("MAE médio:", -np.mean(mae_scores))
print()

print("R² por fold:", r2_scores)
print("R² médio:", np.mean(r2_scores))
print()

rmse_scores = np.sqrt(-mse_scores)
print("RMSE por fold:", rmse_scores)
print("RMSE médio:", np.mean(rmse_scores))


MSE por fold: [6.31448431e-07 6.58223741e-07 5.04672331e-07 5.61817996e-07
 5.93189583e-07 6.71364078e-07 4.92138195e-07 5.85975606e-07
 5.80806916e-07 6.60921316e-07]
MSE médio: 5.940558194020038e-07

MAE por fold: [0.00057283 0.00057652 0.00052119 0.0005689  0.00059457 0.0006086
 0.00054954 0.00055425 0.00054853 0.000621  ]
MAE médio: 0.0005715938331347067

R² por fold: [0.77866311 0.80827303 0.78688738 0.8266156  0.84781758 0.86484313
 0.77607892 0.77126227 0.85371966 0.83914236]
R² médio: 0.8153303037596116

RMSE por fold: [0.00079464 0.00081131 0.0007104  0.00074955 0.00077019 0.00081937
 0.00070153 0.00076549 0.00076211 0.00081297]
RMSE médio: 0.0007697545052044463


In [42]:
# Transformar x_treino em DataFrame para manter os nomes das colunas
x_treino_df = pd.DataFrame(x_treino, columns=X.columns)
x_teste_df = pd.DataFrame(x_teste, columns=X.columns)

# Treinar o modelo
modelo.fit(x_treino, y_treino)
y_pred = modelo.predict(x_teste)

print("\n# Avaliação Final no Conjunto de Teste")
# Exibir os coeficientes com nomes das colunas
coeficientes = pd.Series(modelo.coef_, index=X.columns)
print("Coeficientes do modelo:\n", coeficientes)
print()
print("Coeficiente linear (intercepto):", modelo.intercept_)
print("Erro quadrático médio (MSE):", mean_squared_error(y_teste, y_pred))
print("Coeficiente de determinação (R²):", r2_score(y_teste, y_pred))



# Avaliação Final no Conjunto de Teste
Coeficientes do modelo:
 fixed acidity           0.001615
volatile acidity        0.000113
citric acid             0.000010
residual sugar          0.000601
chlorides               0.000064
free sulfur dioxide    -0.000097
total sulfur dioxide    0.000092
pH                      0.000776
sulphates               0.000228
alcohol                -0.000967
quality                -0.000018
dtype: float64

Coeficiente linear (intercepto): 0.9967523973271756
Erro quadrático médio (MSE): 5.234703249521754e-07
Coeficiente de determinação (R²): 0.8711142590349822
