<a href="https://colab.research.google.com/github/Thiagothims/ml_wine/blob/main/wine_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Importação das bibliotecas
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [31]:
# Load the wine reviews CSV that the rest of the notebook works on
csv_path = '/content/sample_data/winemag-data_first150k.csv'
data = pd.read_csv(csv_path)

# Preview the first three rows of the loaded frame
first_rows = data.head(3)
print(first_rows)

   Unnamed: 0 country                                        description  \
0           0      US  This tremendous 100% varietal wine hails from ...   
1           1   Spain  Ripe aromas of fig, blackberry and cassis are ...   
2           2      US  Mac Watson honors the memory of a wine once ma...   

                            designation  points  price        province  \
0                     Martha's Vineyard      96  235.0      California   
1  Carodorum Selección Especial Reserva      96  110.0  Northern Spain   
2         Special Selected Late Harvest      96   90.0      California   

         region_1 region_2             variety                   winery  
0     Napa Valley     Napa  Cabernet Sauvignon                    Heitz  
1            Toro      NaN       Tinta de Toro  Bodega Carmen Rodríguez  
2  Knights Valley   Sonoma     Sauvignon Blanc                 Macauley  


In [32]:
# Show the dimensions of the frame as (rows, columns)
row_count, column_count = data.shape
print((row_count, column_count))

(150930, 11)


In [33]:
# Show basic information about the data (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB
None


In [34]:
# Drop the columns that are not used further in this notebook.
# errors='ignore' keeps the cell re-runnable: because `data` is reassigned,
# a second execution would otherwise raise KeyError for the columns that
# were already removed.
data = data.drop(columns=['Unnamed: 0', 'description', 'designation'], errors='ignore')

# info() prints its own report and returns None, so it is not wrapped in print()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   country   150925 non-null  object 
 1   points    150930 non-null  int64  
 2   price     137235 non-null  float64
 3   province  150925 non-null  object 
 4   region_1  125870 non-null  object 
 5   region_2  60953 non-null   object 
 6   variety   150930 non-null  object 
 7   winery    150930 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 9.2+ MB
None


In [35]:
# Descriptive statistics of the data (count, mean, std, quartiles, min/max)
summary_stats = data.describe()
print(summary_stats)

              points          price
count  150930.000000  137235.000000
mean       87.888418      33.131482
std         3.222392      36.322536
min        80.000000       4.000000
25%        86.000000      16.000000
50%        88.000000      24.000000
75%        90.000000      40.000000
max       100.000000    2300.000000


In [36]:
# Count the missing values in each column
null_counts = data.isna().sum()
print(null_counts)

country         5
points          0
price       13695
province        5
region_1    25060
region_2    89977
variety         0
winery          0
dtype: int64


In [43]:
# Remove every row that contains at least one null value.
# NOTE(review): region_2 is null in 89977 rows according to the null-count
# cell, so it accounts for nearly all of the rows excluded here — confirm
# that losing this much data is intended before clustering.
data_cleaned = data.dropna()

total = len(data)
remaining = len(data_cleaned)
excluded = total - remaining

print('Quantidade de dados excluidos: ' + str(excluded))
print('Quantidade de dados restantes: ' + str(remaining))
# The recorded output of this cell contains a "Total" line that the saved code
# did not print; emit it so a fresh run reproduces the recorded output.
print('Total: ' + str(total))

Quantidade de dados excluidos: 90226
Quantidade de dados restantes: 60704
Total: 150930


In [46]:
# Preview the first five rows that remain after the null-row removal
print(data_cleaned.head(5))

  country  points  price    province            region_1           region_2  \
0      US      96  235.0  California         Napa Valley               Napa   
2      US      96   90.0  California      Knights Valley             Sonoma   
3      US      96   65.0      Oregon   Willamette Valley  Willamette Valley   
8      US      95   65.0      Oregon  Chehalem Mountains  Willamette Valley   
9      US      95   60.0  California        Sonoma Coast             Sonoma   

              variety     winery  
0  Cabernet Sauvignon      Heitz  
2     Sauvignon Blanc   Macauley  
3          Pinot Noir      Ponzi  
8          Pinot Noir  Bergström  
9          Pinot Noir  Blue Farm  


In [47]:
# Dimensions of the cleaned frame as (rows, columns)
cleaned_rows, cleaned_cols = data_cleaned.shape
print((cleaned_rows, cleaned_cols))

(60704, 8)


In [52]:
# Columns that will be standardized
numeric_columns = ['points', 'price']
scaler = StandardScaler()

# Show the values before scaling, together with the row count
print("Dados não escalonados:")
print(data_cleaned[numeric_columns].head())
print(len(data_cleaned))

# Fit the scaler on the selected columns and write the result into a copy,
# so data_cleaned itself keeps the original values
scaled_values = scaler.fit_transform(data_cleaned[numeric_columns])
data_scaled = data_cleaned.copy()
data_scaled[numeric_columns] = scaled_values

# Show the same columns after scaling; the row count is unchanged
print("\nDados escalonados:")
print(data_scaled[numeric_columns].head())
print(len(data_scaled))


Dados não escalonados:
   points  price
0      96  235.0
2      96   90.0
3      96   65.0
8      95   65.0
9      95   60.0
60704

Dados escalonados:
     points     price
0  2.392941  8.067597
2  2.392941  2.255638
3  2.392941  1.253576
8  2.098916  1.253576
9  2.098916  1.053164
60704


In [53]:
# Preview the first five rows after scaling the numeric columns
print(data_scaled.head(5))

  country    points     price    province            region_1  \
0      US  2.392941  8.067597  California         Napa Valley   
2      US  2.392941  2.255638  California      Knights Valley   
3      US  2.392941  1.253576      Oregon   Willamette Valley   
8      US  2.098916  1.253576      Oregon  Chehalem Mountains   
9      US  2.098916  1.053164  California        Sonoma Coast   

            region_2             variety     winery  
0               Napa  Cabernet Sauvignon      Heitz  
2             Sonoma     Sauvignon Blanc   Macauley  
3  Willamette Valley          Pinot Noir      Ponzi  
8  Willamette Valley          Pinot Noir  Bergström  
9             Sonoma          Pinot Noir  Blue Farm  


In [54]:
# One-hot encode the categorical columns
# NOTE(review): columns such as winery and region_1 presumably have many
# distinct values, so the encoded frame may be very wide — confirm that the
# memory footprint is acceptable.
categorical_columns = ['country', 'province', 'region_1', 'region_2', 'variety', 'winery']
data_encoded = pd.get_dummies(data=data_scaled, columns=categorical_columns)

# Header, first rows and (rows, columns) of the encoded frame, one per line
print("\nDados codificados:", data_encoded.head(), data_encoded.shape, sep="\n")


Dados codificados:
     points     price  country_US  province_California  province_New York  \
0  2.392941  8.067597        True                 True              False   
2  2.392941  2.255638        True                 True              False   
3  2.392941  1.253576        True                False              False   
8  2.098916  1.253576        True                False              False   
9  2.098916  1.053164        True                 True              False   

   province_Oregon  province_Washington  region_1_Adelaida District  \
0            False                False                       False   
2            False                False                       False   
3             True                False                       False   
8             True                False                       False   
9            False                False                       False   

   region_1_Alexander Valley  region_1_Alta Mesa  ...  winery_l'homme qui ris  \
0        