In [3]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from datasets import load_dataset

In [6]:
# Fetch the wine-recognition dataset via `datasets.load_dataset`.
ds = load_dataset("katossky/wine-recognition")

# Take the "train" split and materialize it as a pandas DataFrame.
data = ds["train"]
df = pd.DataFrame(data)

# Preview the first rows to check the columns that were loaded.
print(df.head())

   label  alcohol  malic acid   ash  alcalinity of ash  magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   total phenols  flavanoids  nonflavanoid phenols  proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   color intensity   hue  OD280/OD315 of diluted wines  proline  
0             5.64  1.04                          3.92     1065  
1             4.38  1.05  

In [8]:
# Target vector: the class stored in the "label" column.
y = df["label"]
# Feature matrix: every column except "label".
X = df.drop(columns=["label"])

In [23]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [26]:
# Random forest with 100 trees; the fixed random_state makes training repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [27]:
# Predict the held-out split with the fitted forest.
y_pred = rf_model.predict(X_test)

# Per-class precision/recall/F1 table and overall accuracy.
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
# The header and the report are emitted on separate lines.
print("Classification Report:", classification_rep, sep="\n")

Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [29]:
import joblib

# Persist the fitted classifier to disk.
# NOTE(review): only the forest is written here; the fitted `scaler` used to
# prepare its inputs is not saved alongside it — confirm whether consumers of
# this file need it.
joblib.dump(value=rf_model, filename="random_forest_model.pkl")
print("Modelo guardado como random_forest_model.pkl")

Modelo guardado como random_forest_model.pkl


In [30]:
# Restore the classifier from the file written by the save cell.
loaded_model = joblib.load("random_forest_model.pkl")
print("Modelo cargado exitosamente")

# Check the restored model by predicting on the held-out split (X_test).
new_predictions = loaded_model.predict(X_test)
print("Predicciones en el conjunto de prueba:", new_predictions, sep="\n")

Modelo cargado exitosamente
Predicciones en el conjunto de prueba:
[1 3 1 2 2 1 1 2 2 3 2 3 1 3 1 2 2 1 2 1 2 2 1 1 2 2 1 3 2 3 1 3 2 3 3 3]


In [33]:
# Example: classify one new, unscaled sample.
# The row is wrapped in a DataFrame carrying the scaler's own feature names:
# the scaler was fitted on a DataFrame, so passing a bare list makes
# scikit-learn warn about missing feature names and relies on the values
# being in exactly the right positional order.
new_data = pd.DataFrame(
    [[11.8, 3.0, 2.30, 21.0, 100.0, 2.7, 2.8, 0.32, 1.75, 4.9, 1.03, 2.65, 750]],
    columns=scaler.feature_names_in_,
)
# Apply the same scaling the model was trained with before predicting.
new_data_scaled = scaler.transform(new_data)
new_prediction = loaded_model.predict(new_data_scaled)
print("Predicción para los nuevos datos:", new_prediction)

Predicción para los nuevos datos: [1]




In [37]:
# Classify another unscaled sample.
# The row is given the scaler's own feature names (the scaler was fitted on a
# DataFrame), which avoids scikit-learn's missing-feature-names warning and
# makes the column order explicit.
new_data_example1 = pd.DataFrame(
    [[12.72, 1.81, 2.2, 18.8, 86.0, 2.2, 2.53, 0.26, 1.77, 3.9, 1.16, 3.14, 714.0]],
    columns=scaler.feature_names_in_,
)
# Apply the same scaling the model was trained with before predicting.
new_data_scaled1 = scaler.transform(new_data_example1)
new_prediction1 = loaded_model.predict(new_data_scaled1)
print("Predicción para los nuevos datos:", new_prediction1)

Predicción para los nuevos datos: [2]




In [39]:
# Classify a third unscaled sample (reuses the variable names of the previous example cell).
# The row is given the scaler's own feature names (the scaler was fitted on a
# DataFrame), which avoids scikit-learn's missing-feature-names warning and
# makes the column order explicit.
new_data_example1 = pd.DataFrame(
    [[12.45, 3.03, 2.64, 27.0, 97.0, 1.9, 0.58, 0.63, 1.14, 7.5, 0.67, 1.73, 880.0]],
    columns=scaler.feature_names_in_,
)
# Apply the same scaling the model was trained with before predicting.
new_data_scaled1 = scaler.transform(new_data_example1)
new_prediction1 = loaded_model.predict(new_data_scaled1)
print("Predicción para los nuevos datos:", new_prediction1)

Predicción para los nuevos datos: [3]




In [42]:
# Offer the saved model file as a browser download when running on Google Colab.
# `google.colab` is only importable inside Colab, so the import is guarded to
# keep "Restart & Run All" working in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab no está disponible; el modelo quedó guardado en random_forest_model.pkl")
else:
    files.download("random_forest_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>