In [3]:
! pip install xgboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import sqlite3
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb


Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/75/dd/9afe0d9d0f61a5384c3932626a022e38c396a5d88e6f5345ad2f7b576747/xgboost-1.7.6-py3-none-win_amd64.whl.metadata
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl.metadata (1.9 kB)
Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
   ---------------------------------------- 70.9/70.9 MB 9.0 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6


In [6]:
# Load the full rose_wine table from the local SQLite database.
query = 'SELECT * FROM rose_wine'
conn = sqlite3.connect('wine_database.db')
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Always release the database handle, even when the query raises
    # (e.g. missing table or unreadable file).
    conn.close()

# Re-encode these text columns as ISO-8859-1 and decode the bytes as UTF-8.
# This round trip repairs text that was stored as UTF-8 but read back as
# ISO-8859-1 (presumably the case for this table — verify against the loader).
# NOTE: .str.encode is strict and raises on characters outside ISO-8859-1.
columns_to_decode = ['Name', 'Region', 'Winery']
for col in columns_to_decode:
    df[col] = df[col].str.encode('iso-8859-1').str.decode('utf-8')

# 'Name' is not used as a feature below.
df = df.drop(columns=['Name'])

# Separate categorical and numerical features
categorical_cols = ['Country', 'Region', 'Winery']
numerical_cols = ['Rating', 'Year']

# Feature matrix and regression target (price).
X = df[categorical_cols + numerical_cols]
y = df['Price']
X


Unnamed: 0,Country,Region,Winery,Rating,Year
0,France,Méditerranée,Château Camparnaud,4.0,2019
1,Portugal,Douro,Niepoort,3.9,2019
2,Italy,Abruzzo,Farnese,3.6,2019
3,Israel,Galilee,Hermon,3.4,2017
4,France,Languedoc,Château La Sauvageonne,4.6,2018
...,...,...,...,...,...
389,France,Coteaux d'Aix-en-Provence,AIX,4.0,2019
390,Italy,Lombardia,Cà dei Frati,4.1,2019
391,France,Languedoc,Gérard Bertrand,3.9,2019
392,France,Côtes de Provence,Minuty,4.0,2019


In [7]:
# Split the data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create transformers for preprocessing.
# handle_unknown='ignore' encodes categories that were not seen during fit as
# all-zero indicator columns, so the encoder never needs to see the test set.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = StandardScaler()

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, categorical_cols),
        ('num', num_transformer, numerical_cols)
    ]
)

# Fit the preprocessing on the TRAINING rows only, then apply the fitted
# transform to the test rows. Fitting on train+test together would leak the
# test set's scaling statistics and category vocabulary into the model inputs
# and make the evaluation below optimistic.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Create XGBoost model (library default hyperparameters)
xgb_model = xgb.XGBRegressor()

# Fit the model to the preprocessed training data
xgb_model.fit(X_train_preprocessed, y_train)

# Make predictions on the preprocessed test data
predictions = xgb_model.predict(X_test_preprocessed)

# Calculate R-squared and Mean Squared Error on the held-out test set
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(f"R-squared: {r2:.4f}")
print(f"Mean Squared Error: {mse:.4f}")

R-squared: -2.5895
Mean Squared Error: 493.3960
