# Inicialização dataset - Task [001]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ftfy import fix_text
from sklearn.preprocessing import MinMaxScaler
from category_encoders import BinaryEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# open the csv file with pandas and read it into a dataframe
df = pd.read_csv('../datasets/WineDataset.csv')
df2 = pd.read_csv('../datasets/XWines_Full_100K_wines.csv')
df3 = pd.read_csv('../datasets/merged_wine_dataset.csv')

In [None]:
# print all the information about the dataframe
df.info()
df.describe()

In [None]:
# print all the information about the dataframe
df2.info()
df2.describe()

In [None]:
# print all the information about the dataframe
df3.info()
df3.describe()

In [7]:
# fix text using ftfy
# fix all the columns except when the column is a float
for col in df.columns:
    if df[col].dtype == 'object': 
        df[col] = df[col].apply(lambda x: fix_text(x) if isinstance(x, str) else x)

# fix text using ftfy
# fix all the columns except when the column is a float
for col in df2.columns:
    if df2[col].dtype == 'object': 
        df2[col] = df2[col].apply(lambda x: fix_text(x) if isinstance(x, str) else x)

In [None]:
# view all the unique values for each column except for the columns:'Title' 'Description', 'Country', 'Unit' 'Region' 'Appellation'
for col in df.columns:
    if col not in ['Title', 'Description', 'Country', 'Unit', 'Region', 'Appellation']:
        print(col, df[col].unique())

In [None]:
for col in df3.columns:
    if col not in ['WineName', 'WineryName', 'Grape', 'Secondary Grape Varieties', 'Country', 'Region', 'Appellation', 'Style', 'Characteristics', 'Description']:
        print(col, df3[col].unique())

# Limpeza de dados
- Remoção de valores nulos
- Normalização de valores
- Remoção de colunas desnecessárias

In [None]:
# Print wine where wineid 131027
print(df2.loc[df2['WineID'] == 131027])

#One-Hot Encoding for the column 'Type' 'Elaborate' 'Body', 'Acidity'

one_hot_encoded_df = pd.get_dummies(df2, columns=['Type','Body','Acidity','Elaborate'], prefix=['Type','Body','Acidity','Elaborate'])

# Binary Encoding for the column 'Country', 'RegionName', 'Grapes', 'Harmonize'

encoder = BinaryEncoder(cols=['Grapes', 'Harmonize'], return_df=True)
binary_encoded_df = encoder.fit_transform(df2)

encoded_df = pd.concat([one_hot_encoded_df, binary_encoded_df], axis=1)

# Normalize the 'ABV' column using MinMaxScaler
scaler = MinMaxScaler()

# Normalizing the 'ABV' column
encoded_df['ABV'] = scaler.fit_transform(encoded_df[['ABV']])


encoded_df = encoded_df.drop(columns=df2.columns)	
encoded_df = encoded_df.dropna()

# Display the updated DataFrame
print(encoded_df)


# Classification Dataset Preprocessing - Task [066]
### Normalizing Harmonize and grouping it into broader categories

In [11]:
# Replace dishes with their broader categories in the Harmonize column
df2['Harmonize'] = df2['Harmonize'].apply(eval)
dish_to_category = {
    'Beef': 'Meat', 'Lamb': 'Meat', 'Pork': 'Meat', 'Veal': 'Meat', 'Game Meat': 'Meat',
    'Duck': 'Meat', 'Ham': 'Meat', 'Cold Cuts': 'Meat', 'Cured Meat': 'Meat',
    'Poultry': 'Poultry', 'Chicken': 'Poultry',
    'Rich Fish': 'Fish & Seafood', 'Lean Fish': 'Fish & Seafood', 'Shellfish': 'Fish & Seafood',
    'Seafood': 'Fish & Seafood', 'Sushi': 'Fish & Seafood', 'Sashimi': 'Fish & Seafood',
    'Codfish': 'Fish & Seafood', 'Fish': 'Fish & Seafood', 'Grilled': 'Fish & Seafood',
    'Soft Cheese': 'Cheese', 'Hard Cheese': 'Cheese', 'Blue Cheese': 'Cheese',
    'Maturated Cheese': 'Cheese', 'Goat Cheese': 'Cheese', 'Mild Cheese': 'Cheese',
    'Medium-cured Cheese': 'Cheese', 'Cheese': 'Cheese',
    'Pasta': 'Pasta', 'Tagliatelle': 'Pasta', 'Lasagna': 'Pasta',
    'Paella': 'Fish & Seafood', 'Pizza' : 'Pasta',
    'Vegetarian': 'Vegetarian & Vegan', 'Mushrooms': 'Vegetarian & Vegan', 'Salad': 'Vegetarian & Vegan',
    'Fruit': 'Vegetarian & Vegan', 'Tomato Dishes': 'Vegetarian & Vegan', 'Beans': 'Vegetarian & Vegan',
    'Eggplant Parmigiana': 'Vegetarian & Vegan', 'Light Stews': 'Vegetarian & Vegan',
    'Appetizer': 'Appetizers & Snacks', 'Snack': 'Appetizers & Snacks',
    'Aperitif': 'Appetizers & Snacks', 'French Fries': 'Appetizers & Snacks', 'Baked Potato': 'Appetizers & Snacks',
    'Cream': 'Appetizers & Snacks',
    'Sweet Dessert': 'Desserts', 'Fruit Dessert': 'Desserts', 'Citric Dessert': 'Desserts',
    'Cake': 'Desserts', 'Chocolate': 'Desserts', 'Cookies': 'Desserts',
    'Chestnut': 'Desserts', 'Spiced Fruit Cake': 'Desserts', 'Dessert': 'Desserts',
    'Soufflé': 'Desserts', 'Dried Fruits': 'Desserts',
    'Spicy Food': 'Spicy Food', 'Curry Chicken': 'Spicy Food', 'Asian Food': 'Spicy Food', 'Yakissoba': 'Spicy Food',
    'Barbecue': 'Meat', 'Roast': 'Meat'
}

columns = df2.columns
new_harmonize = df2.copy()
new_harmonize['Harmonize'] = df2['Harmonize'].apply(lambda x: list(set(dish_to_category.get(dish, dish) for dish in x)))

# Expand the Harmonize column into multiple rows, one for each dish
new_harmonize = new_harmonize.explode('Harmonize')

# Reset the index for consistency and remove any rows with 'Risotto' because it has a number of insignificant occurrences
new_harmonize.reset_index(drop=True, inplace=True)
new_harmonize = new_harmonize[new_harmonize['Harmonize'].apply(lambda x: 'Risotto' not in x)]
df2 = new_harmonize.copy()

# Get the unique dishes
unique_categories = df2['Harmonize'].unique()

# One-hot encoding
for dish in unique_categories:
    new_harmonize[f'Harmonize_{dish}'] = new_harmonize['Harmonize'].apply(lambda x: x == dish)
# Turn new_harmonize into a harmonize dataframe
new_harmonize = new_harmonize.drop(columns=columns)


### Normalizing the rest of the columns

In [None]:
#One-Hot Encoding for the column 'Type' 'Elaborate' 'Body', 'Acidity'
one_hot_encoded_df = pd.get_dummies(df2, columns=['Type', 'Elaborate', 'Body', 'Acidity'])

# Binary Encoding for the column 'Grapes'
encoder = BinaryEncoder(cols=['Grapes'], return_df=True)
binary_encoded_df = encoder.fit_transform(df2)

encoded_df = one_hot_encoded_df.copy()

# Maximum ABV value
print(encoded_df['ABV'].min())
print(encoded_df['ABV'].max())

# get the row where the abv is min
print(encoded_df[encoded_df['ABV'] == encoded_df['ABV'].min()])


# Normalizing the 'ABV' column
scaler = MinMaxScaler()
encoded_df['ABV'] = scaler.fit_transform(encoded_df[['ABV']])


# Dropping unnecessary columns
encoded_df = encoded_df.drop(columns=['Country', 'RegionName', 'Code', 'WineName', 'WineID', 'Vintages', 'Website', 'WineryID', 'WineryName', 'RegionID','Grapes','Harmonize'])
encoded_df = encoded_df.dropna()
# add new_harmonize to the encoded_df
encoded_df = pd.concat([encoded_df, new_harmonize], axis=1)

### Boxplot

In [None]:
# Calculate the IQR
numeric_df = encoded_df['ABV']
q1 = numeric_df.quantile(0.25)
q3 = numeric_df.quantile(0.75)
iqr = q3 - q1

# Boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(x=numeric_df, showmeans=True, orient="h")

Analyzing the resulting boxplot, we observe that some values fall outside the ranges but are still relevant, as 0% alcohol wines, including dealcoholized varieties, remain within the wine category. These options retain the essence of traditional wines without alcohol. Additionally, beverages like firewater, with higher alcohol content, can also bring value to the project.

### Correlation Matrix

In [None]:
# Compute the correlation matrix without the Elaborate columns to reduce the size of the heatmap
elaborate_columns = encoded_df.columns[encoded_df.columns.str.startswith('Elaborate_')]
matrix_df = encoded_df.drop(columns=elaborate_columns)
correlation_matrix = matrix_df.corr()


plt.figure(figsize=(20, 15))
sns.heatmap(
    correlation_matrix, 
    annot=True, 
    fmt=".2f", 
    cmap="coolwarm", 
    cbar=True, 
    annot_kws={"size": 10}
)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
plt.show()

Analyzing the matrix, we observe that it is relatively ‘cold,’ meaning it exhibits a low correlation index overall. Despite the generally low correlations between features, certain specific relationships do stand out. For instance, some dishes in the ‘Harmonize’ category show a notable correlation with particular wines, while certain body characteristics correlate with the alcohol by volume (ABV) and wine types.

### PCA

In [None]:
from sklearn.decomposition import PCA

harmonize_columns = [col for col in encoded_df.columns if col.startswith('Harmonize_')]

# Loop through each 'harmonize_' column and perform PCA
for label_col in harmonize_columns:
    print(f"Generating PCA plot for label column: {label_col}")

    features = encoded_df.drop(columns=[label_col])
    labels = encoded_df[label_col].values

    pca = PCA(n_components=2)
    df_pca = pca.fit_transform(features)

    plt.figure(figsize=(8, 6))
    unique_labels = np.unique(labels)
    for label in unique_labels:
        plt.scatter(
            df_pca[labels == label, 0], 
            df_pca[labels == label, 1], 
            label=f'Classe {label}'
        )

    plt.title(f'PCA - Column: {label_col}')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.legend()
    plt.grid()
    plt.show()

When applying PCA to identify patterns and relationships among the different categories in the ‘Harmonize’ column, we encountered inconclusive results. The resulting plots showed the various categories exhibiting similar patterns, with data points widely scattered across the graphical space rather than concentrated in specific zones.

# Wine Type Prediction using Random Forest Classifier: Model Performance & Results - Task [089]

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

# Step 1: Define feature and target columns
feature_columns = [col for col in encoded_df.columns if col.startswith('Harmonize_')]
target_columns = [col for col in encoded_df.columns if col.startswith('Type_')]

# Step 2: Split data into features (X) and target (y)
X = encoded_df[feature_columns]
y = encoded_df[target_columns]

# Step 3: Split data into training, validation, and test sets (80% train, 20% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Step 4: Encode target labels (multiclass classification)
y_train_labels = y_train.idxmax(axis=1).str.replace("Type_", "", regex=True)
y_val_labels = y_val.idxmax(axis=1).str.replace("Type_", "", regex=True)
y_test_labels = y_test.idxmax(axis=1).str.replace("Type_", "", regex=True)

label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(y_train_labels)
y_val_labels = label_encoder.transform(y_val_labels)
y_test_labels = label_encoder.transform(y_test_labels)

# Step 5: Define Random Forest model and hyperparameter grid
param_grid = {
    'classifier__n_estimators': [5, 50, 100, 200],
    'classifier__max_depth': [3, 5, 7, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__max_features': ['sqrt', 'log2', None]
}

# Step 6: Create a Random Forest pipeline (model with preprocessing)
pipeline = Pipeline(steps=[
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))
])

# Step 7: Hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)

# Step 8: Train the model using GridSearchCV
grid_search.fit(X_train, y_train_labels)

# Step 9: Retrieve the best model and parameters
print(f"Best parameters: {grid_search.best_params_}")

# Step 10: Make predictions on the test set
y_pred = grid_search.predict(X_test)

# Step 11: Evaluate the model's performance
accuracy = accuracy_score(y_test_labels, y_pred)
classification_rep = classification_report(y_test_labels, y_pred, target_names=label_encoder.classes_)
cm = confusion_matrix(y_test_labels, y_pred)

# Step 12: Display results
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Classification Report:\n{classification_rep}")

# Step 13: Plot Confusion Matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

Fitting 5 folds for each of 192 candidates, totalling 960 fits


KeyboardInterrupt: 

# Model Evaluation and Conclusions

### 1. **Target: Type_Dessert**
- **Accuracy**: 82.41%
- **Key Findings**:
  - The model performs excellently for the majority class ("False") with very high precision (1.00), but struggles with the minority class ("True"), showing very low precision (0.10) and a low F1-score (0.19).
  - Class imbalance is still evident, with the majority class dominating predictions.
  - **Recommendation**: Focus on improving the model's ability to predict the minority class (True) without compromising performance on the majority class.

---

### 2. **Target: Type_Dessert/Port**
- **Accuracy**: 97.65%
- **Key Findings**:
  - The model achieves high accuracy and precision for the majority class ("False") but has low precision (0.29) for the minority class ("True").
  - Recall for the "True" class is very high (0.96), indicating that the model identifies the minority class well, but the low precision indicates a high number of false positives.
  - **Recommendation**: Focus on improving precision for the "True" class while maintaining high recall.

---

### 3. **Target: Type_Red**
- **Accuracy**: 96.27%
- **Key Findings**:
  - The model performs well for both the "False" and "True" classes with balanced precision and recall.
  - High F1-scores for both classes indicate that the model has learned to handle the class imbalance effectively.
  - **Recommendation**: No immediate improvements are needed, but there could still be potential for further fine-tuning to optimize precision on the "False" class.

---

### 4. **Target: Type_Rosé**
- **Accuracy**: 91.77%
- **Key Findings**:
  - High precision for "False" but very low precision for "True".
  - While recall for "True" is high (0.95), the low precision (0.34) and low F1-score (0.51) indicate room for improvement in identifying the "True" class without many false positives.
  - **Recommendation**: Improve the balance between recall and precision for the "True" class.

---

### 5. **Target: Type_Sparkling**
- **Accuracy**: 90.63%
- **Key Findings**:
  - The model is better at predicting the "False" class, with high precision (0.99) and decent recall for the "True" class.
  - However, the F1-score for the "True" class is still low (0.59) due to the class imbalance.
  - **Recommendation**: Focus on improving the performance of the model for the "True" class while maintaining good recall for the "False" class.

---

### 6. **Target: Type_White**
- **Accuracy**: 94.29%
- **Key Findings**:
  - High precision for both classes with a notable improvement in predicting the minority class ("True").
  - Recall for "True" is very high (0.98) and the F1-score (0.91) indicates balanced performance.
  - **Recommendation**: Continue fine-tuning the model to maintain high precision and recall for both classes.

---

### **General Conclusion**:
- **Class Imbalance**: All models still exhibit some degree of bias towards the majority class ("False") due to class imbalance.
- **Next Steps**:
  - **Hyperparameter Tuning**: Continue fine-tuning the hyperparameters, particularly `max_features`, `min_samples_split`, and `n_estimators`, to further improve performance.
  - **Class Imbalance Solutions**: Since SMOTE was excluded from the preprocessing, techniques like adjusting class weights or exploring undersampling methods could help address the imbalance.
  - **Threshold Optimization**: Explore optimizing the decision threshold for classification to balance precision and recall, especially for the minority class.

# Support Vector Regression (SVR) for Wine Price Prediction: Model Performance & Results - Task [091]

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('../datasets/WineDataset.csv')

# Clean the 'Price' column (remove currency symbol, commas, and text like 'per bottle')
df['Price'] = df['Price'].replace({'Â£': '', ',': '', ' per bottle': ''}, regex=True)

# Convert the 'Price' column to numeric, coercing errors to NaN (for any unexpected text)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Check for and handle missing values in the 'Price' column
missing_values = df['Price'].isnull().sum()
print(f"Missing values in 'Price' column: {missing_values}")

# Impute missing values for 'Price' with the median value
df['Price'] = df['Price'].fillna(df['Price'].median())

# Feature columns (Grape, Type)
categorical_features = ['Grape', 'Type']
y = df['Price']

# Step 1: One-Hot Encode the categorical features ('Grape', 'Type')
X_categorical = df[categorical_features]
X_encoded = pd.get_dummies(X_categorical, drop_first=True)

# Step 2: Standardize the data (important for SVR)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Step 3: Split data into train and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 4: Hyperparameter Tuning with Loop for different values of 'C', 'epsilon', and 'kernel'
results = []

C_values = [1, 10, 100, 1000]
epsilon_values = [0.01, 0.1, 0.2]
kernels = ['linear', 'rbf', 'poly']

for C in C_values:
    for epsilon in epsilon_values:
        for kernel in kernels:
            # Initialize the SVR model with current hyperparameters
            svr_model = SVR(C=C, epsilon=epsilon, kernel=kernel)
            
            # Train the model
            svr_model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = svr_model.predict(X_test)
            
            # Evaluate the model's performance
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            
            # Store the results
            results.append({
                'C': C,
                'epsilon': epsilon,
                'kernel': kernel,
                'MAE': mae,
                'R2': r2
            })

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results)
print(results_df)

# Step 5: Find the best parameters from the results
best_params = results_df.loc[results_df['R2'].idxmax()]
print(f"Best parameters: C={best_params['C']}, epsilon={best_params['epsilon']}, kernel={best_params['kernel']}")
print(f"Best R² Score: {best_params['R2']}")

# Step 6: Visualizing the results - Actual vs Predicted Price for the best model
best_model = SVR(C=best_params['C'], epsilon=best_params['epsilon'], kernel=best_params['kernel'])
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_best, color='blue', label='Predictions')
sns.scatterplot(x=y_test, y=y_test, color='red', label='Actual')
plt.title('SVR - Actual vs Predicted Price')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.show()

# Step 7: Plot the results from grid search (C vs epsilon vs R2 Score)
pivot_table = results_df.pivot_table(index='C', columns='epsilon', values='R2', aggfunc='max')

plt.figure(figsize=(8, 6))
sns.heatmap(pivot_table, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('R² Score Heatmap for Different C and Epsilon Values')
plt.xlabel('Epsilon')
plt.ylabel('C')
plt.show()

## Conclusion

Based on the results of the **Support Vector Regression (SVR)** model with various hyperparameters, the following conclusions can be made:

### Best Model Parameters:
- The best performing model used the following hyperparameters:
  - **C**: 1000
  - **Epsilon**: 0.01
  - **Kernel**: Radial Basis Function (RBF)

### Model Performance:
- The **R² score** for the best model is **0.0582**. This indicates that the model explains only **5.8%** of the variance in the target variable (Price), suggesting that the model has a relatively poor fit to the data.
- The **Mean Absolute Error (MAE)** is approximately **13.34**, meaning the model’s predictions deviate by about **13.34 units** on average, which could be significant depending on the scale of wine prices.

### Model Evaluation:
- Despite tuning the hyperparameters, the performance of the model remains modest, as indicated by the low R² score and moderate MAE.
- The R² score close to zero implies that there is little to no correlation between the features (Grape and Type) and the price of the wine. This suggests that other factors, not included in the model, could have a significant impact on wine pricing.

### Possible Improvements:
- Including additional features (e.g., alcohol content, region, year of production) could enhance the model's predictive power.
- More advanced techniques such as **feature engineering** might reveal hidden relationships in the data.
- Exploring other machine learning models (e.g., Random Forest, Gradient Boosting) or ensemble methods could lead to better results.

### Summary:
In conclusion, the **SVR model** with the best hyperparameters offers some predictive capability but does not adequately explain the variance in wine prices. To improve its accuracy, further refinement of the model, the inclusion of additional features, or the use of different algorithms is recommended.

---

### Explanation for the Context:

**SVR (Support Vector Regression)** is a type of machine learning model used to predict continuous values. 

- **Target**: In this case, the target variable is the **price of wine**.
- **Variables**: The model uses the **grape variety** and **type of wine** as input features to predict the price of the wine.

In summary, this model attempts to predict the price of wine based on its **grape variety** and **type**, but the results indicate that these features alone are not sufficient to explain the variation in wine prices.