In [12]:
import pandas as pd

file1_path = 'datasets/cleaned_100k.csv'
data1 = pd.read_csv(file1_path, dtype={'WineID': str}, low_memory=False)

file2_path = '/datasets/cleaned_ratings_file.csv'
data2 = pd.read_csv(file2_path, dtype={'WineID': str}, low_memory=False)

print("Columns in data1:", data1.columns)
print("Columns in data2:", data2.columns)

mean_ratings = data2.groupby('WineID')['Rating'].mean().reset_index()

result = pd.merge(data1, mean_ratings, on='WineID', how='left')

result.rename(columns={'Rating': 'Ratings'}, inplace=True)

print(result.head())

result.to_csv('updated_wines.csv', index=False)



Columns in data1: Index(['WineID', 'WineName', 'Type', 'Elaborate', 'Grapes', 'Harmonize', 'ABV',
       'Body', 'Acidity', 'Code', 'Country', 'RegionID', 'RegionName',
       'WineryID', 'WineryName', 'Website', 'Vintages'],
      dtype='object')
Columns in data2: Index(['RatingID', 'UserID', 'WineID', 'Vintage', 'Rating', 'Date'], dtype='object')
   WineID                         WineName       Type  \
0  100001               Espumante Moscatel  Sparkling   
1  100002                       Ancellotta        Red   
2  100003               Cabernet Sauvignon        Red   
3  100004                   Virtus Moscato      White   
4  100005  Maison de Ville Cabernet-Merlot        Red   

                       Elaborate                            Grapes  \
0                  Varietal/100%                ['Muscat/Moscato']   
1                  Varietal/100%                    ['Ancellotta']   
2                  Varietal/100%            ['Cabernet Sauvignon']   
3                  Varieta

# Data Preprocessing

## Purpose

1. Splitting text components for better granularity.
2. Normalizing numerical values to ensure consistency.
3. Removing unnecessary symbols or irrelevant text.

The preprocessing will address these specific attributes in the dataset:
- **Style**
- **Characteristics**
- **Price**
- **Capacity**
- **ABV (Alcohol by Volume)**
- **Vintage**

In [37]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import datetime
import re

# Load the dataset
file_path = 'datasets/WineDataset.csv'
df = pd.read_csv(file_path)

def convert_to_liters(capacity):
    capacity = str(capacity).strip().upper()
    if 'CL' in capacity:  # Centiliters to Liters
        return float(re.sub(r'[^\d.]', '', capacity)) / 100
    elif 'ML' in capacity:  # Milliliters to Liters
        return float(re.sub(r'[^\d.]', '', capacity)) / 1000
    elif 'LITRE' in capacity or 'L' in capacity:  # Liters already
        return float(re.sub(r'[^\d.]', '', capacity))
    elif 'LTR' in capacity or 'L' in capacity:  # Liters already
        return float(re.sub(r'[^\d.]', '', capacity))
    elif 'L' in capacity or 'L' in capacity:  # Liters already
        return float(re.sub(r'[^\d.]', '', capacity))
    else:
        return ''  # Handle any unknown format

def preprocess_data(df):

    numeric_cols = ['Price', 'ABV', 'Capacity']

    df['Capacity'] = df['Capacity'].apply(convert_to_liters)

    if not df.empty:
        for col in numeric_cols:
            if col in df.columns:
                # Remove non-numeric characters and convert to float
                df[col] = df[col].apply(lambda x: re.sub(r'[^\d.]', '', str(x)).strip() if str(x).strip() else np.nan)
                df[col] = pd.to_numeric(df[col], errors='coerce')
                
                if df[col].notnull().any():  # Check if there's valid data for scaling
                    scaler = MinMaxScaler()
                    df[col] = scaler.fit_transform(df[[col]])
                
                df[col] = df[col].round(3)

        # Clean and split the 'Style' column
        if 'Style' in df.columns:
            df['Style'] = (
                df['Style']
                .str.replace(r'[^\w\s&]', '', regex=True)
                .str.split('&')
                .apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)  # Clean whitespace
            )

        # Clean and split the 'Characteristics' column
        if 'Characteristics' in df.columns:
            df['Characteristics'] = (
                df['Characteristics']
                .str.replace(r'[^\w\s,]', '', regex=True)
                .str.split(',') 
                .apply(lambda x: [item.strip() for item in x] if isinstance(x, list) else x)  # Clean whitespace
            )
            
        # Clean and normalize the 'Vintage' column
        if 'Vintage' in df.columns:
            current_year = datetime.datetime.now().year

            df['Vintage'] = df['Vintage'].apply(
                lambda x: current_year if str(x).strip().upper() == 'NV' else (int(re.search(r'\d{4}', str(x)).group(0)) if re.search(r'\d{4}', str(x)) else np.nan)
            )

            valid_years = df['Vintage'][df['Vintage'] > 1900]
            if not valid_years.empty:

                min_year = valid_years.min()  
                max_year = current_year

                # Calculates the vintage value based on the max vintage and the current year
                df['Vintage'] = df['Vintage'].apply(
                    lambda x: max(0, (x - max_year) / (min_year - max_year)) if pd.notna(x) else np.nan
                )

                # Round the 'Vintage' values to 2 decimal places
                df['Vintage'] = df['Vintage'].round(2)

    return df

# Preprocess the dataset
df_cleaned = preprocess_data(df)

# Save or display the cleaned dataset
df_cleaned.to_csv('datasets/cleaned_wines.csv', index=False)
df_cleaned.head()


Unnamed: 0,Title,Description,Price,Capacity,Grape,Secondary Grape Varieties,Closure,Country,Unit,Characteristics,Per bottle / case / each,Type,ABV,Region,Style,Vintage,Appellation
0,"The Guv'nor, Spain",We asked some of our most prized winemakers wo...,0.012,0.081,Tempranillo,,Natural Cork,Spain,10.5,"[Vanilla, Blackberry, Blackcurrant]",per bottle,Red,0.342,,"[Rich, Juicy]",0.0,
1,Bread & Butter 'Winemaker's Selection' Chardon...,This really does what it says on the tin. Itâ€...,0.026,0.081,Chardonnay,,Natural Cork,USA,10.1,"[Vanilla, Almond, Coconut, Green Apple, Peach,...",per bottle,White,0.329,California,"[Rich, Toasty]",0.12,Napa Valley
2,"Oyster Bay Sauvignon Blanc 2022, Marlborough",Oyster Bay has been an award-winning gold-stan...,0.018,0.081,Sauvignon Blanc,,Screwcap,New Zealand,9.8,"[Tropical Fruit, Gooseberry, Grapefruit, Grass...",per bottle,White,0.316,Marlborough,"[Crisp, Zesty]",0.08,
3,Louis Latour MÃ¢con-Lugny 2021/22,Weâ€™ve sold this wine for thirty years â€“ an...,0.031,0.081,Chardonnay,,Natural Cork,France,10.1,"[Peach, Apricot, Floral, Lemon]",per bottle,White,0.329,Burgundy,"[Ripe, Rounded]",0.08,Macon
4,Bread & Butter 'Winemaker's Selection' Pinot N...,Bread & Butter is that thing that you can coun...,0.026,0.081,Pinot Noir,,Natural Cork,USA,10.1,"[Smoke, Black Cherry, Cedar, Raspberry, Red Fr...",per bottle,Red,0.329,California,"[Smooth, Mellow]",0.12,Napa Valley


In [46]:
import pandas as pd

# Load your dataset
file_path = 'datasets/cleaned_wines.csv'
df = pd.read_csv(file_path)

# Get unique values in the 'Capacity' column
unique_values = df['Capacity'].dropna().unique()

# Print the unique values
print(unique_values)


[0.081 0.07  0.243 0.    0.405 0.027 0.568 1.   ]


In [31]:
import pandas as pd

file2_path = '/datasets/XWines_Full_100k_wines.csv'

data2 = pd.read_csv(file2_path)

data2['WineID'] = pd.to_numeric(data2['WineID'], errors='coerce')

cleaned_data2 = data2.dropna(subset=['WineID'])

cleaned_file_path = 'cleaned_100k.csv'
cleaned_data2.to_csv(cleaned_file_path, index=False)

print(cleaned_data2.head())



FileNotFoundError: [Errno 2] No such file or directory: '/datasets/XWines_Full_100k_wines.csv'

In [13]:
import pandas as pd

file2_path = '/datasets/updated_wines.csv'
data2 = pd.read_csv(file2_path, dtype={'WineID': str}, low_memory=False)

invalid_rows = data2[pd.to_numeric(data2['Ratings'], errors='coerce').isna()]

print("Rows with invalid or empty 'Ratings':")
print(invalid_rows)


Rows with invalid or empty 'Ratings':
Empty DataFrame
Columns: [WineID, WineName, Type, Elaborate, Grapes, Harmonize, ABV, Body, Acidity, Code, Country, RegionID, RegionName, WineryID, WineryName, Website, Vintages, Ratings]
Index: []


In [14]:
import pandas as pd

file2_path = 'datasets/updated_wines.csv'

data2 = pd.read_csv(file2_path, dtype={'WineID': str}, low_memory=False)

data2['Ratings'] = pd.to_numeric(data2['Ratings'], errors='coerce')

data2['Ratings'] = data2['Ratings'].apply(lambda x: f"{x:.2f}" if pd.notna(x) else "")

data2.to_csv(file2_path, index=False)

print(f"Ratings formatted to two decimal places. The original file has been updated: {file2_path}")


Ratings formatted to two decimal places. The original file has been updated: /Users/ruiribeiro/Desktop/MEIA_ISEP_TEAM7_2024_2025_Challenge_2/updated_wines.csv
