In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('/Users/cashcash/Desktop/CSS/3 semester/Advanced Information Retrieval/project/archive/winemag-data-130k-v2.csv')

df

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulk√† Bianco,87,,Sicily & Sardinia,Etna,,Kerin O‚ÄôKeefe,@kerinokeefe,Nicosia 2013 Vulk√† Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Sp√§tlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben M√ºller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben M√ºller-Burggraef)
129967,129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gew√ºrztraminer,Domaine Gresser
129969,129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [3]:
print(df.isnull().sum())

Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64


## Data Cleaning

In [4]:
def clean_wine_data(df):
    """
    Complete wine data cleaning pipeline
    """
    # Create a copy to avoid modifying original data
    df_clean = df.copy()
    
    print("Initial data count:", len(df_clean))
    
    # 1. Handle minor missing values (direct removal)
    df_clean = df_clean.dropna(subset=['variety'])  # Only 1 record
    
    # 2. Handle price missing values (hierarchical imputation)
    print("Handling missing prices...")
    df_clean['price'] = df_clean.groupby(['variety', 'country'])['price'].transform(
        lambda x: x.fillna(x.median()) if not x.isnull().all() else x
    )
    global_price_median = df_clean['price'].median()
    df_clean['price'] = df_clean['price'].fillna(global_price_median)
    
    # 3. Handle geographic information missing values
    print("Handling missing geographic data...")
    df_clean['country'] = df_clean['country'].fillna('Unknown_Country')
    df_clean['province'] = df_clean['province'].fillna('Unknown_Province')
    df_clean['region_1'] = df_clean['region_1'].fillna(df_clean['province'] + '_Region')
    
    # 4. Handle taster information
    df_clean['taster_name'] = df_clean['taster_name'].fillna('Unknown_Taster')
    
    # 5. Create core features
    print("Creating features...")
    df_clean['structured_features'] = (
        df_clean['variety'] + ' ' + 
        df_clean['country'] + ' ' + 
        df_clean['region_1']
    )
    
    # Price categorization
    def price_category(price):
        if price < 25: return 'budget'
        elif price < 50: return 'mid_range' 
        else: return 'premium'
    
    df_clean['price_category'] = df_clean['price'].apply(price_category)
    
    print("Cleaning completed! Final data count:", len(df_clean))
    return df_clean

# Execute cleaning
df_cleaned = clean_wine_data(df)

# Verify cleaning results
print("\n=== Missing values after cleaning ===")
print(df_cleaned.isnull().sum())

Initial data count: 129971
Handling missing prices...
Handling missing geographic data...
Creating features...
Cleaning completed! Final data count: 129970

=== Missing values after cleaning ===
Unnamed: 0                   0
country                      0
description                  0
designation              37464
points                       0
price                        0
province                     0
region_1                     0
region_2                 79459
taster_name                  0
taster_twitter_handle    31212
title                        0
variety                      0
winery                       0
structured_features          0
price_category               0
dtype: int64


## Feature Engineering

In [5]:
def create_enhanced_features(df):
    """
    Create enhanced features with better weighting strategy
    Fixed based on team feedback:
    1. Add 'flavor_' prefix to each flavor term
    2. Replace spaces with underscores in variety/country/region names
    3. Ensure consistent feature formatting
    """
    print("üöÄ Creating enhanced features (fixed version)...")
    
    # Helper function to clean text features (replace spaces with underscores)
    def clean_text_feature(text):
        """Replace spaces with underscores in text features"""
        if pd.isna(text):
            return "unknown"
        return str(text).replace(' ', '_')
    
    # Extract flavor features - FIXED: add 'flavor_' prefix to each term
    def extract_flavor_features(description):
        """Extract flavor profiles with proper prefixing"""
        if pd.isna(description):
            return "flavor_neutral"
        
        description_lower = description.lower()
        flavor_terms = []
        
        # Define flavor keywords
        flavor_keywords = {
            'fruit': ['berry', 'cherry', 'apple', 'citrus', 'tropical', 'fruit', 'blackberry', 'raspberry'],
            'dry': ['dry', 'crisp', 'tannic'],
            'sweet': ['sweet', 'honey', 'ripe', 'jam'],
            'oak': ['oak', 'vanilla', 'toast', 'cedar'],
            'spice': ['spice', 'pepper', 'cinnamon', 'clove'],
            'herbal': ['herbal', 'grass', 'mineral', 'earth']
        }
        
        # Check flavor type in description
        for flavor_type, keywords in flavor_keywords.items():
            if any(keyword in description_lower for keyword in keywords):
                flavor_terms.append(f"flavor_{flavor_type}")  # FIXED: Add prefix here!
        
        return ' '.join(flavor_terms) if flavor_terms else "flavor_neutral"
    
    # 1. Clean text features (replace spaces with underscores)
    print("üîÑ Cleaning text features...")
    df['variety_clean'] = df['variety'].apply(clean_text_feature)
    df['country_clean'] = df['country'].apply(clean_text_feature)
    df['region_clean'] = df['region_1'].apply(clean_text_feature)
    df['winery_clean'] = df['winery'].apply(clean_text_feature)
    
    # 2. Extract flavor profiles (with fixed prefixing)
    print("üç∑ Extracting flavor profiles...")
    df['flavor_profile'] = df['description'].apply(extract_flavor_features)
    
    # 3. Feature Engineering - combine all features
    print("üîß Combining features...")
    df['enhanced_features'] = (
        "variety_" + df['variety_clean'] + ' ' +
        "country_" + df['country_clean'] + ' ' +
        "region_" + df['region_clean'] + ' ' +
        "pricecat_" + df['price_category'] + ' ' +
        "winery_" + df['winery_clean'] + ' ' +
        df['flavor_profile']  # Already has 'flavor_' prefixes
    )
    
    # 4. Create TF-IDF features with optimized parameters
    print("üìä Creating TF-IDF vectors...")
    vectorizer = TfidfVectorizer(
        max_features=1500,
        stop_words='english',
        lowercase=True,
        min_df=8,
        max_df=0.6,
        ngram_range=(1, 2),
        analyzer='word'
    )
    
    # FIXED: Train ONLY on enhanced_features column
    feature_vectors = vectorizer.fit_transform(df['enhanced_features'])
    
    # 5. Analyze feature learning
    feature_names = vectorizer.get_feature_names_out()
    variety_features = [f for f in feature_names if f.startswith('variety_')]
    country_features = [f for f in feature_names if f.startswith('country_')]
    region_features = [f for f in feature_names if f.startswith('region_')]
    flavor_features = [f for f in feature_names if f.startswith('flavor_')]
    
    print(f"‚úÖ Enhanced feature matrix: {feature_vectors.shape}")
    print(f"üìä Learned {len(variety_features)} variety features")
    print(f"üìä Learned {len(country_features)} country features") 
    print(f"üìä Learned {len(region_features)} region features")
    print(f"üç∑ Learned {len(flavor_features)} flavor features")
    
    # Show sample features to verify quality
    print(f"üîç Sample variety features: {variety_features[:6]}")
    print(f"üîç Sample country features: {country_features[:4]}")
    print(f"üîç Sample flavor features: {flavor_features[:8]}")
    
    # 6. Analyze flavor distribution (using cleaned data)
    print(f"\nüìà Flavor Profile Distribution:")
    for flavor_type in ['fruit', 'dry', 'sweet', 'oak', 'spice', 'herbal']:
        # Check if flavor_type exists in flavor_profile (with 'flavor_' prefix)
        count = df[df['flavor_profile'].str.contains(f'flavor_{flavor_type}', na=False)].shape[0]
        percentage = (count / len(df)) * 100
        print(f"   {flavor_type}: {count} wines ({percentage:.1f}%)")
    
    # 7. Return cleaned dataframe and feature vectors
    print("\n‚ú® Feature engineering completed successfully!")
    
    return df, feature_vectors, vectorizer

# Execute the fixed version
df_final, feature_vectors, vectorizer = create_enhanced_features(df_cleaned)

üöÄ Creating enhanced features (fixed version)...
üîÑ Cleaning text features...
üç∑ Extracting flavor profiles...
üîß Combining features...
üìä Creating TF-IDF vectors...
‚úÖ Enhanced feature matrix: (129970, 1500)
üìä Learned 258 variety features
üìä Learned 300 country features
üìä Learned 615 region features
üç∑ Learned 21 flavor features
üîç Sample variety features: ['variety_aglianico', 'variety_aglianico country_italy', 'variety_albari√±o', 'variety_albari√±o country_spain', 'variety_albari√±o country_us', 'variety_alvarinho']
üîç Sample country features: ['country_argentina', 'country_argentina region_cafayate', 'country_argentina region_luj√°n_de_cuyo', 'country_argentina region_mendoza']
üîç Sample flavor features: ['flavor_dry', 'flavor_dry flavor_herbal', 'flavor_dry flavor_oak', 'flavor_dry flavor_spice', 'flavor_dry flavor_sweet', 'flavor_fruit flavor_dry', 'flavor_fruit flavor_herbal', 'flavor_fruit flavor_oak']

üìà Flavor Profile Distribution:
   fruit: 109