In [1]:
import pandas as pd

In [2]:
# wines = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_1K_wines.csv", encoding="utf-8")
# ratings = pd.read_csv("XWines_Slim_1K_wines_150K_ratings\XWines_Slim_150K_ratings.csv", low_memory=False)

wines = pd.read_csv("All-XWines_Full_100K_wines_21M_ratings\XWines_Full_100K_wines.csv", encoding="utf-8")
ratings = pd.read_csv("All-XWines_Full_100K_wines_21M_ratings\XWines_Full_21M_ratings.csv", low_memory=False)

In [3]:
wines = wines[
    [
        "WineName",
        "Type",
        "Elaborate",
        "Grapes",
        "Harmonize",
        "Body",
        "Acidity",
        "Country",
        "RegionName",
        "WineryName"
    ]
]

In [4]:
wines['Grapes'] = wines['Grapes'].str.replace("[", " ").str.replace("]", " ").str.replace("'", "").str.strip()
wines['Grapes'].head()

0                Muscat/Moscato
1                    Ancellotta
2            Cabernet Sauvignon
3                Muscat/Moscato
4    Cabernet Sauvignon, Merlot
Name: Grapes, dtype: object

In [5]:
wines['Body'] = wines['Body'].str.replace('-', '')
wines['Body'].head()

0    Mediumbodied
1    Mediumbodied
2      Fullbodied
3    Mediumbodied
4      Fullbodied
Name: Body, dtype: object

In [6]:
wines["Harmonize"] = wines["Harmonize"].str.replace("[", " ").str.replace("]", " ").str.replace("'", "").str.strip()
wines["Harmonize"].head()

0                       Pork, Rich Fish, Shellfish
1    Beef, Barbecue, Codfish, Pasta, Pizza, Cheese
2                              Beef, Lamb, Poultry
3                                    Sweet Dessert
4                   Beef, Lamb, Game Meat, Poultry
Name: Harmonize, dtype: object

In [7]:
def check_special_characters(df, column_name, special_characters_pattern):
    rows_with_special_characters = df[df[column_name].str.contains(special_characters_pattern, regex=True, na=False)]
    row_count = len(rows_with_special_characters)
    print("Number of rows with special characters in {}: {}".format(column_name, row_count))
    print(rows_with_special_characters[[column_name]].drop_duplicates().sort_values(by=column_name))

In [8]:
import re
def remove_special_characters(df, column_name, characters_to_remove):
    df[column_name] = df[column_name].str.replace(f'[{re.escape(characters_to_remove)}]', ' ', regex=True)
    return df

In [9]:
characters_to_remove = '[!"#$%°&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [10]:
check_special_characters(wines, 'WineName', characters_to_remove)

Number of rows with special characters in WineName: 21435
                                               WineName
10426  !Vin Forster Ungeheuer Riesling Spätlese Trocken
81313                            #42 Meritage Red Blend
19474                            #Lou Côtes de Provence
97870       #Дичь Совиньон Блан (#Wild Sauvignon Blanc)
96732                               #ДляТебя (#For You)
...                                                 ...
93311                                   ナイアガラ (Niagara)
93346                         完熟甘口 ロゼ (Ripe Sweet Rosé)
93323                    実りの収穫 白 辛口 (Harvest White Dry)
93316                                        甲州 (Koshu)
93338                   限定醸造 甲州 (Limited Brewing Koshu)

[14528 rows x 1 columns]


In [11]:
wines = remove_special_characters(wines, 'WineName', characters_to_remove)
check_special_characters(wines, 'WineName', characters_to_remove)

Number of rows with special characters in WineName: 0
Empty DataFrame
Columns: [WineName]
Index: []


In [12]:
wines = remove_special_characters(wines, 'RegionName', characters_to_remove)
check_special_characters(wines, 'RegionName', characters_to_remove)

Number of rows with special characters in RegionName: 0
Empty DataFrame
Columns: [RegionName]
Index: []


In [13]:
wines = remove_special_characters(wines, 'WineryName', characters_to_remove)
check_special_characters(wines, 'WineryName', characters_to_remove)

Number of rows with special characters in WineryName: 0
Empty DataFrame
Columns: [WineryName]
Index: []


In [14]:
wines.isnull().sum()

WineName      0
Type          0
Elaborate     0
Grapes        0
Harmonize     0
Body          0
Acidity       0
Country       0
RegionName    0
WineryName    0
dtype: int64

In [15]:
wines['Type'] = wines['Type'].str.replace('/', ' ')
wines['Elaborate'] = wines['Elaborate'].str.replace('/', ' ')
wines['Grapes'] = wines['Grapes'].str.replace('/', ' ')


In [16]:
wines['Harmonize'] = wines['Harmonize'].str.replace(', ', ' ')

In [17]:
wines['Grapes'] = wines['Grapes'].str.replace(', ', ' ')

In [18]:
wines['Elaborate'] = wines['Elaborate'].str.replace('100%', '')

In [19]:
print(wines.head(3).to_markdown())

|    | WineName           | Type      | Elaborate   | Grapes             | Harmonize                                | Body         | Acidity   | Country   | RegionName   | WineryName   |
|---:|:-------------------|:----------|:------------|:-------------------|:-----------------------------------------|:-------------|:----------|:----------|:-------------|:-------------|
|  0 | Espumante Moscatel | Sparkling | Varietal    | Muscat Moscato     | Pork Rich Fish Shellfish                 | Mediumbodied | High      | Brazil    | Serra Gaúcha | Casa Perini  |
|  1 | Ancellotta         | Red       | Varietal    | Ancellotta         | Beef Barbecue Codfish Pasta Pizza Cheese | Mediumbodied | Medium    | Brazil    | Serra Gaúcha | Casa Perini  |
|  2 | Cabernet Sauvignon | Red       | Varietal    | Cabernet Sauvignon | Beef Lamb Poultry                        | Fullbodied   | High      | Brazil    | Serra Gaúcha | Castellamare |


In [20]:
wines.to_csv('wines.json', index=False)

In [21]:
# Initialize an empty list to hold the corpus
corpus = []

# Iterate through each row in the dataframe
for index, row in wines.iterrows():
    words = []
    for column in wines.columns:
        # Assuming each cell contains a string of words/phrases
        words.extend(word.lower() for word in row[column].split())  # Split the string into individual words/phrases and convert to lowercase
    corpus.append(words)

In [22]:
print(corpus[0])

['espumante', 'moscatel', 'sparkling', 'varietal', 'muscat', 'moscato', 'pork', 'rich', 'fish', 'shellfish', 'mediumbodied', 'high', 'brazil', 'serra', 'gaúcha', 'casa', 'perini']


In [23]:
import json

# Save the corpus to a JSON file
with open('corpus.json', 'w') as file:
    json.dump(corpus, file)

print("Corpus saved to corpus.json")

Corpus saved to corpus.json


In [25]:
import numpy as np
def load_glove_embeddings(filepath):
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load the GloVe embeddings (use the correct path to glove.6B.100d.txt)
glove_embeddings = load_glove_embeddings("glove.6B.100d.txt")


In [26]:
# Flatten your corpus into a list of unique words
corpus_flat = set(word for sentence in corpus for word in sentence)

# Identify OOV words (words not found in GloVe embeddings)
oov_words = [word for word in corpus_flat if word.lower() not in glove_embeddings]

print(f"Number of OOV words: {len(oov_words)}")
print(f"Some OOV words: {oov_words[:10]}")


Number of OOV words: 27367
Some OOV words: ['apostelhoeve', 'kaiora', 'saffirio', 'esquila', 'morissio', 'maresco', 'leglise', 'bisquertt', 'düppel', 'versátil']


In [27]:
from gensim.models import Word2Vec

# Train Word2Vec model on the corpus to learn embeddings for OOV words
word2vec_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=10)

# Get embeddings for OOV words
oov_embeddings = {word: word2vec_model.wv[word] for word in oov_words}

print(f"Sample OOV word embedding: {oov_embeddings[oov_words[0]]}")


Sample OOV word embedding: [-3.6973551e-02  6.1116549e-03 -3.1181129e-03 -1.1834876e-03
 -5.1282956e-03 -5.4737227e-03  2.1144595e-02  3.2943331e-02
  9.2102811e-03 -1.1437621e-02  9.6056676e-03  6.9698453e-04
  7.2877090e-03  6.9649788e-03 -2.6366498e-02 -3.3838782e-03
  1.3778426e-02 -8.6173052e-03  1.5774008e-02 -9.3775466e-03
 -5.3222487e-03  2.8995413e-03  1.4695640e-02  3.2706730e-02
 -1.7467400e-02  1.6450884e-02  2.2940093e-03  6.2082848e-03
 -2.4486054e-02 -3.1330134e-03  1.6579291e-02 -1.2886627e-03
 -2.4687003e-03 -1.6258817e-02 -8.8629499e-03  1.0734662e-02
  7.5168824e-03  5.1195589e-03  1.1173854e-02 -6.8944390e-03
  2.4636654e-02 -1.2731231e-02 -2.1951059e-02  4.1299406e-03
 -2.1733660e-02 -9.9974126e-03  1.2963517e-02  1.2326274e-02
 -4.3581710e-03  1.4111800e-02  3.0347452e-02 -8.6685866e-03
 -1.7825153e-02  1.4331244e-02  1.1993338e-03  1.1887629e-02
  6.6857017e-03 -2.8683390e-02 -1.0807372e-02 -3.3335172e-04
 -1.5140613e-02  3.5930190e-03  1.9154662e-03 -8.1963195e-

In [28]:
# Merge GloVe and OOV embeddings
final_embeddings = glove_embeddings.copy()
final_embeddings.update(oov_embeddings)

print(f"Total words in final embeddings: {len(final_embeddings)}")


Total words in final embeddings: 427367


In [29]:
import pickle

# Save the combined embeddings
with open('combined_glove_oov_embeddings.pkl', 'wb') as f:
    pickle.dump(final_embeddings, f)


In [30]:
# Function to retrieve a vector for a given word
def get_word_vector(word):
    word = word.lower()  # Make sure to handle case sensitivity
    if word in final_embeddings:
        return final_embeddings[word]
    else:
        print(f"'{word}' not found in the embeddings.")
        return None

# Example: Retrieve a vector for a specific wine name or word
word_to_lookup = "aquarela"  # Replace with specific word
vector = get_word_vector(word_to_lookup)

if vector is not None:
    print(f"Vector for '{word_to_lookup}': {vector}")


Vector for 'aquarela': [-0.03038387 -0.01121894 -0.01346635  0.02557348 -0.00360102 -0.0027586
 -0.00126475  0.01695323  0.00525266 -0.00320059  0.00241605  0.01213203
  0.00026962  0.02358433 -0.02203885 -0.00280847  0.0051872   0.0109082
 -0.00180579 -0.00077052 -0.00664009  0.00309005  0.00454308  0.02180259
 -0.00268304 -0.00391799  0.00621556  0.03396574 -0.02207505  0.01045749
 -0.00098376 -0.00345203 -0.00041609 -0.01064812 -0.00385459 -0.00128129
  0.00556299  0.00384221  0.017803   -0.01698707  0.00144671 -0.00844301
 -0.0154839  -0.00359151  0.00056433 -0.01112938  0.02566032  0.01553847
 -0.0101446   0.00502182 -0.00251705 -0.00176857 -0.00326272 -0.01174307
 -0.01402416  0.02284874 -0.00078858 -0.01730207  0.00726611  0.01480921
 -0.01803309  0.01335229  0.00483819  0.01200896 -0.0314816  -0.0068158
 -0.01301767  0.00288041 -0.00848341  0.00408786  0.02498866  0.01487888
  0.01315355  0.01069675  0.02209724  0.01142865 -0.00489316  0.0197099
  0.01416789  0.00791656 -0.0132