### Importing the dataset

In [152]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [153]:
# Use forward slashes in the paths: they resolve on every OS, whereas a
# backslash before "X" forms the invalid escape sequence "\X" and only works
# as a separator on Windows.
wines = pd.read_csv("XWines_Test_100_wines_1K_ratings/XWines_Test_100_wines.csv", encoding="utf-8")
ratings = pd.read_csv("XWines_Test_100_wines_1K_ratings/XWines_Test_1K_ratings.csv", low_memory=False)

In [154]:
wines.head(2)

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Website,Vintages
0,100062,Origem Merlot,Red,Varietal/100%,['Merlot'],"['Beef', 'Lamb', 'Veal', 'Grilled', 'Pizza', '...",13.0,Full-bodied,Medium,BR,Brazil,1002,Vale dos Vinhedos,10014,Casa Valduga,http://www.casavalduga.com.br,"[2020, 2019, 2018, 2017, 2016, 2015, 2014, 201..."
1,100191,Reserva Chardonnay,White,Varietal/100%,['Chardonnay'],"['Rich Fish', 'Seafood', 'Risotto', 'Poultry',...",13.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10000,Aurora,http://www.vinicolaaurora.com.br,"[2021, 2020, 2019, 2018, 2017, 2016, 2015, 201..."


In [155]:
ratings.head(5)

Unnamed: 0,RatingID,UserID,WineID,Vintage,Rating,Date
0,3211,1209683,111478,1959,4.5,2016-08-08 00:50:22
1,27878,1209980,111478,1975,4.0,2018-08-12 17:09:39
2,31227,1258705,111478,1975,5.0,2014-11-16 19:52:38
3,41946,1139706,111478,1979,5.0,2014-12-22 02:30:15
4,61700,1240747,111478,1982,4.5,2019-10-21 02:01:10


### Preprocessing

#### Selecting the variables

In [156]:
wines.columns

Index(['WineID', 'WineName', 'Type', 'Elaborate', 'Grapes', 'Harmonize', 'ABV',
       'Body', 'Acidity', 'Code', 'Country', 'RegionID', 'RegionName',
       'WineryID', 'WineryName', 'Website', 'Vintages'],
      dtype='object')

In [157]:
wines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   WineID      100 non-null    int64  
 1   WineName    100 non-null    object 
 2   Type        100 non-null    object 
 3   Elaborate   100 non-null    object 
 4   Grapes      100 non-null    object 
 5   Harmonize   100 non-null    object 
 6   ABV         100 non-null    float64
 7   Body        100 non-null    object 
 8   Acidity     100 non-null    object 
 9   Code        100 non-null    object 
 10  Country     100 non-null    object 
 11  RegionID    100 non-null    int64  
 12  RegionName  100 non-null    object 
 13  WineryID    100 non-null    int64  
 14  WineryName  100 non-null    object 
 15  Website     100 non-null    object 
 16  Vintages    100 non-null    object 
dtypes: float64(1), int64(3), object(13)
memory usage: 13.4+ KB


In [158]:
# Keep only the columns used downstream; Code, RegionName, WineryName, Website
# and Vintages are dropped. `.copy()` makes the result an independent frame so
# the column assignments in the following cells operate on their own data and
# cannot trigger pandas' view-versus-copy ambiguity.
wines = wines[
    [
        "WineID",
        "WineName",
        "Type",
        "Elaborate",
        "Grapes",
        "Harmonize",
        "ABV",
        "Body",
        "Acidity",
        "Country",
        "RegionID",
        "WineryID",
    ]
].copy()

#### Replacing special characters

In [159]:
wines.head(3)

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Country,RegionID,WineryID
0,100062,Origem Merlot,Red,Varietal/100%,['Merlot'],"['Beef', 'Lamb', 'Veal', 'Grilled', 'Pizza', '...",13.0,Full-bodied,Medium,Brazil,1002,10014
1,100191,Reserva Chardonnay,White,Varietal/100%,['Chardonnay'],"['Rich Fish', 'Seafood', 'Risotto', 'Poultry',...",13.0,Medium-bodied,Medium,Brazil,1001,10000
2,101847,Dona Antonia Porto Reserva Tawny,Dessert/Port,Assemblage/Blend,"['Touriga Nacional', 'Touriga Franca', 'Tinta ...","['Appetizer', 'Sweet Dessert', 'Blue Cheese']",20.0,Very full-bodied,High,Portugal,1031,10674


In [160]:
wines['Grapes'].head()

0                                           ['Merlot']
1                                       ['Chardonnay']
2    ['Touriga Nacional', 'Touriga Franca', 'Tinta ...
3    ['Tinta Amarela', 'Tinta Barroca', 'Touriga Fr...
4                  ['Loureiro', 'Alvarinho', 'Arinto']
Name: Grapes, dtype: object

In [161]:
# Strip the list punctuation from the stringified list,
# e.g. "['Merlot']" -> "Merlot".
# regex=False states the literal match explicitly: "[" on its own is not a
# valid regular expression, and the default of `regex` differs between
# pandas 1.x and 2.x.
wines['Grapes'] = (
    wines['Grapes']
    .str.replace("[", " ", regex=False)
    .str.replace("]", " ", regex=False)
    .str.replace("'", "", regex=False)
    .str.strip()
)
wines['Grapes'].head()

0                                               Merlot
1                                           Chardonnay
2    Touriga Nacional, Touriga Franca, Tinta Barroc...
3    Tinta Amarela, Tinta Barroca, Touriga Franca, ...
4                          Loureiro, Alvarinho, Arinto
Name: Grapes, dtype: object

In [162]:
# Remove the hyphen so each level becomes one word ("Full-bodied" -> "Fullbodied").
# regex=False keeps "-" a plain literal regardless of the pandas version.
wines['Body'] = wines['Body'].str.replace('-', '', regex=False)
wines['Body'].head()

0          Fullbodied
1        Mediumbodied
2     Very fullbodied
3     Very fullbodied
4    Very lightbodied
Name: Body, dtype: object

In [163]:
# Strip the list punctuation from the stringified list,
# e.g. "['Beef', 'Lamb']" -> "Beef, Lamb".
# regex=False states the literal match explicitly: "[" on its own is not a
# valid regular expression, and the default of `regex` differs between
# pandas 1.x and 2.x.
wines["Harmonize"] = (
    wines["Harmonize"]
    .str.replace("[", " ", regex=False)
    .str.replace("]", " ", regex=False)
    .str.replace("'", "", regex=False)
    .str.strip()
)
wines["Harmonize"].head()

0             Beef, Lamb, Veal, Grilled, Pizza, Pasta
1    Rich Fish, Seafood, Risotto, Poultry, Vegetarian
2               Appetizer, Sweet Dessert, Blue Cheese
3             Sweet Dessert, Cake, Fruit, Soft Cheese
4       Fish, Shellfish, Vegetarian, Appetizer, Snack
Name: Harmonize, dtype: object

In [164]:
wines.head(3)

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,Harmonize,ABV,Body,Acidity,Country,RegionID,WineryID
0,100062,Origem Merlot,Red,Varietal/100%,Merlot,"Beef, Lamb, Veal, Grilled, Pizza, Pasta",13.0,Fullbodied,Medium,Brazil,1002,10014
1,100191,Reserva Chardonnay,White,Varietal/100%,Chardonnay,"Rich Fish, Seafood, Risotto, Poultry, Vegetarian",13.0,Mediumbodied,Medium,Brazil,1001,10000
2,101847,Dona Antonia Porto Reserva Tawny,Dessert/Port,Assemblage/Blend,"Touriga Nacional, Touriga Franca, Tinta Barroc...","Appetizer, Sweet Dessert, Blue Cheese",20.0,Very fullbodied,High,Portugal,1031,10674


In [165]:
def check_special_characters(df, column_name, special_characters_pattern):
    """Report which values of a text column match a regular expression.

    Prints the number of matching rows, followed by the distinct matching
    values of the column sorted by value. Missing values are treated as
    non-matching. Nothing is returned.

    Args:
        df: DataFrame holding the column to inspect.
        column_name: Name of a string column in ``df``.
        special_characters_pattern: Regular expression passed to
            ``Series.str.contains``.
    """
    is_match = df[column_name].str.contains(special_characters_pattern, regex=True, na=False)
    flagged = df[is_match]
    summary = "Number of rows with special characters in {}: {}"
    print(summary.format(column_name, len(flagged)))
    distinct_values = flagged[[column_name]].drop_duplicates()
    print(distinct_values.sort_values(by=column_name))

In [166]:
import re
def remove_special_characters(df, column_name, characters_to_remove):
    """Replace every listed character in a text column with a single space.

    Each character of ``characters_to_remove`` is escaped and placed inside
    one regex character class, so the argument is a plain list of characters,
    not a pattern. The column is overwritten on ``df`` itself, and that same
    frame is returned.

    Args:
        df: DataFrame holding the column to clean (modified in place).
        column_name: Name of a string column in ``df``.
        characters_to_remove: String whose individual characters are replaced.

    Returns:
        The same ``df`` object, with the column cleaned.
    """
    char_class = "[" + re.escape(characters_to_remove) + "]"
    cleaned = df[column_name].str.replace(char_class, ' ', regex=True)
    df[column_name] = cleaned
    return df

In [167]:
# Punctuation (plus the degree sign) to look for and strip from text columns.
# The literal is written as a regex character class: check_special_characters
# passes it to str.contains as a pattern, whereas remove_special_characters
# escapes it and wraps it in its own [...], so there the outer brackets are
# simply two more characters to remove.
characters_to_remove = '[!"#$%°&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [168]:
check_special_characters(wines, 'WineName', characters_to_remove)

Number of rows with special characters in WineName: 20
                                        WineName
53                                Barbera d'Alba
48                             Brachetto d'Acqui
75                     Cabernet Sauvignon-Shiraz
35                   Chablis 1er Cru 'Montmains'
25          Château Chemin Royal Moulis-en-Médoc
29                        Cinsault-Grenache Rosé
22                Coteaux d'Aix-en-Provence Rosé
50                         Dulcis Moscato d'Asti
31           Les Fuées Chambolle-Musigny 1er Cru
34                        Marc d'Alsace Riesling
37     Meursault 1er Cru 'La Pièce Sous Le Bois'
40                        Nicolon Barbera d'Alba
28                 Nuits-St-Georges Les Plateaux
24                                Pessac-Léognan
21  Pommard Premier Cru 'Grand Clos des Épenots'
63   Rare Cream Sherry Superior (Solera Reserva)
47                         Riserva Prêt-A-Porter
19          Sauternes (Premier Grand Cru Classé)
52            

In [169]:
# Replace the flagged characters in WineName with spaces, then re-run the
# check to confirm that nothing matches any more.
wines = remove_special_characters(wines, 'WineName', characters_to_remove)
check_special_characters(wines, 'WineName', characters_to_remove)

Number of rows with special characters in WineName: 0
Empty DataFrame
Columns: [WineName]
Index: []


#### Null values

In [170]:
wines.isnull().sum()

WineID       0
WineName     0
Type         0
Elaborate    0
Grapes       0
Harmonize    0
ABV          0
Body         0
Acidity      0
Country      0
RegionID     0
WineryID     0
dtype: int64

#### Splitting list variables

In [171]:
def split_and_expand_columns(df, column_name, delimiter="/"):
    """Split a delimited text column into numbered columns.

    The values of ``column_name`` are split on ``delimiter`` and spread over
    new columns named ``<column_name>1``, ``<column_name>2``, ... which are
    appended after the existing columns. Rows with fewer parts than the
    longest row get empty strings in the surplus columns. The original
    column is removed from the result.

    Only the newly created columns are padded with empty strings; missing
    values in any other column are left untouched, so numeric columns keep
    their dtype.

    Args:
        df: Source DataFrame (not modified).
        column_name: Name of the string column to split.
        delimiter: Separator passed to ``Series.str.split``.

    Returns:
        A new DataFrame with ``column_name`` replaced by the numbered columns.
    """
    expanded_columns = df[column_name].str.split(delimiter, expand=True)
    expanded_columns.columns = [
        f"{column_name}{i + 1}" for i in range(expanded_columns.shape[1])
    ]
    # Pad short rows here rather than on the concatenated frame, so that NaNs
    # in unrelated columns are not silently turned into "".
    expanded_columns = expanded_columns.fillna("")

    return pd.concat([df.drop(columns=[column_name]), expanded_columns], axis=1)

In [172]:
wines['Harmonize'].head()

0             Beef, Lamb, Veal, Grilled, Pizza, Pasta
1    Rich Fish, Seafood, Risotto, Poultry, Vegetarian
2               Appetizer, Sweet Dessert, Blue Cheese
3             Sweet Dessert, Cake, Fruit, Soft Cheese
4       Fish, Shellfish, Vegetarian, Appetizer, Snack
Name: Harmonize, dtype: object

In [173]:
# The cleaned pairings are separated by ", ", so split on that instead of the
# default "/".
wines = split_and_expand_columns(wines, "Harmonize", delimiter=", ")

In [174]:
# Select the split columns by name rather than by position, so the preview
# stays correct if the number of Harmonize columns or the column order changes.
wines.filter(regex=r"^Harmonize\d+$").head()

Unnamed: 0,Harmonize1,Harmonize2,Harmonize3,Harmonize4,Harmonize5,Harmonize6
0,Beef,Lamb,Veal,Grilled,Pizza,Pasta
1,Rich Fish,Seafood,Risotto,Poultry,Vegetarian,
2,Appetizer,Sweet Dessert,Blue Cheese,,,
3,Sweet Dessert,Cake,Fruit,Soft Cheese,,
4,Fish,Shellfish,Vegetarian,Appetizer,Snack,


In [175]:
# Split on ", " only: a value that contains "/" (e.g. "Muscat/Moscato") stays
# a single grape entry.
wines = split_and_expand_columns(wines, "Grapes", delimiter=", ")

In [176]:
# Select the split columns by name rather than by position, and fix the seed
# so the same rows are shown on every run.
wines.filter(regex=r"^Grapes\d+$").sample(15, random_state=42)

Unnamed: 0,Grapes1,Grapes2,Grapes3,Grapes4,Grapes5,Grapes6,Grapes7,Grapes8,Grapes9
45,Negroamaro,Merlot,Refosco,,,,,,
48,Brachetto,,,,,,,,
6,Alicante Bouschet,Aragonez,Trincadeira,,,,,,
2,Touriga Nacional,Touriga Franca,Tinta Barroca,Tinta Amarela,,,,,
12,Azal Tinto,Vinhão,Padeiro,,,,,,
42,Chardonnay,,,,,,,,
10,Touriga Franca,Touriga Nacional,Tinta Roriz,Tinta Barroca,,,,,
29,Grenache,Cinsault,,,,,,,
50,Muscat/Moscato,,,,,,,,
60,Fiano,,,,,,,,


In [177]:
wines['Elaborate'].value_counts()

Elaborate
Varietal/100%                        58
Assemblage/Blend                     25
Varietal/>75%                         5
Assemblage/Bordeaux Red Blend         4
Assemblage/Portuguese Red Blend       2
Assemblage/Provence Rosé Blend        2
Assemblage/Valpolicella Red Blend     2
Assemblage/Port Blend                 1
Assemblage/Rhône Red Blend            1
Name: count, dtype: int64

In [178]:
# Default "/" delimiter: "Varietal/100%" -> Elaborate1="Varietal", Elaborate2="100%".
wines = split_and_expand_columns(wines, "Elaborate")

In [179]:
# Select the split columns by name rather than by position, and fix the seed
# so the same rows are shown on every run.
wines.filter(regex=r"^Elaborate\d+$").sample(10, random_state=42)

Unnamed: 0,Elaborate1,Elaborate2
16,Varietal,100%
15,Assemblage,Blend
75,Assemblage,Blend
70,Varietal,100%
4,Assemblage,Blend
82,Varietal,>75%
52,Varietal,100%
53,Varietal,100%
73,Assemblage,Blend
78,Varietal,100%


In [180]:
wines['Type'].value_counts()

Type
Red             53
White           19
Rosé             8
Dessert          8
Sparkling        7
Dessert/Port     5
Name: count, dtype: int64

In [181]:
# Default "/" delimiter: "Dessert/Port" -> Type1="Dessert", Type2="Port";
# single-valued types leave Type2 empty.
wines = split_and_expand_columns(wines, "Type")

In [182]:
# Select the split columns by name rather than by position, and fix the seed
# so the same rows are shown on every run.
wines.filter(regex=r"^Type\d+$").sample(10, random_state=42)

Unnamed: 0,Type1,Type2
57,Red,
39,Red,
32,Dessert,
79,Red,
16,White,
46,White,
98,White,
29,Rosé,
83,Dessert,
56,White,


#### Alcohol by volume (ABV) outliers

In [183]:
wines['ABV'].describe()

count    100.000000
mean      13.659000
std        4.258375
min        4.000000
25%       12.875000
50%       13.350000
75%       14.300000
max       48.000000
Name: ABV, dtype: float64

In [184]:
wines['ABV'].value_counts()

ABV
13.0    24
13.5     9
14.0     9
14.5     8
12.5     8
12.0     5
15.5     4
11.5     3
20.0     3
15.0     3
19.5     2
14.1     2
14.3     2
4.0      1
14.8     1
14.6     1
14.7     1
13.4     1
14.2     1
13.6     1
13.3     1
5.5      1
5.0      1
9.5      1
6.0      1
48.0     1
13.8     1
11.2     1
10.5     1
10.0     1
11.0     1
Name: count, dtype: int64

In [185]:
def transform_values(df, column_name, lower=8, upper=15):
    """Clamp a numeric column to the closed interval ``[lower, upper]``.

    Values above ``upper`` are set to ``upper`` and values below ``lower``
    are set to ``lower``; everything in between (and any missing value) is
    left as it is. The column is overwritten on ``df`` itself.

    Args:
        df: DataFrame holding the column (modified in place).
        column_name: Name of the numeric column to clamp.
        lower: Smallest value allowed after clamping. Defaults to 8.
        upper: Largest value allowed after clamping. Defaults to 15.

    Returns:
        The same ``df`` object, with the column clamped.
    """
    df[column_name] = df[column_name].clip(lower=lower, upper=upper)
    return df

In [186]:
# Clamp ABV to the 8-15 range hard-coded in transform_values; for example the
# 48.0 and 20.0 entries become 15.0 and the 4.0 entry becomes 8.0.
wines = transform_values(wines, 'ABV')

In [187]:
wines['ABV'].describe()

count    100.000000
mean      13.184000
std        1.568421
min        8.000000
25%       12.875000
50%       13.350000
75%       14.300000
max       15.000000
Name: ABV, dtype: float64

In [188]:
wines['ABV'].value_counts()

ABV
13.0    24
15.0    13
14.0     9
13.5     9
14.5     8
12.5     8
12.0     5
8.0      4
11.5     3
14.1     2
14.3     2
13.8     1
11.2     1
10.5     1
9.5      1
13.3     1
13.6     1
14.2     1
13.4     1
10.0     1
14.7     1
14.6     1
14.8     1
11.0     1
Name: count, dtype: int64

### Exporting

In [189]:
# Write the cleaned, column-expanded frame to disk without the row index.
wines.to_csv('XWines_new.csv', index=False)

Combine the text variables into a single `Attributes` column, which is then vectorized with TF-IDF.

In [190]:
# Replace any missing value with an empty string so that every column can be
# joined as text when the Attributes column is built.
wines = wines.fillna("")

In [191]:
# Text columns that are concatenated as they are.
common_columns = ['WineName', 'Body', 'Acidity', 'Country']


def _numbered_columns(prefix):
    """Return the `<prefix><n>` columns currently in `wines`, in frame order."""
    return [
        col for col in wines.columns
        if col.startswith(prefix) and col[len(prefix):].isdigit()
    ]


# Discover the split columns from the frame itself instead of hard-coding how
# many of each exist: the count depends on the longest list in the data, so a
# fixed range raises KeyError (or silently omits columns) on another dataset.
grapes_columns = _numbered_columns('Grapes')
harmonize_columns = _numbered_columns('Harmonize')
type_columns = _numbered_columns('Type')
elaborate_columns = _numbered_columns('Elaborate')
all_columns = common_columns + grapes_columns + harmonize_columns + type_columns + elaborate_columns

# One space-separated text per wine; this is the document fed to TF-IDF.
wines['Attributes'] = wines[all_columns].astype(str).agg(' '.join, axis=1)

In [196]:
wines.head(3)

Unnamed: 0,WineID,ABV,RegionID,WineryID,Attributes
0,100062,13.0,1002,10014,Origem Merlot Fullbodied Medium Brazil Merlot ...
1,100191,13.0,1001,10000,Reserva Chardonnay Mediumbodied Medium Brazil ...
2,101847,15.0,1031,10674,Dona Antonia Porto Reserva Tawny Very fullbodi...


In [193]:
# Remove the columns that were merged into Attributes. Rebinding (instead of
# inplace=True) and errors="ignore" make the cell safe to re-run: columns that
# are already gone no longer raise KeyError.
wines = wines.drop(columns=all_columns, errors="ignore")

In [197]:
# NOTE(review): the file name spells "arrtibutes" (sic). It is left unchanged
# because other code may read this exact path; confirm before renaming.
wines.to_csv('XWines_with_arrtibutes.csv', index=False)

Vectorize the `Attributes` column with TF-IDF:

In [194]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary on the combined Attributes text and encode it as a
# matrix with one row per wine; the shape is displayed as (rows, terms).
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(wines['Attributes'])
tfidf_matrix.shape

(100, 375)

In [195]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all TF-IDF rows (wine-by-wine matrix).
# NOTE(review): this equals cosine similarity only while the vectorizer
# L2-normalises its rows (scikit-learn's documented default) - confirm if the
# TfidfVectorizer settings change.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [198]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [199]:
# Load datasets
# NOTE(review): this rebinds `wines` to a separate, pre-built feature table
# ("wines_features.csv"), replacing the frame prepared in the cells above.
wines = pd.read_csv("wines_features.csv", encoding="utf-8")
# Forward slash in the path: resolves on every OS and avoids the invalid "\X"
# escape sequence that a backslash separator would form.
ratings = pd.read_csv("XWines_Test_100_wines_1K_ratings/XWines_Test_1K_ratings.csv", low_memory=False)

In [200]:
wines.head(2)

Unnamed: 0,WineID,WineName,Type,Elaborate,Grapes,ABV,Body,Acidity,Code,Country,RegionID,RegionName,WineryID,WineryName,Aperitif,Appetizer,Barbecue,Beef,Blue Cheese,Cake,Cheese,Chicken,Chocolate,Codfish,Cold Cuts,Cream,Cured Meat,Dessert,Duck,Fish,French Fries,Fruit,Fruit Dessert,Game Meat,Goat Cheese,Grilled,Ham,Hard Cheese,Lamb,Lean Fish,Light Stews,Maturated Cheese,Mushrooms,Pasta,Pizza,Pork,Poultry,Rich Fish,Risotto,Salad,Seafood,Shellfish,Snack,Soft Cheese,Soufflé,Spicy Food,Sweet Dessert,Tomato Dishes,Veal,Vegetarian,avg_rating,total_ratings
0,100001,Espumante Moscatel,Sparkling,Varietal/100%,Muscat/Moscato,7.5,Medium-bodied,High,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,3.838462,520
1,100002,Ancellotta,Red,Varietal/100%,Ancellotta,12.0,Medium-bodied,Medium,BR,Brazil,1001,Serra Gaúcha,10001,Casa Perini,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3.25,4
