In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sebau12/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
np.random.rand(2)

array([0.89470033, 0.76428743])

In [4]:
df = pd.read_csv("winemag-data-130k-v2.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
nombres_columnas = df.columns.tolist()

print(nombres_columnas)


['id', 'country', 'description', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery']


In [6]:
nan_count = df.isna().sum()

print(nan_count)


id                           0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64


In [7]:
nombres_columnas.remove("price")
nombres_columnas.remove("points")
nombres_columnas.remove("country")

In [8]:
nombres_columnas

['id',
 'description',
 'designation',
 'province',
 'region_1',
 'region_2',
 'taster_name',
 'taster_twitter_handle',
 'title',
 'variety',
 'winery']

In [9]:
df[nombres_columnas] = df[nombres_columnas].fillna("")
df['price'] = df['price'].fillna(0)
df['country'] = df['country'].fillna("unknown")

In [10]:
df.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,0.0,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [11]:
df.dtypes

id                         int64
country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

In [12]:
text_columns = df.select_dtypes(include=['object'])
df['text'] = text_columns.apply(lambda row: ' '.join(row.astype(str)), axis=1)
df['text'] = df['text'].apply(lambda x: str(x).replace(',', ''))
df['text'] = df['text'].str.lower()
df['text']

0         italy aromas include tropical fruit broom brim...
1         portugal this is ripe and fruity a wine that i...
2         us tart and snappy the flavors of lime flesh a...
3         us pineapple rind lemon pith and orange blosso...
4         us much like the regular bottling from 2012 th...
                                ...                        
129966    germany notes of honeysuckle and cantaloupe sw...
129967    us citation is given as much as a decade of bo...
129968    france well-drained gravel soil gives this win...
129969    france a dry style of pinot gris this is crisp...
129970    france big rich and off-dry this is powered by...
Name: text, Length: 129971, dtype: object

In [13]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

In [14]:
def tokenize(text):
    tokens = word_tokenize(text, language="english")
    return [word for word in tokens if word.isalpha()]

def filter_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

def stem_words(tokens):
    return [stemmer.stem(word) for word in tokens]

In [15]:
df['tokens'] = df['text'].apply(tokenize)
df['filtered_tokens'] = df['tokens'].apply(filter_stopwords)
df['stemmed_tokens'] = df['filtered_tokens'].apply(stem_words)

In [16]:
df[['text', 'tokens', 'filtered_tokens', 'stemmed_tokens']]

Unnamed: 0,text,tokens,filtered_tokens,stemmed_tokens
0,italy aromas include tropical fruit broom brim...,"[italy, aromas, include, tropical, fruit, broo...","[italy, aromas, include, tropical, fruit, broo...","[itali, aroma, includ, tropic, fruit, broom, b..."
1,portugal this is ripe and fruity a wine that i...,"[portugal, this, is, ripe, and, fruity, a, win...","[portugal, ripe, fruity, wine, smooth, still, ...","[portug, ripe, fruiti, wine, smooth, still, st..."
2,us tart and snappy the flavors of lime flesh a...,"[us, tart, and, snappy, the, flavors, of, lime...","[us, tart, snappy, flavors, lime, flesh, rind,...","[us, tart, snappi, flavor, lime, flesh, rind, ..."
3,us pineapple rind lemon pith and orange blosso...,"[us, pineapple, rind, lemon, pith, and, orange...","[us, pineapple, rind, lemon, pith, orange, blo...","[us, pineappl, rind, lemon, pith, orang, bloss..."
4,us much like the regular bottling from 2012 th...,"[us, much, like, the, regular, bottling, from,...","[us, much, like, regular, bottling, comes, acr...","[us, much, like, regular, bottl, come, across,..."
...,...,...,...,...
129966,germany notes of honeysuckle and cantaloupe sw...,"[germany, notes, of, honeysuckle, and, cantalo...","[germany, notes, honeysuckle, cantaloupe, swee...","[germani, note, honeysuckl, cantaloup, sweeten..."
129967,us citation is given as much as a decade of bo...,"[us, citation, is, given, as, much, as, a, dec...","[us, citation, given, much, decade, bottle, ag...","[us, citat, given, much, decad, bottl, age, pr..."
129968,france well-drained gravel soil gives this win...,"[france, gravel, soil, gives, this, wine, its,...","[france, gravel, soil, gives, wine, crisp, dry...","[franc, gravel, soil, give, wine, crisp, dri, ..."
129969,france a dry style of pinot gris this is crisp...,"[france, a, dry, style, of, pinot, gris, this,...","[france, dry, style, pinot, gris, crisp, acidi...","[franc, dri, style, pinot, gris, crisp, acid, ..."


In [17]:
df.to_csv('preprocess_data.csv', index=False)