In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
import os
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Load spaCy's small English pipeline with the parser, textcat and ner
# components disabled; custom_tokenizer below only reads per-token flags
# (is_stop, like_num, is_punct, is_alpha) and lemma_.
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [5]:
# Load the Gamay variety table, using the first CSV column as the index.
# NOTE(review): a later cell re-reads this same file from '../data/...'
# rather than '../../data/...' — confirm which relative path is correct.
df = pd.read_csv('../../data/dfs_variety/Gamay.csv', index_col = 0)

In [6]:
def custom_tokenizer(text):
    '''
    Run ``text`` through the module-level spaCy pipeline ``nlp`` and return
    the lemmas of the tokens that survive filtering.

    A token is kept only when all of the following hold:
      * its length is at least 2,
      * it is not flagged as a stop word,
      * it does not look like a number,
      * it is not punctuation,
      * it is purely alphabetic.

    Returns a list of lemma strings in document order.
    '''
    return [
        tok.lemma_
        for tok in nlp(text)
        if len(tok) >= 2
        and not tok.is_stop
        and not tok.like_num
        and not tok.is_punct
        and tok.is_alpha
    ]

In [7]:
# Bag-of-words vectorizer that delegates tokenization to custom_tokenizer.
# Unigrams only; with float min_df / max_df, terms appearing in fewer than 1%
# or more than 99% of the documents are dropped from the vocabulary.
bow = CountVectorizer(tokenizer=custom_tokenizer,
                      # token_pattern is ignored once a tokenizer is supplied;
                      # setting it to None avoids scikit-learn's
                      # "token_pattern will not be used" warning.
                      token_pattern=None,
                      ngram_range=(1, 1),
                      min_df=0.01,
                      max_df=0.99)


In [30]:
# Word counts for the Gamay descriptions only: vectorize, total each term's
# occurrences across all descriptions, and save the ranking to 'Gamayw.csv'.
# NOTE(review): this cell reads from '../data/...' while the other cells use
# '../../data/...' — confirm which relative path is correct.
df = pd.read_csv('../data/dfs_variety/Gamay.csv', index_col=0)
corpus = df['description']

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit() and then transform() would run the spaCy tokenizer over
# every description twice.
bags_transform = bow.fit_transform(corpus)
bags_fit = bow  # fit() returns the vectorizer itself; name kept for other cells

# Column sums give the total number of occurrences of each vocabulary term.
sum_words = bags_transform.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in bags_fit.vocabulary_.items()]

# Most frequent terms first (sorted() is stable, so ties keep vocabulary order).
words_freq_sort = sorted(words_freq, key=lambda x: x[1], reverse=True)
word_count = pd.DataFrame(words_freq_sort, columns=['word', 'word count'])
word_count.to_csv('Gamayw.csv')



In [39]:
# Per-variety CSV file names; each name is appended to the data directory
# path by the word-count loop. Every entry carries exactly one '.csv' suffix
# (the last entry previously had a doubled '.csv.csv' extension).
wine_variety_list = ['Pinot Noir.csv', 'Chardonnay.csv', 'Cabernet Sauvignon.csv', 'Red Blend.csv',
       'Bordeaux-style Red Blend.csv', 'Riesling.csv', 'Sauvignon Blanc.csv', 'Syrah.csv',
       'Rosé.csv', 'Merlot.csv', 'Nebbiolo.csv', 'Zinfandel.csv', 'Sangiovese.csv', 'Malbec.csv',
       'Portuguese Red.csv', 'White Blend.csv', 'Sparkling Blend.csv', 'Tempranillo.csv',
       'Rhône-style Red Blend.csv', 'Pinot Gris.csv', 'Champagne Blend.csv',
       'Cabernet Franc.csv', 'Grüner Veltliner.csv', 'Portuguese White.csv',
       'Bordeaux-style White Blend.csv', 'Pinot Grigio.csv', 'Gamay.csv', 'Gewürztraminer.csv',
       'Viognier.csv', 'Shiraz.csv', 'Petite Sirah.csv', 'Sangiovese Grosso.csv', 'Barbera.csv',
       'Port.csv', 'Grenache.csv', 'Corvina, Rondinella, Molinara.csv',
       'Chenin Blanc.csv', 'Tempranillo Blend.csv', 'Carmenère.csv']

In [8]:
# Reduced set of varieties: replaces the list with just these five files.
wine_variety_list = [
    'Viognier.csv',
    'Barbera.csv',
    'Port.csv',
    'Grenache.csv',
    'Corvina, Rondinella, Molinara.csv',
]

In [None]:
# Restrict the run to the Gamay file only (the list literal is now closed;
# the missing ']' made this cell a SyntaxError).
wine_variety_list = ['Gamay.csv']

In [9]:
# For every variety file: vectorize its descriptions with `bow`, total each
# term's occurrences, and write the ranked counts to 'wc_<variety file name>'
# in the working directory.
for variety in wine_variety_list:
    df = pd.read_csv('../../data/dfs_variety/'+variety, index_col=0)
    corpus = df['description']

    # fit_transform refits the vocabulary for this variety and builds the count
    # matrix in one pass; fit() followed by transform() would run the spaCy
    # tokenizer over every description twice.
    bags_transform = bow.fit_transform(corpus)
    bags_fit = bow  # fit() returns the vectorizer itself; name kept for other cells

    # Column sums give the total number of occurrences of each vocabulary term.
    sum_words = bags_transform.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in bags_fit.vocabulary_.items()]

    # Most frequent terms first (sorted() is stable, so ties keep vocabulary order).
    words_freq_sort = sorted(words_freq, key=lambda x: x[1], reverse=True)
    word_count = pd.DataFrame(words_freq_sort, columns=['word', 'word count'])
    word_count.to_csv('wc_'+variety)



In [None]:
# Scratch note (file name read in the next cell): wc_Pinot Gris.csv

In [40]:
# Load the Pinot Gris word-count CSV from the working directory, using the
# first column as the index (presumably the file written by the per-variety
# loop as 'wc_' + variety — verify).
pg = pd.read_csv('wc_Pinot Gris.csv', index_col = 0)

In [41]:
# Show the loaded word-count table as this cell's output.
pg

Unnamed: 0,word,word count
0,flavor,854
1,wine,851
2,pear,808
3,fruit,790
4,finish,515
...,...,...
377,fluid,15
378,brighten,15
379,lip,15
380,crisply,15
