In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn import metrics

In [4]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [5]:
df = pd.read_csv('winemag-data-130k-v2.csv', index_col = 0)

In [6]:
df.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery'],
      dtype='object')

In [7]:
df

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef)
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss


In [8]:
df.shape

(129971, 13)

In [9]:
df['variety'].value_counts().head(40)

Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Sparkling Blend                   2153
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Champagne Blend                   1396
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio             

In [10]:
df['single_variety'] = df['variety'].value_counts()==1

In [11]:
df.variety.value_counts().loc[lambda x: x>500]

Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Sparkling Blend                   2153
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Champagne Blend                   1396
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio             

In [12]:
counts = df['variety'].value_counts()

In [13]:
counts[counts > 500]

Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Sparkling Blend                   2153
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Champagne Blend                   1396
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio             

In [14]:
df_trim = df[df['variety'].isin(counts[counts > 500].index)]

In [15]:
df_trim

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,single_variety
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation,
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss,


In [16]:
df_trim['type'] = 'red'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
df_trim.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,single_variety,type
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,,red
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,,red
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,,red
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,,red
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,,red


In [25]:
df_trim.loc[df.variety == 'White Blend', 'type'] = "white"
df_trim.loc[df.variety == 'Chardonnay', 'type'] = "white"
df_trim.loc[df.variety == 'Riesling', 'type'] = "white"
df_trim.loc[df.variety == 'Sauvignon Blanc', 'type'] = "white"
df_trim.loc[df.variety == 'Sparkling Blend', 'type'] = "white"
df_trim.loc[df.variety == 'Pinot Gris', 'type'] = "white"
df_trim.loc[df.variety == 'Champagne Blend', 'type'] = "white"
df_trim.loc[df.variety == 'Grüner Veltliner ', 'type'] = "white"
df_trim.loc[df.variety == 'Portuguese White', 'type'] = "white"
df_trim.loc[df.variety == 'Bordeaux-style White Blend', 'type'] = "white"
df_trim.loc[df.variety == 'Pinot Grigio', 'type'] = "white"
df_trim.loc[df.variety == 'Gewürztraminer', 'type'] = "white"
df_trim.loc[df.variety == 'Viognier', 'type'] = "white"
df_trim.loc[df.variety == 'Glera', 'type'] = "white"
df_trim.loc[df.variety == 'Chenin Blanc', 'type'] = "white"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [26]:
df_trim.loc[df.variety == 'Rosé', 'type'] = "rose"

In [27]:
df_trim.loc[df.variety == 'Chenin Blanc']

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,single_variety,type
29,US,Clarksburg is becoming a haven for Chenin Blan...,,86,16.0,California,Clarksburg,Central Valley,Virginie Boone,@vboone,Clarksburg Wine Company 2010 Chenin Blanc (Cla...,Chenin Blanc,Clarksburg Wine Company,,white
137,South Africa,"This is great Chenin Blanc, wood fermented but...",Hope Marguerite,90,,Walker Bay,,,Roger Voss,@vossroger,Beaumont 2005 Hope Marguerite Chenin Blanc (Wa...,Chenin Blanc,Beaumont,,white
197,South Africa,"This is a concentrated, fairly full and lush C...",21 Gables,90,23.0,Western Cape,,,Lauren Buzzeo,@laurbuzz,Spier 2014 21 Gables Chenin Blanc (Western Cape),Chenin Blanc,Spier,,white
226,South Africa,"This is a round and rich wine, with notes of w...",,90,20.0,Stellenbosch,,,Lauren Buzzeo,@laurbuzz,Oldenburg 2014 Chenin Blanc (Stellenbosch),Chenin Blanc,Oldenburg,,white
483,South Africa,A glimmer of toast and honey add depth to the ...,,87,12.0,Stellenbosch,,,Lauren Buzzeo,@laurbuzz,Cape View 2011 Chenin Blanc (Stellenbosch),Chenin Blanc,Cape View,,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128992,South Africa,This luminous and unique Chenin Blanc straw wi...,Straw Wine,93,50.0,Stellenbosch,,,Lauren Buzzeo,@laurbuzz,de Trafford 2008 Straw Wine Chenin Blanc (Stel...,Chenin Blanc,de Trafford,,white
129089,US,The abundant minerality of this Mendocino Chen...,,87,15.0,California,Mendocino County,,Virginie Boone,@vboone,Graziano 2011 Chenin Blanc (Mendocino County),Chenin Blanc,Graziano,,white
129207,South Africa,There's a subtle honeyed edge to this otherwis...,,86,10.0,Coastal Region,,,Lauren Buzzeo,@laurbuzz,MAN Vintners 2010 Chenin Blanc (Coastal Region),Chenin Blanc,MAN Vintners,,white
129828,South Africa,"This wine has a slightly quiet and tight nose,...",,86,9.0,Robertson,,,Susan Kostrzewa,@suskostrzewa,Robertson Winery 2008 Chenin Blanc (Robertson),Chenin Blanc,Robertson Winery,,white


In [28]:
df_trim

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,single_variety,type
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,,white
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,,red
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,,white
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,,white
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Anna Lee C. Iijima,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),,white
129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Paul Gregutt,@paulgwine,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation,,red
129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,,white
129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Gris,Domaine Marcel Deiss,,white


In [31]:
corpus = df_trim['description']

In [32]:
corpus

0         Aromas include tropical fruit, broom, brimston...
1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 111797, dtype: object

In [33]:
y = df_trim['variety']

In [34]:
y.value_counts()

Pinot Noir                       13272
Chardonnay                       11753
Cabernet Sauvignon                9472
Red Blend                         8946
Bordeaux-style Red Blend          6915
Riesling                          5189
Sauvignon Blanc                   4967
Syrah                             4142
Rosé                              3564
Merlot                            3102
Nebbiolo                          2804
Zinfandel                         2714
Sangiovese                        2707
Malbec                            2652
Portuguese Red                    2466
White Blend                       2360
Sparkling Blend                   2153
Tempranillo                       1810
Rhône-style Red Blend             1471
Pinot Gris                        1455
Champagne Blend                   1396
Cabernet Franc                    1353
Grüner Veltliner                  1345
Portuguese White                  1159
Bordeaux-style White Blend        1066
Pinot Grigio             

In [36]:
def custom_tokenizer(text):
    '''
    used to filter out unwanted words, punctuation, and so on
    '''
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [37]:
bow = CountVectorizer(tokenizer=custom_tokenizer, 
                      ngram_range=(1, 1), 
                      min_df=0.01, 
                      max_df=0.99)

In [38]:
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, 
                                                              test_size=0.4, 
                                                              train_size=0.6)

In [39]:
X_train = bow.fit_transform(corpus_train)

In [41]:
X_train.shape

(67078, 397)

In [42]:
X_test = bow.transform(corpus_test)

In [43]:
m_nb = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', BernoulliNB())
])

In [44]:
cross_val_score(m_nb, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()

0.5015058364539758

In [36]:
m_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('TfIdf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('NB',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

In [38]:
m_nb.predict(X_test)

array(['Pinot Noir', 'Cabernet Sauvignon', 'Bordeaux-style Red Blend',
       ..., 'Red Blend', 'Portuguese White', 'Chardonnay'], dtype='<U35')

In [40]:
#metrics.accuracy_score(y_test, m_nb.predict(X_test))

In [42]:
# get the NaiveBayes form the pipeline and return marginal probabilites P(X|Y=k)
feat_imp = m_nb['NB'].feature_log_prob_
feat_imp.shape

(637, 393)

In [43]:
feat_imp = pd.DataFrame(feat_imp, columns=bow.get_feature_names()).T
np.exp(feat_imp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,627,628,629,630,631,632,633,634,635,636
accent,0.25,0.024390,0.039024,0.333333,0.2,0.0625,0.333333,0.017857,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.034056,0.333333,0.014493,0.333333,0.333333
acid,0.25,0.097561,0.004878,0.333333,0.2,0.0625,0.333333,0.053571,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.020433,0.333333,0.014493,0.333333,0.333333
acidic,0.25,0.024390,0.004878,0.333333,0.2,0.0625,0.333333,0.028571,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.4,0.006811,0.666667,0.021739,0.333333,0.333333
acidity,0.50,0.121951,0.107317,0.333333,0.2,0.7500,0.333333,0.253571,0.25,0.25,...,0.25,0.333333,0.363636,0.500000,0.2,0.115789,0.333333,0.275362,0.333333,0.333333
add,0.25,0.048780,0.039024,0.333333,0.2,0.0625,0.333333,0.007143,0.25,0.25,...,0.25,0.333333,0.090909,0.071429,0.4,0.028483,0.333333,0.050725,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow,0.25,0.024390,0.004878,0.333333,0.2,0.6250,0.666667,0.035714,0.25,0.25,...,0.25,0.333333,0.318182,0.285714,0.2,0.000619,0.666667,0.007246,0.333333,0.333333
young,0.25,0.024390,0.034146,0.333333,0.2,0.0625,0.333333,0.010714,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.014241,0.333333,0.065217,0.333333,0.333333
z,0.25,0.024390,0.019512,0.333333,0.2,0.1250,0.333333,0.010714,0.25,0.25,...,0.25,0.333333,0.045455,0.071429,0.2,0.005573,0.333333,0.007246,0.333333,0.333333
zest,0.25,0.024390,0.009756,0.333333,0.2,0.5000,0.333333,0.007143,0.25,0.25,...,0.25,0.333333,0.409091,0.071429,0.2,0.001858,0.333333,0.007246,0.333333,0.333333


In [58]:
feat_imp.sum(axis=1).sort_values() #find most common words

riesling     -1546.004733
sangiovese   -1543.282498
verdot       -1528.567210
cab          -1527.819434
malbec       -1515.469469
                 ...     
aroma         -695.482612
finish        -677.110674
fruit         -587.061196
flavor        -517.176112
wine          -478.741285
Length: 393, dtype: float64

In [46]:
np.exp(feat_imp[1].sort_values(ascending=False))

wine      0.682927
red       0.658537
flavor    0.609756
cherry    0.609756
nose      0.560976
            ...   
nicely    0.024390
noir      0.024390
nuance    0.024390
oak       0.024390
accent    0.024390
Name: 1, Length: 393, dtype: float64

In [51]:
m_nb.predict_proba(bow.transform(['cherry flavor nice oak accent'])).max()

0.3052741119504553

In [183]:
m_nbg = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', GaussianNB())
])

In [184]:
cross_val_score(m_nbg, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



nan

In [185]:
m_nbm = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', MultinomialNB())
])

In [186]:
cross_val_score(m_nbm, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



0.4104793436297703

In [214]:
m_nbc = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('NB', CategoricalNB())
])

In [215]:
cross_val_score(m_nbc, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()



nan

In [224]:
m_rf = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('RFC', RandomForestClassifier())
])

In [None]:
cross_val_score(m_rf, X_train, y_train, scoring='accuracy',  cv=4).mean()



In [222]:
m_rfr = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('RFC', RandomForestRegressor())
])

In [223]:
cross_val_score(m_rfr, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()

nan

In [None]:
m = Pipeline([
    ('TfIdf', TfidfTransformer()),
    ('LogReg', LogisticRegression(class_weight='balanced'))
    ])

In [None]:
cross_val_score(m, X_train, y_train, scoring='accuracy', n_jobs=4, cv=4).mean()