In [178]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [179]:
# Load the labelled training data from the working directory and report
# its (rows, columns) shape as the cell output.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.shape

(7613, 5)

In [180]:
# Preview the first five rows to see which columns are available.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [181]:
# Count missing values per column (isnull is the pandas alias of isna).
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [182]:
# Keep only the columns used for modelling ('text' and 'target').
# The drop is done by rebinding instead of inplace=True, and missing labels are
# ignored, so this cell can be re-run without raising KeyError once the
# columns are already gone.
df = df.drop(columns=['id', 'keyword', 'location'], errors='ignore')
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [183]:
# Split raw text and labels into train / held-out parts.
# random_state is fixed so the split is reproducible across runs.
texts = df['text'].values
labels = df['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    random_state=42,
)

In [184]:
# Smoothing strengths tried for both Naive Bayes variants.
ALPHA_GRID = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 3, 4, 5]

# Display name -> estimator plus the hyper-parameter grid to search.
# Grid keys use the '<step>__<param>' form; the step names are the
# lower-cased class names that make_pipeline assigns to each estimator.
models = {
    'Multinomial': {
        'model': MultinomialNB(),
        'params': {'multinomialnb__alpha': list(ALPHA_GRID)},
    },
    'Complement': {
        'model': ComplementNB(),
        'params': {'complementnb__alpha': list(ALPHA_GRID)},
    },
}

In [185]:
# Tune each candidate with a 5-fold grid search over a
# CountVectorizer -> Naive Bayes pipeline, fitted on the training split only.
scores = []           # one summary record per candidate, shown as a table below
best_estimators = {}  # display name -> best pipeline found by the search

for model_name, spec in models.items():
    search = GridSearchCV(
        make_pipeline(CountVectorizer(), spec['model']),
        spec['params'],
        cv=5,
        return_train_score=False,
    )
    search.fit(X_train, y_train)

    best_estimators[model_name] = search.best_estimator_
    scores.append({
        'Model': model_name,
        'Best Params': search.best_params_,
        'Best Score': search.best_score_,
    })

# Summary table: best parameters and mean cross-validated score per model.
pd.DataFrame(scores, columns=['Model', 'Best Params', 'Best Score'])

Unnamed: 0,Model,Best Params,Best Score
0,Multinomial,{'multinomialnb__alpha': 1},0.794709
1,Complement,{'complementnb__alpha': 2},0.791381


In [186]:
# Show the best pipeline selected by the grid search for each model name.
best_estimators

{'Multinomial': Pipeline(steps=[('countvectorizer', CountVectorizer()),
                 ('multinomialnb', MultinomialNB(alpha=1))]),
 'Complement': Pipeline(steps=[('countvectorizer', CountVectorizer()),
                 ('complementnb', ComplementNB(alpha=2))])}

In [187]:
# Score the tuned MultinomialNB pipeline on the held-out split.
multinomial_pipeline = best_estimators['Multinomial']
multinomial_pipeline.score(X_test, y_test)

0.8082983193277311

In [188]:
# Score the tuned ComplementNB pipeline on the held-out split.
complement_pipeline = best_estimators['Complement']
complement_pipeline.score(X_test, y_test)

0.8025210084033614

In [189]:
# Pick the submission model by its cross-validated score instead of a
# hard-coded name: the choice then follows the data if the grids or the
# training set change, and it is not tuned on the held-out split.
# NOTE: the export cell still writes to 'multinomial-output.csv' regardless
# of which model wins here.
best_model_name = max(scores, key=lambda record: record['Best Score'])['Model']
best_model = best_estimators[best_model_name]

In [190]:
# Load the unlabelled test data from the working directory and report
# its (rows, columns) shape as the cell output.
df_test = pd.read_csv(filepath_or_buffer='test.csv')
df_test.shape

(3263, 4)

In [191]:
# Count missing values per column of the test data
# (isnull is the pandas alias of isna).
df_test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [192]:
# Start the output frame with the test ids; predictions are added later.
df_output = pd.DataFrame({'id': df_test['id']})

In [193]:
# Drop the columns the model does not use, leaving only 'text'.
# The drop is done by rebinding instead of inplace=True, and missing labels are
# ignored, so re-running this cell does not raise KeyError.
df_test = df_test.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [196]:
# Predict a label for every test text with the selected pipeline and store
# the result next to the ids, then preview the first rows.
test_predictions = best_model.predict(df_test['text'])
df_output['target'] = test_predictions
df_output.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [199]:
# Write the id/target pairs to disk; index=False keeps the row index out of
# the file so it contains only the two data columns.
df_output.to_csv(path_or_buf='multinomial-output.csv', index=False)