In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('train.csv', header='infer')

In [5]:
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import model_selection,\
linear_model, naive_bayes, metrics
import re


stop = stopwords.words('english')
stm = PorterStemmer()

def cleaner(txt):
    #Cleaning data
    txt = re.sub('<[^>]*>', '', txt)
    emots = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          txt)
    txt = (re.sub('[\W]+', ' ', txt.lower())+' '.join(
    emots).replace('-', ''))
    #Stemmatization and lemmatization
    L = [stm.stem(w) for w in txt.split() if w not in stop]
    return ' '.join([Word(mot).lemmatize() for mot in L])



In [6]:
df['text'] = df['text'].apply(cleaner)

In [7]:
df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [8]:
df = df.drop(['id', 'keyword', 'location'], axis=1)

In [9]:
X, y = df['text'], df['target']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=1)

In [11]:
#TFIDF 5000 features générées au maximun
tfidf = TfidfVectorizer(analyzer='word',
                token_pattern=r'\w{1,}', max_features=5000)
tfidf.fit(X)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
X_train_tfidf.data

array([0.51612381, 0.21437958, 0.05893993, ..., 0.07588667, 0.26761377,
       0.31747461])

In [13]:
def clf_training(classifier, train_X, train_y, test_X,
                test_y, is_neaural_net=False):
    
    classifier.fit(train_X, train_y)
    y_pred = classifier.predict(test_X)
    return {'Accuracy':metrics.accuracy_score(y_true=test_y,
                                              y_pred=y_pred),
           'Precision':metrics.precision_score(y_true=test_y,
                                              y_pred=y_pred),
           'Recall':metrics.recall_score(y_true=test_y,
                                              y_pred=y_pred),
           'F1':metrics.f1_score(y_true=test_y,
                                              y_pred=y_pred)}

In [58]:
from sklearn.svm import SVC

naivBay = naive_bayes.MultinomialNB(alpha=0.6)
lr = linear_model.LogisticRegression(C=1., solver='liblinear',
                                    random_state=1,
                                     penalty='l2')
svm = SVC()

predNaivBay = clf_training(naivBay, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)

predLR = clf_training(lr, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)

predSVM = clf_training(svm, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)

print('Naive Baye : \n', predNaivBay, '\n')
print('Logistic :\n', predLR, '\n')
print('SVM : \n', predSVM, '\n')

Naive Baye : 
 {'Accuracy': 0.8010505581089954, 'Precision': 0.8256029684601113, 'Recall': 0.6804281345565749, 'F1': 0.7460184409052807} 

Logistic :
 {'Accuracy': 0.8049901510177282, 'Precision': 0.8426103646833013, 'Recall': 0.6712538226299695, 'F1': 0.7472340425531916} 

SVM : 
 {'Accuracy': 0.804333552199606, 'Precision': 0.8662551440329218, 'Recall': 0.6437308868501529, 'F1': 0.7385964912280701} 



In [52]:
from sklearn.ensemble import AdaBoostClassifier

adaNaiv = AdaBoostClassifier(base_estimator=naivBay,
                         n_estimators=5000, learning_rate=0.1,
                         random_state=1)
adaLR = AdaBoostClassifier(base_estimator=lr,
                         n_estimators=5000, learning_rate=0.1,
                         random_state=1)
adaSVM = AdaBoostClassifier(base_estimator=svm,
                         n_estimators=5000, learning_rate=0.1,
                         random_state=1)

predAdaNaiv = clf_training(adaNaiv, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)
predAdaLR = clf_training(adaLR, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)
predAdaSVM = clf_training(adaLR, X_train_tfidf,
                          y_train, X_test_tfidf, y_test)

print(' AdaBoost Naive Baye : \n', predAdaNaiv, '\n')
print(' AdaBoost Logistic : \n', predAdaLR, '\n')
print(' AdaBoost SVM : \n', predAdaSVM, '\n')

 AdaBoost Naive Baye : 
 {'Accuracy': 0.8023637557452397, 'Precision': 0.8168761220825853, 'Recall': 0.6957186544342507, 'F1': 0.7514450867052023} 

 AdaBoost Logistic : 
 {'Accuracy': 0.7715036112934996, 'Precision': 0.84, 'Recall': 0.5779816513761468, 'F1': 0.6847826086956522} 

 AdaBoost SVL : 
 {'Accuracy': 0.7715036112934996, 'Precision': 0.84, 'Recall': 0.5779816513761468, 'F1': 0.6847826086956522} 



In [19]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=10,
                               random_state=123,
                               learning_method='batch')

X_topic = lda.fit_transform(X_train_tfidf)
lda.components_.shape

(10, 5000)

In [20]:
n_top_words = 5
feature_names = tfidf.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print('Topic %d:'% (topic_idx +1))
    print(" ".join([feature_names[i] for i in topic.argsort()\
                   [:-n_top_words-1:-1]]))

Topic 1:
http co migrant drown rescuer
Topic 2:
co http wildfir fatal attack
Topic 3:
http co bag bodi suicid
Topic 4:
co http reddit quarantin content
Topic 5:
co http fire bomb boy
Topic 6:
co http disast obama typhoon
Topic 7:
http co scream im crush
Topic 8:
co http wreck thunder one
Topic 9:
http co crash mh370 confirm
Topic 10:
co http loud trap wind


In [44]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe_naivBay = make_pipeline(TfidfVectorizer(analyzer='word',
                token_pattern=r'\w{1,}', max_features=5000),
    #StandardScaler(),
                          #LatentDirichletAllocation(
                           #   n_components=100,
                            #   random_state=123,
                             #  learning_method='batch'),
                          naive_bayes.MultinomialNB(alpha=0.2)
                             #LogisticRegression(random_state=1,
                              #                 solver='lbfgs')
                            )

pipe_naivBay.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=5000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False, token_pattern='\\w{1,}',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('multinomialnb',
                 MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))],
         verbose=False)

In [45]:
y_pred = pipe_naivBay.predict(X_test)

In [46]:
print('Test Accuracy : %.3f' % pipe_naivBay.score(
X_test, y_test))

Test Accuracy : 0.800


In [47]:
from sklearn.metrics import f1_score

print('F1 %.3f '% f1_score(y_test, y_pred))

F1 0.747 


In [48]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_naivBay, X=X_train,
               y=y_train, cv=10, n_jobs=-1)

print('CV accuracy scores : %s' % scores)

CV accuracy scores : [0.7816092  0.79967159 0.77175698 0.80788177 0.80131363 0.80131363
 0.82101806 0.81444992 0.79802956 0.7865353 ]


In [49]:
import numpy as np

print('CV accuracy : %.3f +/- %.3f' % (np.mean(scores),
                                      np.std(scores)))

CV accuracy : 0.798 +/- 0.014


In [68]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False,
                       preprocessor=None)

param_grid = [{'clf__alpha':np.linspace(0.1, 1, 10)}]

nb_tfidf = Pipeline([('vect', tfidf),
                    ('clf', MultinomialNB())])

gs_nb_tfidf = GridSearchCV(nb_tfidf, param_grid,
                          scoring='accuracy', cv=10, verbose=2,
                          n_jobs=-1)

gs_nb_tfidf.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.0s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        

In [69]:
print(gs_nb_tfidf.best_params_)

{'clf__alpha': 0.6}


In [70]:
nb = gs_nb_tfidf.best_estimator_

print('F1 : %.3f' % nb.score(X_test, y_test))
y_pred = nb.predict(X_test)
print(f1_score(y_true=y_test, y_pred=y_pred))

F1 : 0.804
0.74466268146883


In [65]:
test = pd.read_csv('test.csv', header='infer')

test_id = test['id']
test = test.drop(['id', 'keyword', 'location'], axis=1)
predict_test = [[idx,nb.predict([text])[0]] for idx, \
                text in zip(test_id, test['text'])]

predict_test_fram = pd.DataFrame(predict_test,
                                 columns=['id', 'target'])

predict_test_fram.to_csv('predict_test_fram.csv', index=None)