In [1]:
import pandas as pd

df = pd.read_csv('train.csv', header='infer')

In [2]:
df.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [3]:
df = df.drop(['id', 'keyword', 'location'], axis=1)

In [4]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

from nltk.stem.porter import PorterStemmer
import re

porter = PorterStemmer()

def clean(text):
    text = re.sub('<[^>]*>', '', text)
    emots = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text)
    text = (re.sub('[\W]+', ' ', text.lower())+' '.join(
    emots).replace('-', ''))
    return text

def tokenizer1(text):
    return clean(text).split()

def tokens(text):
    return [porter.stem(w) for w in text.split()]

def tokenizer2(text):
    return [w for w in tokens(text) if w not in stop]


In [5]:
import numpy as np

#df['text'] = df['text'].apply(clean)
#np.random.seed(0)
#df = df.reindex(np.random.permutation(df.index))
#index_coup = df.shape[0] - int(df.shape[0]*0.3)
#X_train, y_train = df.loc[: index_coup, 'text'].values\
#, df.loc[:index_coup, 'target'].values
#X_test, y_test = df.loc[index_coup:, 'text'].values\
#, df.loc[index_coup:, 'target'].values

X, y = df['text'], df['target']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=1,
stratify=y)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False,
                       preprocessor=None)

param_grid = [{'vect__ngram_range':[(1, 1)], 
                                   'vect__stop_words':
                                   [stop, None],
                                   'vect__tokenizer':
                                   [tokenizer1, tokenizer2],
                                    'clf__penalty':
                                    ['l1', 'l2'],
                                    'clf__C':[1.0, 10., 100.0]},
              {'vect__ngram_range':[(1, 1)], 
                                   'vect__stop_words':
                                   [stop, None],
                                   'vect__tokenizer':
                                   [tokenizer1, tokenizer2],
                                    'vect__use_idf':[False],
                                    'vect__norm':[None],
                                    'clf__penalty':
                                    ['l1', 'l2'],
                                    'clf__C':[1.0, 10., 100.0]},
              {'vect__ngram_range':[(2, 2)], 
                                   'vect__stop_words':
                                   [stop, None],
                                   'vect__tokenizer':
                                   [tokenizer1, tokenizer2],
                                    'vect__use_idf':[False],
                                    'vect__norm':[None],
                                    'clf__penalty':
                                    ['l1', 'l2'],
                                    'clf__C':[1.0, 10., 100.0]}
                                                         ]

lr_tfidf = Pipeline([('vect', tfidf),
                    ('clf', LogisticRegression(random_state=0,
                                    solver='liblinear'))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring='accuracy', cv=5, verbose=2,
                          n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  3.0min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [7]:
gs_lr_tfidf.best_params_

{'clf__C': 1.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 2),
 'vect__norm': None,
 'vect__stop_words': ['i',
  'me',
  'my',
  'myself',
  'we',
  'our',
  'ours',
  'ourselves',
  'you',
  "you're",
  "you've",
  "you'll",
  "you'd",
  'your',
  'yours',
  'yourself',
  'yourselves',
  'he',
  'him',
  'his',
  'himself',
  'she',
  "she's",
  'her',
  'hers',
  'herself',
  'it',
  "it's",
  'its',
  'itself',
  'they',
  'them',
  'their',
  'theirs',
  'themselves',
  'what',
  'which',
  'who',
  'whom',
  'this',
  'that',
  "that'll",
  'these',
  'those',
  'am',
  'is',
  'are',
  'was',
  'were',
  'be',
  'been',
  'being',
  'have',
  'has',
  'had',
  'having',
  'do',
  'does',
  'did',
  'doing',
  'a',
  'an',
  'the',
  'and',
  'but',
  'if',
  'or',
  'because',
  'as',
  'until',
  'while',
  'of',
  'at',
  'by',
  'for',
  'with',
  'about',
  'against',
  'between',
  'into',
  'through',
  'during',
  'before',
  'after',
  'above',
  'below',
  'to',
  

In [8]:
clf = gs_lr_tfidf.best_estimator_
print('Test set accuracy : %.3f' % clf.score(X_test, y_test))

Test set accuracy : 0.781


In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import precision_score

y_pred = clf.predict(X_test)
print('Precision : %.3f' % precision_score(y_true=y_test,
                                    y_pred=y_pred))
print('Recall : %.3f' % recall_score(y_true=y_test,
                                 y_pred=y_pred))
print('F1 : %.3f' % f1_score(y_true=y_test, y_pred=y_pred))

Precision : 0.821
Recall : 0.627
F1 : 0.711


In [302]:
test = pd.read_csv('test.csv', header='infer')

In [303]:
test_id = test['id']
test = test.drop(['id', 'keyword', 'location'], axis=1)

In [304]:
predict_test = [[idx,clf.predict([text])[0]] for idx, text in zip(test_id, test['text'])]

predict_test_fram = pd.DataFrame(predict_test, columns=['id', 'target'])

predict_test_fram.to_csv('predict_test_fram.csv', index=None)

In [182]:
#from sklearn.metrics import confusion_matrix
#from sklearn.metrics import recall_score, f1_score
#from sklearn.metrics import precision_score

y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[884,  85],
       [159, 598]])

In [287]:
print(f1_score(y_true=y_test, y_pred=y_test_pred))

print(precision_score(y_true=y_test, y_pred=y_test_pred))

print(recall_score(y_true=y_test, y_pred=y_test_pred))

0.7503242542153049
0.7990331491712708
0.7072127139364304
