In [1]:
import numpy as np
import pandas as pd
import time
import re
import string
import nltk
from nltk.corpus import stopwords

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


In [2]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [3]:
%%time
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

# Applying the cleaning function to both test and training datasets
train['text'] = train['text'].apply(lambda x: clean_text(x))
test['text'] = test['text'].apply(lambda x: clean_text(x))

CPU times: user 650 ms, sys: 1.13 ms, total: 651 ms
Wall time: 654 ms


In [4]:
%%time
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train['text'] = train['text'].apply(lambda x: tokenizer.tokenize(x))
test['text'] = test['text'].apply(lambda x: tokenizer.tokenize(x))

CPU times: user 85.7 ms, sys: 5.11 ms, total: 90.9 ms
Wall time: 89.7 ms


In [5]:
%%time
def remove_stopwords(text):
    """
    Removing stopwords belonging to english language
    
    """
    words = [w for w in text if w not in stopwords.words('english')]
    return words


train['text'] = train['text'].apply(lambda x : remove_stopwords(x))
test['text'] = test['text'].apply(lambda x : remove_stopwords(x))

CPU times: user 21.8 s, sys: 2.04 s, total: 23.8 s
Wall time: 23.8 s


In [6]:
%%time
def combine_text(list_of_text):
    combined_text = ' '.join(list_of_text)
    return combined_text

train['text'] = train['text'].apply(lambda x : combine_text(x))
test['text'] = test['text'].apply(lambda x : combine_text(x))

CPU times: user 14.4 ms, sys: 1.81 ms, total: 16.2 ms
Wall time: 15.8 ms


In [7]:
lemmatizer=nltk.stem.WordNetLemmatizer()

train['text'] = train['text'].apply(lambda x: lemmatizer.lemmatize(x))
test['text'] = test['text'].apply(lambda x: lemmatizer.lemmatize(x))

In [8]:
X_train = train['text']
y_train = train['target']

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 

In [10]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2), norm='l2')

svc_clf = Pipeline([('tfidf', vectorizer),
                      ('svc_clf', LinearSVC())])


lr_clf = Pipeline([('tfidf', vectorizer),
                      ('lr_clf', LogisticRegression())])

In [11]:
svc_clf.fit(X_train,y_train)
lr_clf.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=None,
                                 min_df=2, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr_clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scal

In [12]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,happened terrible car crash
1,2,,,heard earthquake different cities stay safe ev...
2,3,,,forest fire spot pond geese fleeing across str...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills china taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety fasten...
3259,10865,,,storm ri worse last hurricane hardest hit yard...
3260,10868,,,green line derailment chicago
3261,10874,,,meg issues hazardous weather outlook hwo


In [13]:
svc_pred = svc_clf.predict(test['text'])
lr_pred = lr_clf.predict(test['text'])

In [14]:
submission_svc_pred=pd.DataFrame({"id":sub['id'],"target":svc_pred})
submission_lr_pred=pd.DataFrame({"id":sub['id'],"target":lr_pred})

In [15]:
submission_svc_pred.to_csv("submission_svc.csv",index=False)
submission_lr_pred.to_csv("submission_lr.csv",index=False)