In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import feature_extraction, linear_model, model_selection, preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Tweet Preprocessing

Since we are dealing with tweets in this competation, we need to do specific tweet text cleaning along with normal text pre-processing. A tweet may contains
* URL's
* Mentions
* Hashtags
* Emojis
* Specific words etc.

To clean the tweet, we can use a python library tweet-preprocessor instead of writing the cleaning logic ourself.

# Reading Datasets

In [2]:
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df = pd.read_csv("../input/nlp-getting-started/test.csv") 

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [6]:
train_df.count()

id          7613
keyword     7552
location    5080
text        7613
target      7613
dtype: int64

# Droppping duplicates and NaN from the DataFrame.

In [7]:
train_df = train_df.dropna()
train_df = train_df.drop_duplicates()

In [8]:
train_df.count()

id          5080
keyword     5080
location    5080
text        5080
target      5080
dtype: int64

In [9]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


# Applying Tweet Processing

Apply tweet preprocessing first. Define a process function and use pandas to apply it on each value of 'text'

In [10]:
import preprocessor as p

def preprocess_tweet(row):
    text = row['text']
    text = p.clean(text)
    return text

In [11]:
train_df['text'] = train_df.apply(preprocess_tweet, axis=1)

### Tweet has been cleaned to normal text.

In [12]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,Wholesale Markets ablaze,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy.,0
33,50,ablaze,AFRICA,: Breaking news:Nigeria flag set ablaze in Aba.,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


# Normal Preprocessing

### Now We can apply bormal text preprocessing like
* Lowercasing
* Punctuation Removal
* Replace Extra white Spaces
* Stopwords removal

In [13]:
from gensim.parsing.preprocessing import remove_stopwords

def stopword_removal(row):
    text = row['text']
    text = remove_stopwords(text)
    return text

In [14]:
train_df['text'] = train_df.apply(stopword_removal, axis = 1)

In [15]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,Wholesale Markets ablaze,1
32,49,ablaze,Est. September 2012 - Bristol,We try bring heavy.,0
33,50,ablaze,AFRICA,: Breaking news:Nigeria flag set ablaze Aba.,1
34,52,ablaze,"Philadelphia, PA",Crying more! Set ablaze,0
35,53,ablaze,"London, UK",On plus LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE,0


## Remove Extra white spaces, punctuation and apply lower casing

In [16]:
train_df['text'] = train_df['text'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')

  """Entry point for launching an IPython kernel.


In [17]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,wholesale markets ablaze,1
32,49,ablaze,Est. September 2012 - Bristol,we try bring heavy,0
33,50,ablaze,AFRICA,breaking news nigeria flag set ablaze aba,1
34,52,ablaze,"Philadelphia, PA",crying more set ablaze,0
35,53,ablaze,"London, UK",on plus look at the sky last night it was ablaze,0


### Now input tweet has been pre-processed and lets find features.

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = train_df['text']
vectorizer = TfidfVectorizer(stop_words='english')

# Learn vocabulary from sentences. 
vectorizer.fit(tweets)

# Get vocabularies.
vectorizer.vocabulary_

{'wholesale': 10137,
 'markets': 5661,
 'ablaze': 157,
 'try': 9567,
 'bring': 1233,
 'heavy': 4241,
 'breaking': 1210,
 'news': 6246,
 'nigeria': 6264,
 'flag': 3504,
 'set': 8187,
 'aba': 143,
 'crying': 2184,
 'plus': 6976,
 'look': 5464,
 'sky': 8421,
 'night': 6270,
 've': 9839,
 'built': 1300,
 'hype': 4492,
 'new': 6238,
 'acquisitions': 197,
 'doubt': 2720,
 'epl': 3079,
 'season': 8097,
 'inec': 4660,
 'office': 6435,
 'abia': 154,
 'lord': 5478,
 'check': 1622,
 'awesome': 741,
 'time': 9334,
 'visiting': 9918,
 'cfc': 1564,
 'head': 4206,
 'ancop': 448,
 'site': 8389,
 'thanks': 9247,
 'tita': 9355,
 'vida': 9882,
 'taking': 9096,
 'care': 1466,
 'west': 10094,
 'burned': 1322,
 'thousands': 9289,
 'wildfires': 10155,
 'california': 1395,
 'retainers': 7690,
 'weird': 10084,
 'better': 972,
 'wear': 10057,
 'single': 8369,
 'year': 10344,
 'deputies': 2452,
 'man': 5607,
 'shot': 8293,
 'brighton': 1231,
 'home': 4363,
 'wife': 10149,
 'years': 10345,
 'jail': 4868,
 'settin

In [19]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

stopwords = stopwords.words('english')

print(stopwords)

count_vector = CountVectorizer(token_pattern = r'\w{1,}', ngram_range = (1, 2), stop_words = stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [20]:
from sklearn.model_selection import train_test_split
X = train_df.text
y = train_df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = LogisticRegression()
pipe = Pipeline([
    ('count_vector', CountVectorizer()),
    ('clf', LogisticRegression())
])
pipe.fit(X_train, y_train)

Pipeline(steps=[('count_vector', CountVectorizer()),
                ('clf', LogisticRegression())])

In [22]:
from sklearn import metrics
predicted = pipe.predict(X_test)

In [23]:
print("accuracy :", metrics.accuracy_score(predicted, y_test))

accuracy : 0.7874015748031497


# Storing Result

In [24]:
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submission['target'] = pipe.predict(test_df.text)
submission.to_csv("submission.csv", index = False)

## Final Result

In [25]:
submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
