Uncomment below cell only if you run the code in Google Colaboratory.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string
import re
from bs4 import BeautifulSoup

In [4]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)
parser = English()

In [5]:
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    tokens = " ".join([i for i in tokens])
    return tokens

def removeurl(raw_text):
    clean_text = re.sub(r'^https?:\/\/.*[\r\n]*', '', raw_text, flags=re.MULTILINE)
    return clean_text

If the notebook is run in Google Colaboratory then the path should be changed accordingly.

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
train["num_words_before_cleaning"] = train["text"].apply(lambda x: len(str(x).split()))
test["num_words_before_cleaning"] = test["text"].apply(lambda x: len(str(x).split()))

print("Mean number of words in train set: " + str(train["num_words_before_cleaning"].mean()))
print("Mean number of words in test set: " + str(test["num_words_before_cleaning"].mean()))
print("Mean number of words in train set for target 0: " + str(train[train.target.eq(0)]["num_words_before_cleaning"].mean()))
print("Mean number of words in train set for target 1: " + str(train[train.target.eq(1)]["num_words_before_cleaning"].mean()))

Mean number of words in train set: 14.903585971364771
Mean number of words in test set: 14.965369292062519
Mean number of words in train set for target 0: 14.704744357438969
Mean number of words in train set for target 1: 15.167532864567411


In [8]:
train["num_unique_words_before_cleaning"] = train["text"].apply(lambda x: len(set(str(x).split())))
test["num_unique_words_before_cleaning"] = test["text"].apply(lambda x: len(set(str(x).split())))

print("Mean number of unique words in train set: " + str(train["num_unique_words_before_cleaning"].mean()))
print("Mean number of unique words in test set: " + str(test["num_unique_words_before_cleaning"].mean()))
print("Mean number of unique words in train set for target 0: " + str(train[train.target.eq(0)]["num_unique_words_before_cleaning"].mean()))
print("Mean number of unique words in train set for target 1: " + str(train[train.target.eq(1)]["num_unique_words_before_cleaning"].mean()))

Mean number of unique words in train set: 14.340732956784448
Mean number of unique words in test set: 14.407293901317805
Mean number of unique words in train set for target 0: 14.09649930907416
Mean number of unique words in train set for target 1: 14.664934270865178


In [9]:
train["num_chars_before_cleaning"] = train["text"].apply(lambda x: len(str(x)))
test["num_chars_before_cleaning"] = test["text"].apply(lambda x: len(str(x)))

print("Mean number of characters in train set: " + str(train["num_chars_before_cleaning"].mean()))
print("Mean number of characters in test set: " + str(test["num_chars_before_cleaning"].mean()))
print("Mean number of characters in train set for target 0: " + str(train[train.target.eq(0)]["num_chars_before_cleaning"].mean()))
print("Mean number of characters in train set fir target 1: " + str(train[train.target.eq(1)]["num_chars_before_cleaning"].mean()))

Mean number of characters in train set: 101.03743596479706
Mean number of characters in test set: 102.10818265399939
Mean number of characters in train set for target 0: 95.70681713496084
Mean number of characters in train set fir target 1: 108.11342097217977


In [10]:
train["num_stopwords"] = train["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))
test["num_stopwords"] = test["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords]))

print("Mean number of stopwords in train set: " + str(train["num_stopwords"].mean()))
print("Mean number of stopwords in test set: " + str(test["num_stopwords"].mean()))
print("Mean number of stopwords in train set for target 0: " + str(train[train.target.eq(0)]["num_stopwords"].mean()))
print("Mean number of stopwords in train set for target 1: " + str(train[train.target.eq(1)]["num_stopwords"].mean()))

Mean number of stopwords in train set: 5.000131354262446
Mean number of stopwords in test set: 4.9567882316886305
Mean number of stopwords in train set for target 0: 5.328189774297559
Mean number of stopwords in train set for target 1: 4.564659125649649


In [11]:
train["num_punctuations"] =train['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test["num_punctuations"] =test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

print("Mean number of punctuations in train set: " + str(train["num_punctuations"].mean()))
print("Mean number of punctuations in test set: " + str(test["num_punctuations"].mean()))
print("Mean number of punctuations in train set for target 0: " + str(train[train.target.eq(0)]["num_punctuations"].mean()))
print("Mean number of punctuations in train set for target 1: " + str(train[train.target.eq(1)]["num_punctuations"].mean()))

Mean number of punctuations in train set: 6.839485091291213
Mean number of punctuations in test set: 6.9506589028501375
Mean number of punctuations in train set for target 0: 6.30907415937356
Mean number of punctuations in train set for target 1: 7.54356465912565


In [12]:
train["mean_word_len_before_cleaning"] = train["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len_before_cleaning"] = test["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

print("Mean word length in train set: " + str(train["mean_word_len_before_cleaning"].mean()))
print("Mean word length in test set: " + str(test["mean_word_len_before_cleaning"].mean()))
print("Mean word length in train set for target 0: " + str(train[train.target.eq(0)]["mean_word_len_before_cleaning"].mean()))
print("Mean word length in train set fir target 1: " + str(train[train.target.eq(1)]["mean_word_len_before_cleaning"].mean()))

Mean word length in train set: 6.128493903251796
Mean word length in test set: 6.183041298176761
Mean word length in train set for target 0: 5.871324779540672
Mean word length in train set fir target 1: 6.469866063188727


In [13]:
#removing url tags
train['text'] = train['text'].apply(lambda x:removeurl(x))
test['text'] = test['text'].apply(lambda x:removeurl(x))

# data cleaning 
train['text'] = train['text'].apply(lambda x:spacy_tokenizer(x))
test['text'] = test['text'].apply(lambda x:spacy_tokenizer(x))

In [14]:
print(train)

         id keyword location  \
0         1     NaN      NaN   
1         4     NaN      NaN   
2         5     NaN      NaN   
3         6     NaN      NaN   
4         7     NaN      NaN   
...     ...     ...      ...   
7608  10869     NaN      NaN   
7609  10870     NaN      NaN   
7610  10871     NaN      NaN   
7611  10872     NaN      NaN   
7612  10873     NaN      NaN   

                                                   text  target  \
0                 deeds reason earthquake allah forgive       1   
1                 forest fire near la ronge sask canada       1   
2     residents asked shelter place notified officer...       1   
3     13,000 people receive wildfires evacuation ord...       1   
4     got sent photo ruby alaska smoke wildfires pou...       1   
...                                                 ...     ...   
7608  giant cranes holding bridge collapse nearby ho...       1   
7609  @aria_ahrary @thetawniest control wild fires c...       1   
7610  m1.94 

# Data exploration

Calculate the maximum number of tokens that a text can have in both train and test sets.

In [15]:
def calculate_nr_words(text):
    return len(text.split(" "))

In [16]:
print(max(train['text'].apply(calculate_nr_words)))
print(max(test['text'].apply(calculate_nr_words)))

25
23


Analyze the target distribution.

In [17]:
x = train['target'].value_counts()
non_real_disaster_tweets = x[0] * 100 / len(train['target'])
real_disaster_tweets = x[1] * 100 / len(train['target'])
print("%1.f %% of the data represents tweets that are not about real disasters" % (non_real_disaster_tweets))
print("%1.f %% of the data represents tweets that are about real disasters" % real_disaster_tweets)

57 % of the data represents tweets that are not about real disasters
43 % of the data represents tweets that are about real disasters


Analyze the first rows for each target type.

In [18]:
train[train.target.eq(0)]['text'].head(15)

15                 man
16         love fruits
17       summer lovely
18            car fast
19     goooooooaaaaaal
20     ridiculous ....
21      london cool ;)
22         love skiing
23       wonderful day
24            looooool
25    way ... eat shit
26            nyc week
27     love girlfriend
28           cooool :)
29          like pasta
Name: text, dtype: object

In [19]:
train[train.target.eq(1)]['text'].head(15)

0                 deeds reason earthquake allah forgive
1                 forest fire near la ronge sask canada
2     residents asked shelter place notified officer...
3     13,000 people receive wildfires evacuation ord...
4     got sent photo ruby alaska smoke wildfires pou...
5     rockyfire update california hwy 20 closed dire...
6     flood disaster heavy rain causes flash floodin...
7                                   hill fire woods ...
8        emergency evacuation happening building street
9                        afraid tornado coming area ...
10                            people died heat wave far
11    haha south tampa getting flooded hah- wait sec...
12    raining flooding florida tampabay tampa 18 19 ...
13                      flood bago myanmar arrived bago
14        damage school bus 80 multi car crash breaking
Name: text, dtype: object

In the above tables, I can see that the tweets related to real disasters contain more words than the tweets not related to real disasters after cleaning.

In [20]:
train[train.target.eq(0)]['text'].describe()

count     4342
unique    4278
top           
freq        26
Name: text, dtype: object

In [21]:
train[train.target.eq(0)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4342 entries, 15 to 7593
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                4342 non-null   int64  
 1   keyword                           4323 non-null   object 
 2   location                          2884 non-null   object 
 3   text                              4342 non-null   object 
 4   target                            4342 non-null   int64  
 5   num_words_before_cleaning         4342 non-null   int64  
 6   num_unique_words_before_cleaning  4342 non-null   int64  
 7   num_chars_before_cleaning         4342 non-null   int64  
 8   num_stopwords                     4342 non-null   int64  
 9   num_punctuations                  4342 non-null   int64  
 10  mean_word_len_before_cleaning     4342 non-null   float64
dtypes: float64(1), int64(7), object(3)
memory usage: 407.1+ KB


In [22]:
train[train.target.eq(1)]['text'].describe()

count     3271
unique    3182
top           
freq        24
Name: text, dtype: object

In [23]:
train[train.target.eq(1)].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3271 entries, 0 to 7612
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                3271 non-null   int64  
 1   keyword                           3229 non-null   object 
 2   location                          2196 non-null   object 
 3   text                              3271 non-null   object 
 4   target                            3271 non-null   int64  
 5   num_words_before_cleaning         3271 non-null   int64  
 6   num_unique_words_before_cleaning  3271 non-null   int64  
 7   num_chars_before_cleaning         3271 non-null   int64  
 8   num_stopwords                     3271 non-null   int64  
 9   num_punctuations                  3271 non-null   int64  
 10  mean_word_len_before_cleaning     3271 non-null   float64
dtypes: float64(1), int64(7), object(3)
memory usage: 306.7+ KB


In [24]:
train.describe()

Unnamed: 0,id,target,num_words_before_cleaning,num_unique_words_before_cleaning,num_chars_before_cleaning,num_stopwords,num_punctuations,mean_word_len_before_cleaning
count,7613.0,7613.0,7613.0,7613.0,7613.0,7613.0,7613.0,7613.0
mean,5441.934848,0.42966,14.903586,14.340733,101.037436,5.000131,6.839485,6.128494
std,3137.11609,0.49506,5.732604,5.27716,33.781325,3.703633,4.608758,1.675464
min,1.0,0.0,1.0,1.0,7.0,0.0,0.0,2.25
25%,2734.0,0.0,11.0,11.0,78.0,2.0,3.0,4.875
50%,5408.0,0.0,15.0,14.0,107.0,4.0,6.0,5.928571
75%,8146.0,1.0,19.0,18.0,133.0,7.0,10.0,7.058824
max,10873.0,1.0,31.0,29.0,157.0,20.0,61.0,19.333333


In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                7613 non-null   int64  
 1   keyword                           7552 non-null   object 
 2   location                          5080 non-null   object 
 3   text                              7613 non-null   object 
 4   target                            7613 non-null   int64  
 5   num_words_before_cleaning         7613 non-null   int64  
 6   num_unique_words_before_cleaning  7613 non-null   int64  
 7   num_chars_before_cleaning         7613 non-null   int64  
 8   num_stopwords                     7613 non-null   int64  
 9   num_punctuations                  7613 non-null   int64  
 10  mean_word_len_before_cleaning     7613 non-null   float64
dtypes: float64(1), int64(7), object(3)
memory usage: 654.4+ KB


In [26]:
train["num_words"] = train["text"].apply(lambda x: len(str(x).split()))
test["num_words"] = test["text"].apply(lambda x: len(str(x).split()))

print("Mean number of words in train set: " + str(train["num_words"].mean()))
print("Mean number of words in test set: " + str(test["num_words"].mean()))
print("Mean number of words in train set for target 0: " + str(train[train.target.eq(0)]["num_words"].mean()))
print("Mean number of words in train set fir target 1: " + str(train[train.target.eq(1)]["num_words"].mean()))

Mean number of words in train set: 9.49073952449757
Mean number of words in test set: 9.615078148942692
Mean number of words in train set for target 0: 8.808613542146476
Mean number of words in train set fir target 1: 10.396209110363802


In [27]:
train["num_unique_words"] = train["text"].apply(lambda x: len(set(str(x).split())))
test["num_unique_words"] = test["text"].apply(lambda x: len(set(str(x).split())))

print("Mean number of unique words in train set: " + str(train["num_unique_words"].mean()))
print("Mean number of unique words in test set: " + str(test["num_unique_words"].mean()))
print("Mean number of unique words in train set for target 0: " + str(train[train.target.eq(0)]["num_unique_words"].mean()))
print("Mean number of unique words in train set fir target 1: " + str(train[train.target.eq(1)]["num_unique_words"].mean()))

Mean number of unique words in train set: 9.255484040457112
Mean number of unique words in test set: 9.385841250383082
Mean number of unique words in train set for target 0: 8.620451404882543
Mean number of unique words in train set fir target 1: 10.098440843778661


In [28]:
train["mean_word_len"] = train["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len"] = test["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

print("Mean word length in train set: " + str(train["mean_word_len"].mean()))
print("Mean word length in test set: " + str(test["mean_word_len"].mean()))
print("Mean word length in train set for target 0: " + str(train[train.target.eq(0)]["mean_word_len"].mean()))
print("Mean word length in train set fir target 1: " + str(train[train.target.eq(1)]["mean_word_len"].mean()))

  out=out, **kwargs)


Mean word length in train set: 7.266202186609272
Mean word length in test set: 7.3111581972149695
Mean word length in train set for target 0: 7.078760130626297
Mean word length in train set fir target 1: 7.51535522437414


In [29]:
train["num_chars"] = train["text"].apply(lambda x: len(str(x)))
test["num_chars"] = test["text"].apply(lambda x: len(str(x)))

print("Mean number of characters in train set: " + str(train["num_chars"].mean()))
print("Mean number of characters in test set: " + str(test["num_chars"].mean()))
print("Mean number of characters in train set for target 0: " + str(train[train.target.eq(0)]["num_chars"].mean()))
print("Mean number of characters in train set fir target 1: " + str(train[train.target.eq(1)]["num_chars"].mean()))

Mean number of characters in train set: 75.9762248784973
Mean number of characters in test set: 77.26417407293901
Mean number of characters in train set for target 0: 68.87701520036849
Mean number of characters in train set fir target 1: 85.39987771323754
