# Disaster Tweets Analysis

### 1 &ensp; Setup

In [40]:
# Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate

from xgboost import XGBClassifier

# Sentence Encoding
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

### 2 &ensp; EDA

Each sample in the train and test set has the following information:

* The `text` of a tweet
* A `keyword` from that tweet
* The `location` the tweet was sent from

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

print(f"train: {train_data.shape}, test: {test_data.shape}")

train: (7613, 5), test: (3263, 4)


In [17]:
train_data['target'].sum(), len(train_data['target']) - train_data['target'].sum()

(3271, 4342)

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [6]:
100 * train_data.isna().sum() / len(train_data)

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64

In [7]:
text_length = train_data['text'].apply(len)
text_length.describe()

count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: text, dtype: float64

In [8]:
keywords = train_data['keyword'].value_counts()
print(keywords)

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64


In [9]:
locations = train_data['location'].value_counts()
print(locations)

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64


### 3 &ensp; Preprocessing

In [10]:
sbert = SentenceTransformer('all-MiniLM-L6-v2')

X_train = sbert.encode(train_data['text'])
y_train = train_data['target']
X_test = sbert.encode(test_data['text'])

In [11]:
print(X_train.shape, y_train.shape, X_test.shape)

(7613, 384) (7613,) (3263, 384)


### 4 &ensp; Modeling

In [20]:
xgb = XGBClassifier()
mlp = MLPClassifier(hidden_layer_sizes=[128, 32])

scorers = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

xgb_scores = cross_validate(xgb, X_train, y_train, scoring=scorers)
mlp_scores = cross_validate(mlp, X_train, y_train, scoring=scorers)

In [22]:
xgb_scores, mlp_scores

({'fit_time': array([26.17400861, 28.253057  , 32.98841739, 29.6294589 , 29.04147959]),
  'score_time': array([0.04101777, 0.03999901, 0.07157898, 0.03766918, 0.03999567]),
  'test_accuracy': array([0.79054498, 0.74720946, 0.76493762, 0.77069645, 0.81603154]),
  'test_f1': array([0.72664953, 0.68775345, 0.72248062, 0.70746018, 0.79166667]),
  'test_roc_auc': array([0.77297288, 0.73497605, 0.75845553, 0.75523366, 0.815714  ])},
 {'fit_time': array([ 8.84286475,  5.1874826 ,  6.37698054,  8.33567452, 15.79202819]),
  'score_time': array([0.01552987, 0.0115304 , 0.01504827, 0.01499844, 0.01300502]),
  'test_accuracy': array([0.7760998 , 0.7202889 , 0.71634931, 0.73061761, 0.79106439]),
  'test_f1': array([0.72610442, 0.63084922, 0.68786127, 0.69494048, 0.76923077]),
  'test_roc_auc': array([0.76554508, 0.7000366 , 0.71776938, 0.72857742, 0.79344762])})

In [45]:
metrics = ['test_accuracy', 'test_f1', 'test_roc_auc']

for m in metrics:
    print(f"XGB {m}: {xgb_scores[m].mean()}")
    print(f"MLP {m}: {mlp_scores[m].mean()}")

XGB test_accuracy: 0.7778840089283634
MLP test_accuracy: 0.7468840028886896
XGB test_f1: 0.7272020893628752
MLP test_f1: 0.7017972309744429
XGB test_roc_auc: 0.7674704249229194
MLP test_roc_auc: 0.7410752201133428


### 5 &ensp; Evaluation

In [None]:
xgb.fit(X_train, y_train)

In [53]:
pd.DataFrame({
    "id": test_data["id"],
    "target": xgb.predict(X_test)
}).to_csv("data/submission.csv", index=False)