In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df = pd.read_csv('data/train.csv')
print(df.head(5))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


# Statistics

In [None]:
df_length = len(df)

cnt_disasters = len(df[df['target'] == 1])
cnt_not_disasters = df_length - cnt_disasters

print(f"Number of events: {df_length}")
print(f"Number of disasters: {cnt_disasters}")
print(f"Number of false disasters: {cnt_not_disasters}")
print()

prob_disasters = cnt_disasters / df_length
print(f"Percenrage of disasters: {100 * prob_disasters:0.2f}%")
print(f"Percentage of false disasters: {100 * (1 - prob_disasters):0.2f}%")
print()

condition_disaster_location = (df['target'] == 1) & (df['location'].isna() == False)
cnt_disasters_location = len(df[condition_disaster_location]) / cnt_disasters
print(f"Percentage of disasters with location provided: {100 * cnt_disasters_location:0.2f}%")
print(f"Percentage of disasters without location provided: {100 * (1 - cnt_disasters_location):0.2f}%")
print()

condition_disaster_keyword = (df['target'] == 1) & (df['keyword'].isna() == False)
cnt_disasters_keyword = len(df[condition_disaster_keyword]) / cnt_disasters
print(f"Percentage of disasters with location provided: {100 * cnt_disasters_keyword:0.2f}%")
print(f"Percentage of disasters without location provided: {100 * (1 - cnt_disasters_keyword):0.2f}%")
print()


In [None]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
print(stopwords.words('english'))

In [2]:
import re
import string
from nltk.corpus import stopwords 

stop_words = set(stopwords.words('english'))

def remove_links(text):
    pattern = r"https?://\S+"
    text = re.sub(pattern, '', text)
    return text

def remove_numbers(text):
    pattern = r"\d+"
    text = re.sub(pattern, "", text)
    return text

def preprocess(text: str):
    text = text.lower()
    text = remove_links(text) 
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = remove_numbers(text)
    text = " ".join([w for w in text.split() if w not in stop_words])
    return text

In [None]:
dataset_complete = df[['keyword', 'location', 'text']].fillna('').agg(' '.join, axis=1).str.strip()
print(dataset_complete.head(5))

dataset = dataset_complete.apply(preprocess)
dataset = dataset.values.tolist()

In [7]:
dataset_clear = df[['text']].agg(' '.join, axis=1)
print(dataset_clear.head(5))

dataset = dataset_clear.apply(preprocess)
dataset = dataset.values.tolist()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
dtype: object


In [None]:
dataset_complete = df[['keyword', 'text']].fillna('').agg(' '.join, axis=1).str.strip()
print(dataset_complete.head(5))

dataset = dataset_complete.apply(preprocess)
dataset = dataset.values.tolist()

In [None]:
dataset_complete = df[['location', 'text']].fillna('').agg(' '.join, axis=1).str.strip()
print(dataset_complete.head(5))

dataset = dataset_complete.apply(preprocess)
dataset = dataset.values.tolist()

In [None]:
# TFIDF + DecisionTree

# F1 Score = F1 Score * 100
# Keyword + location + text -> F1 Score 66.82
# Keyword + text            -> F1 Score 68.39
# Location + text           -> F1 Score 68.27
# Text                      -> F1 Score 69.02

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(dataset)
y = df['target'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = f1_score(y_test, y_pred)
print(f"F1 Score: {100 * accuracy:0.3f}")

In [None]:
# HashVectorizer + DecisionTree
# F1Score = F1Score * 100
# Text + n_features=32-> F1 Score - 52.94
# Text + n_features=4196 -> F1 Score - 66.57

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

vectorizer = HashingVectorizer(n_features=4196)
X = vectorizer.transform(dataset)
y = df['target'].tolist()
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = f1_score(y_test, y_pred)
print(f"F1 Score: {100 * accuracy:0.3f}")

In [None]:
# Word2Vec
# Text -> 53.20
from gensim.models import Word2Vec
import numpy as np

tokenized_docs = [doc.split() for doc in dataset]
w2v_model = Word2Vec(tokenized_docs, vector_size=64, window=5, min_count=1, workers=16)

def document_vector(doc):
    tokens = doc.split()
    vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(w2v_model.vector_size)

doc_vectors = [document_vector(doc) for doc in dataset]
X = np.array(doc_vectors)
y = df['target'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = f1_score(y_test, y_pred)
print(f"F1 Score: {100 * accuracy:0.3f}")

In [50]:
dataset_clear = df[['text']].agg(' '.join, axis=1)

dataset = dataset_clear.apply(preprocess)
print(dataset.head(5))
dataset = dataset.values.tolist()

# TFIDF + Logistic Regression

# F1 Score = F1 Score * 100
# Keyword + location + text -> F1 Score 66.82
# Keyword + text            -> F1 Score 68.39
# Location + text           -> F1 Score 68.27
# Text                      -> F1 Score 73.14

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(dataset)
y = df['target'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

print(X_test.shape)
y_pred = clf.predict(X_test)
accuracy = f1_score(y_test, y_pred)
print(f"F1 Score: {100 * accuracy:0.3f}")

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
dtype: object
(1523, 16850)
F1 Score: 73.141


In [None]:
# Test sampling
df_test = pd.read_csv('data/test.csv')
dataset_clear = df_test[['text']].agg(' '.join, axis=1)

dataset = dataset_clear.apply(preprocess)
print(dataset.head(5))
dataset = dataset.values.tolist()

X_test = vectorizer.transform(dataset)

print(X_test.shape)
y_pred = clf.predict(X_test)

0                          happened terrible car crash
1    heard earthquake different cities stay safe ev...
2    forest fire spot pond geese fleeing across str...
3                apocalypse lighting spokane wildfires
4                  typhoon soudelor kills china taiwan
dtype: object
(3263, 16850)


In [57]:
df_test = df_test.drop(['keyword', 'location', 'text'], axis=1)

In [62]:
df_test['target'] = y_pred
df_test[:10]

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [60]:
df_test.to_csv('submission.csv', index=False)