# Source
- Original Paper: https://ojs.aaai.org/index.php/ICWSM/article/view/18115
- GitHub benchmarks: https://github.com/firojalam/crisis_datasets_benchmarks
- Dataset URL: https://crisisnlp.qcri.org/crisis_datasets_benchmarks.html

# Preprocess

In [1]:
import glob
import html
import os
import re
import unicodedata

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [5]:
# Paths of the train / dev / test TSV files inside the dataset folder
base_dir = "_data/1_Fast_CrisisNLP_Twitter_Disaster/"
train_file, dev_file, test_file = (
    os.path.join(base_dir, tsv_name)
    for tsv_name in (
        "crisis_consolidated_humanitarian_filtered_lang_en_train.tsv",
        "crisis_consolidated_humanitarian_filtered_lang_en_dev.tsv",
        "crisis_consolidated_humanitarian_filtered_lang_en_test.tsv",
    )
)

# Load each split as a tab-separated table
train_df, dev_df, test_df = (
    pd.read_csv(tsv_path, sep='\t') for tsv_path in (train_file, dev_file, test_file)
)

# Tag every row with the split it was read from, so the splits can be
# recovered after they are stacked together
for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
    split_df['group'] = split_name

# Stack the three splits into a single frame with a fresh 0..n-1 index
combined_df = pd.concat([train_df, dev_df, test_df], ignore_index=True)

print(f"Total rows: {len(combined_df)}")
print(combined_df['group'].value_counts())
combined_df

Total rows: 87455
group
train    61164
test     17356
dev       8935
Name: count, dtype: int64


Unnamed: 0,id,event,source,text,lang,lang_conf,class_label,group
0,18582,disaster_events,drd-figureeight-multimedia,Approximately 100km long firebreaks have been ...,en,1.0,infrastructure_and_utilities_damage,train
1,592616302138658817,2015_nepal_earthquake,crisisnlp-volunteers,God bless you... https://t.co/AnEy1ydkkz,en,,not_humanitarian,train
2,503643491143282688,2014_california_earthquake,crisisnlp-cf,"RT @perreaux: Cracked wine casks, damaged hist...",en,,infrastructure_and_utilities_damage,train
3,323833109051228160,2013_boston_bombings-ontopic,crisislext6,I'm really just excited for new undies and pin...,en,1.0,not_humanitarian,train
4,508333923886067712,2014_pakistan_floods,crisisnlp-cf,"Rescue effort expands in India, Pakistan as fl...",en,1.0,injured_or_dead_people,train
...,...,...,...,...,...,...,...,...
87450,508338283495960577,2014_pakistan_floods,crisisnlp-cf,RT @Donkeyji: Ex Australian PM Kevin Rudd help...,en,1.0,donation_and_volunteering,test
87451,912959167417192448,hurricane_maria,crisismmd,RT @CARICOMorg: #HurricaneMaria could cost Dom...,en,,infrastructure_and_utilities_damage,test
87452,451189795679522816,2014_chile_earthquake,crisisnlp-cf,8.2 quake in Chile and there's a likely chance...,en,1.0,sympathy_and_support,test
87453,19677,disaster_events,drd-figureeight-multimedia,The provision of water purification systems an...,en,1.0,requests_or_needs,test


In [None]:
# The dataset is an English-only subset of the original, so the language
# columns can go; the Twitter ids are not needed either.
combined_df = combined_df.drop(columns=['lang', 'lang_conf', 'id'])
combined_df

Unnamed: 0,event,source,text,class_label,group
0,disaster_events,drd-figureeight-multimedia,Approximately 100km long firebreaks have been ...,infrastructure_and_utilities_damage,train
1,2015_nepal_earthquake,crisisnlp-volunteers,God bless you... https://t.co/AnEy1ydkkz,not_humanitarian,train
2,2014_california_earthquake,crisisnlp-cf,"RT @perreaux: Cracked wine casks, damaged hist...",infrastructure_and_utilities_damage,train
3,2013_boston_bombings-ontopic,crisislext6,I'm really just excited for new undies and pin...,not_humanitarian,train
4,2014_pakistan_floods,crisisnlp-cf,"Rescue effort expands in India, Pakistan as fl...",injured_or_dead_people,train
...,...,...,...,...,...
87450,2014_pakistan_floods,crisisnlp-cf,RT @Donkeyji: Ex Australian PM Kevin Rudd help...,donation_and_volunteering,test
87451,hurricane_maria,crisismmd,RT @CARICOMorg: #HurricaneMaria could cost Dom...,infrastructure_and_utilities_damage,test
87452,2014_chile_earthquake,crisisnlp-cf,8.2 quake in Chile and there's a likely chance...,sympathy_and_support,test
87453,disaster_events,drd-figureeight-multimedia,The provision of water purification systems an...,requests_or_needs,test


In [8]:
# Number of tweets per disaster event
event_counts = combined_df['event'].value_counts()
print(f"Dataset created with {len(combined_df)} tweets across {combined_df['event'].nunique()} events")
print(f"Event distribution:\n{event_counts}")

# Number of tweets per class label
class_counts = combined_df['class_label'].value_counts()
print(f"Dataset created with {len(combined_df)} tweets across {combined_df['class_label'].nunique()} classes")
print(f"Class distribution:\n{class_counts}")

Dataset created with 87455 tweets across 60 events
Event distribution:
event
disaster_events                            12551
2015_nepal_earthquake                       9653
2014_philippines_typhoon-hagupit            5766
2013_oklahoma_tornado-ontopic               4988
2013_west_texas                             4624
2013_alberta_floods-ontopic                 4562
2013_queensland_floods-ontopic              4390
2013_boston_bombings-ontopic                4176
2012_sandy_hurricane-ontopic                3679
2014-2015_worldwide_landslides              2797
hurricane_maria                             2734
hurricane_harvey                            2554
hurricane_irma                              2059
2015_vanuatu_cyclone                        1448
2014_chile_earthquake                       1437
2014_pakistan_floods                        1009
2013_pakistan_earthquake                     997
2014_philippines_typhoon                     957
california_wildfires                     

# Cleaning and normalization

In [11]:
# Download the NLTK resources used by the normalization step below.
# 'punkt_tab' is fetched in addition to 'punkt': recent NLTK releases load the
# tokenizer models for nltk.word_tokenize from 'punkt_tab', and raise a
# LookupError on a fresh environment when only 'punkt' is present. Older
# releases keep using 'punkt', so both are downloaded.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Define stop words and initialize stemmer/lemmatizer
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andras.janko\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andras.janko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andras.janko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def twitter_preprocess(text):
    """Clean a raw tweet into lowercase, space-separated ASCII words.

    Steps, in order: decode HTML entities, fold accented letters to their
    base letter, lowercase, drop retweet markers, URLs and @mentions, keep
    hashtag words without the '#', replace remaining non-ASCII characters,
    punctuation (including '_') and digits with spaces, and collapse
    whitespace.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (e.g. NaN from a
            missing cell) is treated as an empty tweet.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""  # Return empty string for non-string inputs

    # Decode HTML entities first ("&amp;" -> "&"); otherwise the entity name
    # survives punctuation removal as a junk token such as "amp".
    text = html.unescape(text)

    # Fold accents ("café" -> "cafe") so the non-ASCII step below does not
    # truncate or split accented words. Done before lowercasing because
    # compatibility decomposition can yield uppercase letters.
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))

    # Convert to lowercase
    text = text.lower()

    # 1. Handle retweets ("rt @username:") anywhere in the tweet, with or
    #    without the trailing colon, so no stray "rt" token is left behind
    text = re.sub(r'\brt\s*@\w+:?', ' ', text)

    # 2. Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # 3. Handle hashtags -> Keep the word but remove the # symbol
    text = re.sub(r'#(\w+)', r'\1', text)

    # 4. Handle mentions -> Remove mentions completely
    text = re.sub(r'@\w+', '', text)

    # 5. Replace emojis and other non-ASCII characters with a space, so the
    #    words around them are not glued together and typographic
    #    apostrophes behave like ASCII ones ("don’t" -> "don t")
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # 6. Remove punctuation and special characters; '_' is listed explicitly
    #    because \w matches it
    text = re.sub(r'[^\w\s]|_', ' ', text)

    # 7. Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # 8. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every raw tweet and store the result next to the original text
combined_df['cleaned_text'] = combined_df['text'].map(twitter_preprocess)
combined_df


Unnamed: 0,event,source,text,class_label,group,cleaned_text
0,disaster_events,drd-figureeight-multimedia,Approximately 100km long firebreaks have been ...,infrastructure_and_utilities_damage,train,approximately km long firebreaks have been con...
1,2015_nepal_earthquake,crisisnlp-volunteers,God bless you... https://t.co/AnEy1ydkkz,not_humanitarian,train,god bless you
2,2014_california_earthquake,crisisnlp-cf,"RT @perreaux: Cracked wine casks, damaged hist...",infrastructure_and_utilities_damage,train,cracked wine casks damaged historical building...
3,2013_boston_bombings-ontopic,crisislext6,I'm really just excited for new undies and pin...,not_humanitarian,train,i m really just excited for new undies and pin...
4,2014_pakistan_floods,crisisnlp-cf,"Rescue effort expands in India, Pakistan as fl...",injured_or_dead_people,train,rescue effort expands in india pakistan as flo...
...,...,...,...,...,...,...
87450,2014_pakistan_floods,crisisnlp-cf,RT @Donkeyji: Ex Australian PM Kevin Rudd help...,donation_and_volunteering,test,ex australian pm kevin rudd helping flood vict...
87451,hurricane_maria,crisismmd,RT @CARICOMorg: #HurricaneMaria could cost Dom...,infrastructure_and_utilities_damage,test,hurricanemaria could cost dominica billions of...
87452,2014_chile_earthquake,crisisnlp-cf,8.2 quake in Chile and there's a likely chance...,sympathy_and_support,test,quake in chile and there s a likely chance of ...
87453,disaster_events,drd-figureeight-multimedia,The provision of water purification systems an...,requests_or_needs,test,the provision of water purification systems an...


In [None]:
# lemmatization
def normalize_text(text):
    """Tokenize cleaned text, drop English stop words and lemmatize the rest.

    Args:
        text: Cleaned tweet text. Anything that is not a ``str`` (e.g. NaN)
            is treated as empty, matching ``twitter_preprocess``.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or ``""``
        for non-string input.
    """
    # Guard against NaN / None so a missing cell does not crash the tokenizer
    if not isinstance(text, str):
        return ""

    # Filter and lemmatize in a single pass over the tokens
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    )

# Normalize every cleaned tweet
combined_df['normalized_text'] = combined_df['cleaned_text'].map(normalize_text)

# Show the first tweet at each stage (first 200 characters)
for heading, column in (
    ("Original text:", 'text'),
    ("\nCleaned text:", 'cleaned_text'),
    ("\nNormalized text:", 'normalized_text'),
):
    print(heading)
    print(combined_df[column].iloc[0][:200])

# Write the frame, including the new text columns, to CSV without the index
combined_df.to_csv("_data/1_Fast_CrisisNLP_Twitter_Disaster/crisis_aggregated_preprocessed.csv", index=False)
combined_df

Original text:
Approximately 100km long firebreaks have been constructed by Indonesia and elsewhere.

Cleaned text:
approximately km long firebreaks have been constructed by indonesia and elsewhere

Normalized text:
approximately km long firebreak constructed indonesia elsewhere


Unnamed: 0,event,source,text,class_label,group,cleaned_text,normalized_text
0,disaster_events,drd-figureeight-multimedia,Approximately 100km long firebreaks have been ...,infrastructure_and_utilities_damage,train,approximately km long firebreaks have been con...,approximately km long firebreak constructed in...
1,2015_nepal_earthquake,crisisnlp-volunteers,God bless you... https://t.co/AnEy1ydkkz,not_humanitarian,train,god bless you,god bless
2,2014_california_earthquake,crisisnlp-cf,"RT @perreaux: Cracked wine casks, damaged hist...",infrastructure_and_utilities_damage,train,cracked wine casks damaged historical building...,cracked wine cask damaged historical building ...
3,2013_boston_bombings-ontopic,crisislext6,I'm really just excited for new undies and pin...,not_humanitarian,train,i m really just excited for new undies and pin...,really excited new undies pinkberry
4,2014_pakistan_floods,crisisnlp-cf,"Rescue effort expands in India, Pakistan as fl...",injured_or_dead_people,train,rescue effort expands in india pakistan as flo...,rescue effort expands india pakistan flood dea...
...,...,...,...,...,...,...,...
87450,2014_pakistan_floods,crisisnlp-cf,RT @Donkeyji: Ex Australian PM Kevin Rudd help...,donation_and_volunteering,test,ex australian pm kevin rudd helping flood vict...,ex australian pm kevin rudd helping flood vict...
87451,hurricane_maria,crisismmd,RT @CARICOMorg: #HurricaneMaria could cost Dom...,infrastructure_and_utilities_damage,test,hurricanemaria could cost dominica billions of...,hurricanemaria could cost dominica billion dol...
87452,2014_chile_earthquake,crisisnlp-cf,8.2 quake in Chile and there's a likely chance...,sympathy_and_support,test,quake in chile and there s a likely chance of ...,quake chile likely chance tsunami omg prayforc...
87453,disaster_events,drd-figureeight-multimedia,The provision of water purification systems an...,requests_or_needs,test,the provision of water purification systems an...,provision water purification system well clean...
