In [42]:
import os
import re
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm


In [19]:
def read_files(path):
    disaster_folders=os.listdir(path)
    
    data=[]
    uniq_labels={}
    
    for disaster_folder in disaster_folders:
        folder_path=os.path.join(path,disaster_folder)
        #print('folderPath',folder_path)
        labeled_csv_file=disaster_folder+'-tweets_labeled.csv'
        #print('labeled_csv_file',labeled_csv_file)
        csv_file_path=os.path.join(folder_path,labeled_csv_file)
        #print('csv_file_path',csv_file_path)
        with open(csv_file_path, newline='',encoding="utf8") as f:
            reader = csv.reader(f, delimiter=',')
            next(reader)
            for line in reader:
                text=line[1]
                label=line[3]
                if label=='Not labeled':
                    continue
                text=re.sub(r'http\S+', 'URL', text)
                if label not in uniq_labels:
                    uniq_labels[label]=label
                data.append({'text':text,'label':label})
    return data,uniq_labels
        

In [25]:
wf_data,wf_labels=read_files('./CrisisLexT26_Flood_Wildfire/wildfires')

In [26]:
flood_data,flood_labels=read_files('./CrisisLexT26_Flood_Wildfire/floods')

In [29]:
flood_labels,wf_labels

({'Affected individuals': 'Affected individuals',
  'Caution and advice': 'Caution and advice',
  'Not applicable': 'Not applicable',
  'Other Useful Information': 'Other Useful Information',
  'Infrastructure and utilities': 'Infrastructure and utilities',
  'Sympathy and support': 'Sympathy and support',
  'Donations and volunteering': 'Donations and volunteering'},
 {'Other Useful Information': 'Other Useful Information',
  'Caution and advice': 'Caution and advice',
  'Affected individuals': 'Affected individuals',
  'Infrastructure and utilities': 'Infrastructure and utilities',
  'Donations and volunteering': 'Donations and volunteering',
  'Sympathy and support': 'Sympathy and support',
  'Not applicable': 'Not applicable'})

In [28]:
len(flood_data),len(wf_data)

(2827, 1902)

In [30]:
flood_train ,flood_test = train_test_split(flood_data,test_size=0.25,random_state=11)

In [32]:
flood_train,flood_dev=train_test_split(flood_train,test_size=0.3,random_state=11)

In [36]:
def convert_to_file(listt,file_name):
    with open(file_name, 'w',encoding="utf-8") as f:
        for item in listt:
            data=item['text']+'\t'+item['label']
            f.write("%s\n" % data)

In [37]:
convert_to_file(flood_train,'./CrisisLexT26_Flood_Wildfire/floods_processed/train.txt')

In [38]:
convert_to_file(flood_dev,'./CrisisLexT26_Flood_Wildfire/floods_processed/dev.txt')

In [39]:
convert_to_file(flood_test,'./CrisisLexT26_Flood_Wildfire/floods_processed/test.txt')

In [40]:
wf_train ,wf_test = train_test_split(wf_data,test_size=0.25,random_state=11)

In [41]:
convert_to_file(wf_train,'./CrisisLexT26_Flood_Wildfire/wildfires_processed/train.txt')
convert_to_file(wf_test,'./CrisisLexT26_Flood_Wildfire/wildfires_processed/test.txt')

In [59]:
class DisasterProcessor:
    """Data loader for SNLI."""

    def __init__(self):
        self.label_map = {'Affected individuals': 0, 'Caution and advice': 1, 'Not applicable': 2,'Other Useful Information':3,'Infrastructure and utilities':4,'Sympathy and support':5,'Donations and volunteering':6}

    def valid_inputs(self, sentence1, label):
        return len(sentence1) > 0  and label in self.label_map

    def load_samples(self, path):
        samples = []
        with open(path, newline='',encoding='utf-8') as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader)  # skip header
            desc = f'loading \'{path}\''
            for row in tqdm(reader, desc=desc):
                try:
                    sentence1 = row[0]
                    label = row[1]
                    if self.valid_inputs(sentence1, label):
                        label = self.label_map[label]
                        samples.append((sentence1, label))
                except:
                    pass
        return samples

In [60]:
processor=DisasterProcessor()
samples=processor.load_samples('./CrisisLexT26_Flood_Wildfire/wildfires_processed/train.txt')

loading './CrisisLexT26_Flood_Wildfire/wildfires_processed/train.txt': 763it [00:00, 7574.67it/s]

['RT @NationalPrayer: PRAYER ALERT: Remain in prayer for those fighting the #WaldoCanyonFire in Colorado Springs &amp; the 32,000+ people  ...', 'Sympathy and support']
Sympathy and support
['RT @akcdoglovers: AKC CAR donates to help displaced animals in the Colorado wildfires URL', 'Donations and volunteering']
Donations and volunteering
['RT @Sarakateee: RIP Colorado. Cause of death: fire.', 'Sympathy and support']
Sympathy and support
['@PortDouglasSK yes some good rain would be very beneficial #nswfires', 'Other Useful Information']
Other Useful Information
['Wildfires worsen in Colorado, 32,000 flee homes URL via @reuters', 'Affected individuals']
Affected individuals
['RT @SheilaWalsh: Praying for divine intervention  and protection for all those affected by the fires in Colorado Springs. #waldocanyonfire', 'Sympathy and support']
Sympathy and support
['RT @PadsCast: The fires are in Colorado Springs but there is a haze in the sky in Denver and you can smell the smoke because of 

loading './CrisisLexT26_Flood_Wildfire/wildfires_processed/train.txt': 1425it [00:00, 7038.50it/s]


Caution and advice
['RT @ScubaGuev: Whoever is starting these fires in Colorado needs to die', 'Other Useful Information']
Other Useful Information
['High Park Wildfires Whip Through Colorado URL', 'Other Useful Information']
Other Useful Information
["RT @freddoso: A lot has happened since Obama last called Colorado's governor about the fires. 18 Obama fundraisers, for example URL ...", 'Other Useful Information']
Other Useful Information
['. @NSWRFS not in need of food/blankets -please donate to official sources.Full list can be found at our web #nswfires URL', 'Donations and volunteering']
Donations and volunteering
['All these bush fires in NSW make me appreciate living in Brisbane. Although I have the threat of floods……', 'Sympathy and support']
Sympathy and support
['RT @YMontepara: Colorado wildfires wreck homes but officials hope for break in weather URL', 'Infrastructure and utilities']
Infrastructure and utilities
['RT @cnnireport: CO #wildfire changing the landscape, and th




In [61]:
samples

[('RT @NationalPrayer: PRAYER ALERT: Remain in prayer for those fighting the #WaldoCanyonFire in Colorado Springs &amp; the 32,000+ people  ...',
  5),
 ('RT @akcdoglovers: AKC CAR donates to help displaced animals in the Colorado wildfires URL',
  6),
 ('RT @Sarakateee: RIP Colorado. Cause of death: fire.', 5),
 ('@PortDouglasSK yes some good rain would be very beneficial #nswfires', 3),
 ('Wildfires worsen in Colorado, 32,000 flee homes URL via @reuters', 0),
 ('RT @SheilaWalsh: Praying for divine intervention  and protection for all those affected by the fires in Colorado Springs. #waldocanyonfire',
  5),
 ('RT @PadsCast: The fires are in Colorado Springs but there is a haze in the sky in Denver and you can smell the smoke because of the high ...',
  3),
 ('RT @KimberleyL: :) RT @alexlee_24: @KimberleyL Two fires at Sydney Water and Sydney Aquatic Centre within a week. When did fire develop a s…',
  2),
 ('Thinking of my bestie in colorado.. So many wildfires and not the latest one 