In [16]:
import os
import re
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm


In [17]:
def read_files(path):
    """Load the labeled tweets of every disaster folder found under ``path``.

    Each sub-folder ``<name>`` is expected to hold a CSV file called
    ``<name>-tweets_labeled.csv``. Its first row is treated as a header and
    skipped; in every other row, column 1 is read as the tweet text and
    column 3 as the label.

    Rows labeled ``'Not labeled'`` are dropped, and every ``http...`` link in
    the text is replaced by the literal token ``URL``.

    Args:
        path: Directory whose sub-folders each describe one disaster.

    Returns:
        A ``(data, uniq_labels)`` tuple. ``data`` is a list of
        ``{'text': ..., 'label': ...}`` dicts; ``uniq_labels`` is a dict
        mapping each label that was kept to itself, in first-seen order.

    Raises:
        FileNotFoundError: If ``path`` does not exist or a sub-folder lacks
            its ``-tweets_labeled.csv`` file.
    """
    data = []
    uniq_labels = {}

    # os.listdir() yields entries in arbitrary order. Sorting keeps the row
    # order, and therefore any seeded split made from it, stable across
    # machines and file systems.
    for disaster_folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, disaster_folder)
        # Ignore stray files (e.g. .DS_Store) sitting next to the folders.
        if not os.path.isdir(folder_path):
            continue

        labeled_csv_file = disaster_folder + '-tweets_labeled.csv'
        csv_file_path = os.path.join(folder_path, labeled_csv_file)

        with open(csv_file_path, newline='', encoding="utf8") as f:
            reader = csv.reader(f, delimiter=',')
            # Skip the header; the default avoids StopIteration on an empty file.
            next(reader, None)
            for line in reader:
                # Blank or truncated rows have no label column to read.
                if len(line) < 4:
                    continue
                label = line[3]
                if label == 'Not labeled':
                    continue
                text = re.sub(r'http\S+', 'URL', line[1])
                uniq_labels[label] = label
                data.append({'text': text, 'label': label})
    return data, uniq_labels
        

In [18]:
# Wildfire events: the labeled examples and the labels that occur in them.
wf_data, wf_labels = read_files(
    './CrisisLexT26_Flood_Wildfire/wildfires'
)

In [19]:
# Flood events: the labeled examples and the labels that occur in them.
flood_data, flood_labels = read_files(
    './CrisisLexT26_Flood_Wildfire/floods'
)

In [20]:
# Events under the "manmade" folder: labeled examples and their labels.
mm_data, mm_labels = read_files(
    './CrisisLexT26_Flood_Wildfire/manmade'
)

In [21]:
# Number of labeled examples per dataset, as (floods, wildfires, manmade).
tuple(len(dataset) for dataset in (flood_data, wf_data, mm_data))

(2827, 1902, 3759)

In [22]:
# Hold out 25% of the flood examples as the test set; the seed is fixed so
# the split is repeatable.
flood_train, flood_test = train_test_split(
    flood_data,
    test_size=0.25,
    random_state=11,
)

In [23]:
# Carve the dev set (30%) out of the training portion of the flood data.
#
# The training portion is re-derived from flood_data with the same
# test_size/random_state as the train/test split, which reproduces exactly
# the same rows. Splitting `flood_train` in place instead would shrink it a
# little more every time this cell is re-run.
flood_train_full, _ = train_test_split(
    flood_data,
    test_size=0.25,
    random_state=11,
)
flood_train, flood_dev = train_test_split(
    flood_train_full,
    test_size=0.3,
    random_state=11,
)

In [24]:
# "manmade" data: hold out 25% for test, then take 30% of what remains as
# the dev set. Both splits use the same fixed seed.
mm_train, mm_test = train_test_split(
    mm_data,
    test_size=0.25,
    random_state=11,
)
mm_train, mm_dev = train_test_split(
    mm_train,
    test_size=0.3,
    random_state=11,
)

In [25]:
def convert_to_file(listt, file_name):
    """Write examples to ``file_name`` as one ``text<TAB>label`` line each.

    Tabs, carriage returns and line feeds inside a field are replaced by a
    single space each, so that every example occupies exactly one line with
    exactly one tab separator. The parent directory of ``file_name`` is
    created when it does not exist yet. An existing file is overwritten.

    Args:
        listt: Iterable of dicts with string values under ``'text'`` and
            ``'label'``.
        file_name: Path of the UTF-8 text file to write.
    """
    # A raw tab or line break inside a field would split one example over
    # several columns or lines of the output.
    flatten = str.maketrans({'\t': ' ', '\r': ' ', '\n': ' '})

    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(file_name, 'w', encoding="utf-8") as f:
        for item in listt:
            text = item['text'].translate(flatten)
            label = item['label'].translate(flatten)
            f.write("%s\t%s\n" % (text, label))

In [26]:
# Flood training split.
convert_to_file(
    listt=flood_train,
    file_name='./CrisisLexT26_Flood_Wildfire/floods_processed/train.txt',
)

In [27]:
# Flood dev split.
convert_to_file(
    listt=flood_dev,
    file_name='./CrisisLexT26_Flood_Wildfire/floods_processed/dev.txt',
)

In [28]:
# Flood test split.
convert_to_file(
    listt=flood_test,
    file_name='./CrisisLexT26_Flood_Wildfire/floods_processed/test.txt',
)

In [29]:
# Wildfire data: hold out 25% for test, then 25% of what remains for dev.
# NOTE(review): the flood and manmade dev splits use test_size=0.3 where
# this one uses 0.25 -- confirm the difference is intentional.
wf_train, wf_test = train_test_split(
    wf_data,
    test_size=0.25,
    random_state=11,
)
wf_train, wf_dev = train_test_split(
    wf_train,
    test_size=0.25,
    random_state=11,
)

In [30]:
# Write the three wildfire splits, one file per split.
convert_to_file(wf_train, './CrisisLexT26_Flood_Wildfire/wildfires_processed/train.txt')
convert_to_file(wf_test, './CrisisLexT26_Flood_Wildfire/wildfires_processed/test.txt')
# The dev split goes to dev.txt; writing it to test.txt would overwrite the
# test split written on the line above.
convert_to_file(wf_dev, './CrisisLexT26_Flood_Wildfire/wildfires_processed/dev.txt')

In [31]:
# Write the three "manmade" splits, one file per split.
convert_to_file(
    listt=mm_train,
    file_name='./CrisisLexT26_Flood_Wildfire/manmade_processed/train.txt',
)
convert_to_file(
    listt=mm_test,
    file_name='./CrisisLexT26_Flood_Wildfire/manmade_processed/test.txt',
)
convert_to_file(
    listt=mm_dev,
    file_name='./CrisisLexT26_Flood_Wildfire/manmade_processed/dev.txt',
)