### 1. load dataset

In [1]:
import pandas as pd
import json
import codecs
import gzip

import os
file_list = os.listdir('./data')

# load dataset.
data = []
for f in file_list:
    file_path = 'data/'+f
    if file_path.endswith('.gz'): 
        with gzip.GzipFile(file_path, 'r') as gz:
            for text in gz:
                data.append(json.loads(text))
    
# load labels.
labels = []
with open('TRECIS_2018_2019-labels.json', 'r') as f:
    for i in json.load(f):
        labels.append(i)

In [2]:
# some data are not labeled. 
print("the size of dataset: ",len(data))
print("the size of labeled dataset: ", len(labels))

the size of dataset:  50490
the size of labeled dataset:  38478


In [3]:
labels[0] # 'postID'.

{'eventID': 'joplinTornado2011',
 'eventName': '2011 Joplin Tornado',
 'eventDescription': "The 2011 Joplin tornado was a catastrophic EF5-rated multiple-vortex tornado that struck Joplin, Missouri, late in the afternoon of Sunday, May 22, 2011. The user is a response officer in the Missouri command and control center responsible for impact to the state. <a href='https://en.wikipedia.org/wiki/2011_Joplin_tornado' target='_blank'>Wikipedia Page<a>",
 'eventType': 'Unknown',
 'postID': '72676276212731904',
 'postCategories': ['Factoid', 'Hashtags', 'News'],
 'postPriority': 'Low'}

In [4]:
print("id: ", data[0]['allProperties']['docno'])
print("content: ", data[0]['allProperties']['content'])

id:  230634706436227073
content:  Philippine floods worsen as death toll hits 14 in wake of ‘Gener’ http://t.co/gnhkYRS6 #Philippines #national


In [5]:
# match labels and data.
labels_dict = dict()

for i, l in enumerate(labels):
    labels_dict[l['postID']] = i

In [6]:
# construct dataframe.
raw_dataset = pd.DataFrame(columns=['content', 'categories', 'priority'])

for doc in data:
    doc_id, doc_cont = doc['allProperties']['docno'], doc['allProperties']['content']
    if doc_id not in labels_dict: continue
    index = labels_dict[doc_id]
    raw_dataset = raw_dataset.append({'content': doc_cont, 'categories': labels[index]['postCategories'], 'priority': labels[index]['postPriority']}, ignore_index=True)

raw_dataset[:10]

Unnamed: 0,content,categories,priority
0,Philippine floods worsen as death toll hits 14...,"[ThirdPartyObservation, Factoid, News]",Low
1,Philippine flood fatalities hit 23 http://t.co...,"[ThirdPartyObservation, Factoid, News]",Low
2,4 Luzon dams release water; flood warning up: ...,"[ThirdPartyObservation, Factoid, News]",Low
3,RT @ANCALERTS: PAGASA 8:50pm advisory: Yellow ...,"[ThirdPartyObservation, Factoid, News]",Low
4,RT @ANCALERTS: PAGASA 4am advisory: Green warn...,"[ThirdPartyObservation, News]",Low
5,RT @PHLNewsOnline: [ANC] PAGASA 4am advisory:...,"[ThirdPartyObservation, News]",Low
6,RT @ANCALERTS: PAGASA 5pm advisory: Green warn...,"[ThirdPartyObservation, News]",Low
7,“@ANCALERTS: PAGASA 9pm advisory: RED warning ...,"[ThirdPartyObservation, EmergingThreats, News]",Medium
8,‎​PAGASA 9pm advisory: RED warning for Metro M...,"[ThirdPartyObservation, EmergingThreats, News]",Medium
9,"If floods end up banging up my Manila badly, c...","[ThirdPartyObservation, Advice]",Low


In [7]:
len(raw_dataset)

37373

### 2. data cleaning

In [8]:
import numpy as np
import re
from nltk.stem import PorterStemmer

stopwords = ['a','in','on','at','and','or', 
              'to', 'the', 'of', 'an', 'by', 
              'as', 'is', 'was', 'were', 'been', 'be', 
              'are','for', 'this', 'that', 'these', 'those', 'you', 'i', 'if',
             'it', 'he', 'she', 'we', 'they', 'will', 'have', 'has',
              'do', 'did', 'can', 'could', 'who', 'which', 'what',
              'but', 'not', 'there', 'no', 'does', 'not', 'so', 've', 'their',
             'his', 'her', 'they', 'them', 'from', 'with', 'its']

In [9]:
url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
token_pattern = r"\b[A-Za-z0-9]+\b"
username_pattern = r"@[^\s]+"
hashtag_pattern = r"\B#\w\w+"

In [10]:
new_dataset = pd.DataFrame(columns=['content', 'categories', 'priority'])

for i in range(len(raw_dataset)):
    content = raw_dataset.loc[i, 'content']
    
    # remove url.
    content_url = re.sub(url_pattern, "", content)
    
    # remove @username. 
    content_username = re.sub(username_pattern, "", content_url)
    
    # remove #hashtag.
    content_hashtag = re.sub(hashtag_pattern, "", content_username)
    
    # lower case.
    content_lower = content_hashtag.lower()
    
    # tokenisation.
    words = re.findall(token_pattern, content_lower)
    
    # remove stop word.
    words = [word for word in words if word not in stopwords]
        
    # stemming. 
    ps = PorterStemmer() 
    words = [ps.stem(word) for word in words]
    
    content_processed = " ".join(words)
    
    new_dataset = new_dataset.append({'content': content_processed, 'categories': raw_dataset.loc[index, 'categories'], 'priority': raw_dataset.loc[index, 'priority']}, ignore_index=True)

new_dataset[:10]

Unnamed: 0,content,categories,priority
0,philippin flood worsen death toll hit 14 wake ...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
1,philippin flood fatal hit 23,"[FirstPartyObservation, MultimediaShare, Senti...",Low
2,4 luzon dam releas water flood warn up manila ...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
3,rt pagasa 8 50pm advisori yellow warn metro ma...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
4,rt pagasa 4am advisori green warn metro manila...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
5,rt anc pagasa 4am advisori green warn metro ma...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
6,rt pagasa 5pm advisori green warn metro manila...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
7,pagasa 9pm advisori red warn metro manila heav...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
8,pagasa 9pm advisori red warn metro manila heav...,"[FirstPartyObservation, MultimediaShare, Senti...",Low
9,flood end up bang up my manila badli consid ru...,"[FirstPartyObservation, MultimediaShare, Senti...",Low


In [11]:
print("dataset size: ", len(new_dataset))

print("dataset before cleaning: ", raw_dataset.loc[7, 'content'])

print("dataset after cleaning: ", new_dataset.loc[7, 'content'])

dataset size:  37373
dataset after cleaning:  pagasa 9pm advisori red warn metro manila heavi intens rain next 3 hr flood low lie area near river
