In [1]:
import re

In [2]:
import pandas as pd

In [3]:
import csv

In [4]:
# Load the Dengue articles CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv('TestDengueContent.csv')

In [5]:
# Replace missing values with empty strings
# (the regex helpers defined below call .findall / re.search on the cell text).
df = df.fillna("")

In [6]:
# (rows, columns) of the frame before duplicates are dropped
print(df.shape)

(479, 4)


In [7]:
# Drop rows that are exact duplicates across all columns, keeping the first occurrence.
# Assigning the result (instead of inplace=True) keeps the cell a plain
# "new value from old value" step. The original index labels are preserved,
# so the 'Index' values recorded later are unchanged.
df = df.drop_duplicates(keep='first')

In [8]:
# (rows, columns) of the frame after duplicates were dropped
print(df.shape)

(478, 4)


In [9]:
# Pattern for the publication date in the form ####-##-## (4 digits, 2 digits, 2 digits).
# Kept as a plain pattern string (not compiled); get_published_date passes it to re.search.
regex_for_publication_date = r"\d{4}-\d{2}-\d{2}"

In [10]:
# A run of digits, optionally continued by more digit groups that are joined by
# a comma, a whitespace character, ", " or " ,"  (e.g. "6,203", "14, 500").
# Because ", " is an accepted separator, text such as "2010, 22,926" is
# matched as one single token.
regex_for_numbers = re.compile(
    r'\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*'
)

In [39]:
# A number (digits with optional comma/space grouping) that is immediately
# FOLLOWED by one of the case keywords: 'dengue cases', 'dengue fever cases',
# 'dengue', 'cases', 'suspected dengue cases' or 'suspected cases'.
# Matching is case-insensitive.
regex_for_cases = re.compile(
    r'\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue\scases|dengue\sfever\scases|dengue|cases|suspected\sdengue\scases|suspected\scases)',
    flags=re.IGNORECASE,
)

In [40]:
# regex for deaths or patients (TODO move patients to no. of cases)
#
# Matches a count followed by a death/patient keyword. The count is either
#   * digits with optional comma/space grouping ("24", "5,802", "14, 500"), or
#   * a spelled-out number: one..nineteen, or a tens word (twenty..ninety)
#     optionally followed by a units word ("twenty five").
# Accepted keywords: deaths, death, dead, "[people|patients] [have|had] died|dead",
# patients, "dengue related deaths". Matching is case-insensitive.
#
# Fixes relative to the previous pattern:
#   * "dengue\srelated\deaths" contained "\d" (a digit) where "\s" was intended,
#     so "24 dengue related deaths" could never match.
#   * a "|" was missing after "dead", which glued it onto the optional
#     "people/patients have/had died" branch and made that branch unreachable
#     (it required the literal text "deaddied"/"deaddead").
regex_for_deaths = re.compile(r"""
(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|
(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|
(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))
\s*
(?:deaths|death|dead|(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead)|patients|dengue\srelated\sdeaths)
""", re.VERBOSE | re.IGNORECASE)

In [13]:
# regex to extract sentences having no. of cases/deaths
#
# Shape of the pattern:  [^.]*?  <count + keyword>  [^.]*  \.
#   * the match may not contain a '.' before its final one, so a "sentence" here is
#     a period-free span that ends at the next '.'
#   * <count + keyword> is either a digit group followed by
#     dengue / cases / suspected dengue cases / suspected cases, or a digit group
#     or spelled-out number followed by deaths / death / patients (same building
#     blocks as regex_for_cases and regex_for_deaths).
#   * no flags are passed, so unlike regex_for_cases / regex_for_deaths this
#     pattern is case-sensitive.
# TODO handle for "i.e" (any '.' inside the text, e.g. an abbreviation, ends the span early)
# NOTE(review): "dead(?:people..." looks like it is missing a "|" after "dead";
# as written that branch needs the literal text "dead" directly followed by
# "died"/"dead" -- confirm the intent.
regex_for_sentences = re.compile(r"[^.]*?(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue|cases|suspected\sdengue\scases|suspected\scases)|(?:(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))\s*(?:deaths|death|dead(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead)|patients)))[^.]*\.")

In [14]:
# regex for dates
#
# Three alternatives, tried in this order:
#   1. numeric, year first:  xxxx.xx.xx / xxxx/xx/xx / xxxx-xx-xx
#   2. numeric, year last:   xx.xx.xx .. xx-xx-xxxx
#   3. a month name, optionally followed by a 1-2 digit day ("March 5", "June")
#
# Alternative 3 previously ended in "\s*\d{0,2}", which swallowed trailing
# whitespace and the first two digits of a following year ("June 2017" ->
# "June 20"), and had no word boundaries, so "Mar" matched inside longer words
# such as "Marakolliya". The day is now an optional 1-2 digit group and the
# whole alternative is anchored with \b on both sides.
regex_for_dates = re.compile(r"""
(?:\d{4}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{1,2})|        # year first; month and day have 1 or 2 digits
(?:\d{1,2}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{2,4})|      # year last; ., / or - as separator; year has 2 to 4 digits
(?:\b
   (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)
   (?:\s*\d{1,2})?                                  # optional day of month, 1 or 2 digits
 \b)
""", re.VERBOSE)

In [15]:
# Years written as a stand-alone four-digit number starting with 20 (20xx),
# or the relative phrase "this year" (case-sensitive).
regex_for_years = re.compile(
    r'\b20(?:\d{2}){1}\b|this\syear'
)

In [16]:
# regex for months
#
# Full month names plus the abbreviations Jan, Feb, Mar, Aug, Sep, Oct, Nov, Dec.
# Fixes relative to the previous pattern:
#   * "Dec(?:ember)" lacked the trailing "?", so the abbreviation "Dec" never
#     matched (every other abbreviated month did).
#   * \b anchors stop abbreviations from matching inside longer words
#     (e.g. "Mar" in "Marakolliya").
regex_for_months = re.compile(
    r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?'
    r'|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b'
)

In [55]:
# regex for "so far"
#
# Case-insensitive so that a sentence-initial "So far ..." is found as well,
# and anchored on word boundaries so it does not fire inside other words
# (e.g. "also far").
regex_for_so_far = re.compile(r'\bso\sfar\b', re.IGNORECASE)

In [17]:
# District names, matched case-insensitively. "Trinco" is accepted as a short
# form next to "Trincomalee"; the longer name is listed first so it wins.
regex_for_districts = re.compile(
    r'Jaffna|Kilinochchi|Mannar|Mullaitivu|Vavuniya|Puttalam|Kurunegala|Gampaha|Colombo|Kalutara|Anuradhapura|Polonnaruwa|Matale|Kandy|Nuwara\sEliya|Kegalle|Ratnapura|Trincomalee|Trinco|Batticaloa|Ampara|Badulla|Monaragala|Hambantota|Matara|Galle',
    flags=re.IGNORECASE,
)

In [18]:
# Province names (case-sensitive). Two-word names are written with \s between
# the words, so any single whitespace character separates them.
regex_for_provinces = re.compile(
    r'Western|Central|Eastern|North\sCentral|Northern|North\sWestern|Sabaragamuwa|Southern|Uva'
)

In [54]:
# regex for other places/locations (case-sensitive)
#
# "countrywide" must be listed before "country": alternation takes the first
# alternative that matches, so with "country" first the word "countrywide"
# was always reported as just "country".
regex_for_places = re.compile(r'Sri Lanka|island|countrywide|country|Meethotamulla|Negombo|Ratmalana|Dangolla|Elpitiya|Beliatte|Hatton|Marakolliya|Gampola|Kalubowila')

In [19]:
def get_sentences(text):
    """Return all spans of `text` matched by `regex_for_sentences`, as a list of strings."""
    matched_sentences = regex_for_sentences.findall(text)
    return matched_sentences

In [20]:
def get_published_date(text):
    """Return the first ####-##-## date found in `text`.

    Returns an empty string when `text` contains no such date (for example an
    empty Date cell left behind by ``fillna("")``) instead of failing with
    ``AttributeError`` on the ``None`` that ``re.search`` returns.
    """
    match = re.search(regex_for_publication_date, text)
    if match is None:
        return ""
    return match.group()

In [21]:
def get_numbers(text):
    """Return every number token in `text` found by `regex_for_numbers`, as a list of strings."""
    number_tokens = regex_for_numbers.findall(text)
    return number_tokens

In [41]:
def get_cases(text):
    """Return every "<number> <case keyword>" phrase in `text` found by `regex_for_cases`."""
    case_phrases = regex_for_cases.findall(text)
    return case_phrases

In [42]:
def get_deaths(text):
    """Return every "<count> <death/patient keyword>" phrase in `text` found by `regex_for_deaths`."""
    death_phrases = regex_for_deaths.findall(text)
    return death_phrases

In [24]:
def get_years(text):
    """Return every year mention in `text` found by `regex_for_years`, as a list of strings."""
    year_mentions = regex_for_years.findall(text)
    return year_mentions

In [25]:
def get_months(text):
    """Return every month name in `text` found by `regex_for_months`, as a list of strings."""
    month_mentions = regex_for_months.findall(text)
    return month_mentions

In [26]:
def get_dates(text):
    """Return every date expression in `text` found by `regex_for_dates`, as a list of strings."""
    date_mentions = regex_for_dates.findall(text)
    return date_mentions

In [56]:
def get_so_far(text):
    """Return every occurrence of the phrase matched by `regex_for_so_far` in `text`."""
    so_far_hits = regex_for_so_far.findall(text)
    return so_far_hits

In [27]:
def get_provinces(text):
    """Return every province name in `text` found by `regex_for_provinces`, as a list of strings."""
    province_mentions = regex_for_provinces.findall(text)
    return province_mentions

In [28]:
def get_districts(text):
    """Return every district name in `text` found by `regex_for_districts`, as a list of strings."""
    district_mentions = regex_for_districts.findall(text)
    return district_mentions

In [50]:
def get_places(text):
    """Return every place name in `text` found by `regex_for_places`, as a list of strings."""
    place_mentions = regex_for_places.findall(text)
    return place_mentions

In [None]:
# Place names searched for in the article rows. The lookup later in this notebook
# is a case-sensitive substring test (`location in text`), so the names must be
# capitalised as proper nouns: "kandy" could never match "Kandy".
locations = [
    "Meethotamulla", "Colombo", "Kandy", "Anuradhapura", "Galle", "Matara",
    "Ratmalana", "Dangolla", "Badulla", "Elpitiya", "Beliatte", "Hatton",
    "Marakolliya", "Gampola", "Kalubowila", "Puttalam",
]

In [43]:
# Accumulator for the per-sentence extraction records
# (the extraction cell re-initialises it before filling it).
articles=[]

In [59]:
# Build one record per extracted sentence. Each record carries the index label
# and publication date of the article it came from, plus everything the regex
# helpers find inside that sentence.
articles = []
for index, row in df.iterrows():
    print("Index: " + str(index))
    for sentence in get_sentences(row['Article_content']):
        print(sentence)
        record = {
            'Index': str(index),
            'Publication_date': get_published_date(row['Date']),
            # stored UTF-8 encoded (bytes), not as str
            'Sentence': sentence.encode("utf-8"),
            'Numbers': get_numbers(sentence),
            'Cases': get_cases(sentence),
            'Deaths': get_deaths(sentence),
            'Years': get_years(sentence),
            'Months': get_months(sentence),
            'Dates': get_dates(sentence),
            'Time_duration': get_so_far(sentence),
            'Provinces': get_provinces(sentence),
            'Districts': get_districts(sentence),
            'Places': get_places(sentence),
        }
        articles.append(record)

Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Index: 8
Index: 9
Index: 10
Index: 11
Index: 12
Index: 13
Index: 14
Index: 15
Index: 16
Index: 17
Index: 18
Index: 19
Enduring support of the respective authorities and positive contribution of the UA staff members across the branch network has led Union Manushyathwaya to successfully conduct over 50 dengue awareness programmes, over 100 thalassemia awareness programmes and 45 diabetes awareness programmes.
Index: 20
� �Haemophilia�This is a very rare disease, with only around 2000 patients being present in all of Sri Lanka.
Index: 21
Index: 22
Index: 23
Index: 24
Index: 25
Index: 26
Index: 27
The intermittent rains experienced in many parts of the country right now has increased the threat of a dengue outbreak, while the number of dengue cases reported in the first five weeks of 2018 stood at 6,203 by today, with five deaths.
The Government Epidemiology Unit (GEU) announced today that during the last 12 months of 

 In addition, heat-related deaths in the region among the elderly are expected to increase by about 52,000 cases by 2050 due to climate change, according to data from the World Health Organization.
Index: 143
Index: 144
e 182 days, there were 77000 dengue patients reported.
 As the disease lasts on average 1 week, on any day in the country there were 77000/182 x 7 = 2962 patients either at home or in hospital.
 All of the dengue viruses in Sri Lanka were inside these 2962 patients (and of course the infected mosquitoes).
Index: 145
Index: 146
 ��According to statistics available with the hospital system, there are as many as 83,000 cases reported so far.
Index: 147
Index: 148
Index: 149
 So far this year nearly 76,000 cases of dengue have been reported countrywide with some patients including children succumbing to this lethal affliction.
Index: 150
Index: 151
Index: 152
Index: 153
 Last week the ministry said there were about 70,000 victims with some 200 deaths but since then no stati

Index: 230
Index: 231
Index: 232
Index: 233
Index: 234
Index: 235
Index: 236
Index: 237
Index: 238
Index: 239
 ��In 2016, there had been 54,945 cases reported while only 75 have died.
Index: 240
Index: 241
Index: 242
Some 3500 dengue patients have been found during the past three months of this year throughout the country of which 11 patients had died, Coordinator of the Dengue Eradication Unit of the Health Ministry, Dr.
Index: 243
Within the year 2016, with the support of the Public Health Department of the Colombo Municipal Council and the Public Health Inspector, UA was able to successfully conduct a total of 57 dengue awareness campaigns, over 60 thalassemia awareness programs and 14 diabetes prevention programmes.
 As a result of these efforts, 55 dengue prevention programmes were carried out simultaneously on a single day which was asignificant achievement.
Index: 244
Index: 245
Index: 246
Index: 247
 He has completely cured eight patients suffering from HIV and nearly 75 patien

The Daily Mirror on Wednesday reported the sad news of the loss of 220 precious lives out of 44,395 dengue cases reported last year.
Index: 366
Index: 367
Index: 368
Index: 369
Index: 370
Index: 371
Index: 372
Index: 373
Index: 374
Index: 375
Index: 376
Index: 377
Index: 378
Index: 379
Index: 380
Index: 381
Index: 382
Index: 383
14, 500 dengue patients and 80 deaths for the last five months are no forgivable numbers.
Index: 384
Index: 385
Index: 386
		Meanwhile the Government Epidemiology Unit (GEU) said 5,802 suspected dengue cases and 24 deaths have been reported countrywide during the first two months this year.
Index: 387
Index: 388
Index: 389
Index: 390
Health authorities have warned that the Ratmalana Railway workshop has been a breeding ground for dengue mosquitoes and that 80 dengue positive patients were reported from the area.
Index: 391
Index: 392
Index: 393
Index: 394
Index: 395
In 2010, 22,926 patients and 241 deaths were reported.
 However, only 22,926 patients and 163 de

In [60]:
# Write the records in the 'articles' list to Regex_output.csv, one row per sentence.
fieldnames = ['Index', 'Publication_date', 'Sentence', 'Numbers', 'Cases', 'Deaths', 'Years',
              'Months', 'Dates', 'Time_duration', 'Provinces', 'Districts', 'Places']

# newline='' hands line-ending control to the csv module (otherwise every row is
# followed by a blank line on Windows); the explicit UTF-8 encoding makes the
# output independent of the platform's default encoding.
with open('Regex_output.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for article in articles:
        csv_row = {field: article[field] for field in fieldnames}
        # Sentences are stored UTF-8 encoded; decode them so the file contains the
        # text itself rather than the b'...' repr of a bytes object.
        if isinstance(csv_row['Sentence'], bytes):
            csv_row['Sentence'] = csv_row['Sentence'].decode('utf-8')
        writer.writerow(csv_row)


In [44]:
import re
import csv

import pandas as pd

# Place names to look for. The test below is a case-sensitive substring check,
# so names are capitalised as proper nouns ("kandy" could never match "Kandy").
locations = [
    "Meethotamulla", "Colombo", "Kandy", "Anuradhapura", "Galle", "Matara",
    "Ratmalana", "Dangolla", "Badulla", "Elpitiya", "Beliatte", "Hatton",
    "Marakolliya", "Gampola", "Kalubowila", "Puttalam",
]

df = pd.read_csv('TestDengueContent.csv')
df = df.fillna("")
print(df.shape)

# For every cell of every row, print each known location that occurs in it
# (one line per row/cell/location hit, in row order).
for index, row in df.iterrows():
    for cell_value in row:
        # only text cells can be searched with `in`; skip anything else
        if not isinstance(cell_value, str):
            continue
        for location in locations:
            if location in cell_value:
                print(location)


(479, 4)
Meethotamulla
Colombo
Galle
Colombo
Colombo
Colombo
Meethotamulla
Colombo
Colombo
Colombo
Gampola
Colombo
Colombo
Kalubowila
Colombo
Colombo
Galle
Matara
Colombo
Colombo
Colombo
Meethotamulla
Colombo
Puttalam
Colombo
Meethotamulla
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Galle
Elpitiya
Colombo
Meethotamulla
Colombo
Puttalam
Colombo
Meethotamulla
Colombo
Puttalam
Colombo
Galle
Colombo
Meethotamulla
Colombo
Colombo
Colombo
Hatton
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Matara
Colombo
Colombo
Colombo
Anuradhapura
Galle
Colombo
Colombo
Galle
Matara
Colombo
Colombo
Colombo
Meethotamulla
Colombo
Puttalam
Matara
Matara
Colombo
Colombo
Colombo
Galle
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Meethotamulla
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Colombo
Meethotamulla
Colombo
Meethotamulla
Colombo
Colombo
Puttalam
Meethotamulla
Colombo
Colombo
Meethotamulla
Colombo
Meethotamulla
Colombo
C