In [2]:
import re

In [3]:
import pandas as pd

In [4]:
import csv

In [5]:
# read csv containing Dengue articles to a dataframe
df = pd.read_csv('TestDengueContent.csv')

In [6]:
# Fill missing values
df = df.fillna("")

In [7]:
print(df.shape)

(479, 4)


In [8]:
# drops duplicate entries
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [9]:
print(df.shape)

(478, 4)


In [10]:
# Regex for publication date ####-##-##
regex_for_publication_date = r"\d{4}-\d{2}-\d{2}"

In [11]:
# a number following 'dengue' or 'cases' or 'suspected dengue cases' or 'suspected cases'
regex_for_cases = re.compile(r'\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue\scases|dengue\sfever\scases|dengue|cases|suspected\sdengue\scases|suspected\scases)', re.IGNORECASE)

In [12]:
# regex for deaths or patients(TODO move patients to no.of cases)
regex_for_deaths = re.compile(r"""
(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|
(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|
(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))
\s*
(?:deaths|death|dead(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead)|patients|dengue\srelated\deaths)
""", re.VERBOSE | re.IGNORECASE)

In [13]:
# regex to extract sentences having no. of cases/deaths
# TODO handle for "i.e" 
regex_for_sentences = re.compile(r"[^.]*?(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue|cases|suspected\sdengue\scases|suspected\scases)|(?:(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))\s*(?:deaths|death|dead(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead)|patients)))[^.]*\.")

In [14]:
# regex for dates
regex_for_dates = re.compile(r"""(?:\d{4}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{1,2})|                                                                       # dates with format xxxx.xx.xx or xxxx/xx/xx or xxxx-xx-xx (month and date: 1 or 2 digits)
(?:\d{1,2}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{2,4})|                                                                                                      # dates with format xx.xx.xx where . or / or - used as the seperator, 1 or 2 digits for date or month, 2 or 4 digits for year
(?:(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s*\d{0,2})     # dates with format: month xx (date with 0-2 digits)
""", re.VERBOSE)

In [15]:
# regex_for_years 20xx
regex_for_years = re.compile(r'\b20(?:\d{2}){1}\b|this\syear')

In [16]:
# regex for months
regex_for_months = re.compile(r'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)')

In [17]:
# regex for "so far"
regex_for_so_far = re.compile(r'so\sfar')

In [18]:
# regex for districts
regex_for_districts = re.compile(r'Jaffna|Kilinochchi|Mannar|Mullaitivu|Vavuniya|Puttalam|Kurunegala|Gampaha|Colombo|Kalutara|Anuradhapura|Polonnaruwa|Matale|Kandy|Nuwara\sEliya|Kegalle|Ratnapura|Trincomalee|Trinco|Batticaloa|Ampara|Badulla|Monaragala|Hambantota|Matara|Galle', re.IGNORECASE)

In [19]:
# regex for provinces
regex_for_provinces = re.compile(r'Western|Central|Eastern|North\sCentral|Northern|North\sWestern|Sabaragamuwa|Southern|Uva')

In [20]:
# regex for other places/locations
regex_for_places = re.compile(r'Sri Lanka|island|country|countrywide|Meethotamulla|Negombo|Ratmalana|Dangolla|Elpitiya|Beliatte|Hatton|Marakolliya|Gampola|Kalubowila')

In [21]:
def get_sentences(text):
    return regex_for_sentences.findall(text)

In [22]:
def get_published_date(text):
    return re.search(regex_for_publication_date, text).group()

In [23]:
def get_cases(text):
    return regex_for_cases.findall(text)

In [24]:
def get_deaths(text):
    return regex_for_deaths.findall(text)       

In [25]:
def get_years(text):
    return regex_for_years.findall(text)

In [26]:
def get_months(text):
    return regex_for_months.findall(text)

In [27]:
def get_dates(text):
    return regex_for_dates.findall(text)

In [28]:
def get_so_far(text):
    return regex_for_so_far.findall(text)

In [29]:
def get_provinces(text):
    return regex_for_provinces.findall(text)

In [30]:
def get_districts(text):
    return regex_for_districts.findall(text)

In [31]:
def get_places(text):
    return regex_for_places.findall(text)

In [32]:
locations=["Meethotamulla","Colombo", "kandy","Anuradhapura","Galle","Matara","Ratmalana","Dangolla","Badulla","Elpitiya","Beliatte","Hatton","Marakolliya","Gampola","Kalubowila","Puttalam",]

In [33]:
for index, row in df.iterrows():
    print("Index: "+str(index))
#     print("Publication date: "+get_published_date(row['Date']))
    sentences = get_sentences(row['Article_content'])
#     print(sentences)
    for sentence in sentences:
        print(sentence)
#         get_data(sentence)
        
        
        print(str(index))
        print(get_published_date(row['Date']))
#         print(sentence.encode("utf-8"))
#         print(get_numbers(sentence))
        print(get_cases(sentence))
        print(get_deaths(sentence))
        print(get_years(sentence))
        print(get_months(sentence))
        print(get_dates(sentence))
        print(get_so_far(sentence))
        print(get_provinces(sentence))
        print(get_districts(sentence))
        print(get_places(sentence))
           
        
    print()

Index: 0

Index: 1

Index: 2

Index: 3

Index: 4

Index: 5

Index: 6

Index: 7

Index: 8

Index: 9

Index: 10

Index: 11

Index: 12

Index: 13

Index: 14

Index: 15

Index: 16

Index: 17

Index: 18

Index: 19
Enduring support of the respective authorities and positive contribution of the UA staff members across the branch network has led Union Manushyathwaya to successfully conduct over 50 dengue awareness programmes, over 100 thalassemia awareness programmes and 45 diabetes awareness programmes.
19
2018-02-21
['50 dengue']
[]
[]
[]
[]
[]
[]
[]
[]

Index: 20
� �Haemophilia�This is a very rare disease, with only around 2000 patients being present in all of Sri Lanka.
20
2018-02-09
[]
['2000 patients']
['2000']
[]
[]
[]
[]
[]
['Sri Lanka']

Index: 21

Index: 22

Index: 23

Index: 24

Index: 25

Index: 26

Index: 27
The intermittent rains experienced in many parts of the country right now has increased the threat of a dengue outbreak, while the number of dengue cases reported in the first

['4,500 patients', '12 deaths']
[]
['Jan', 'Feb', 'March', 'April', 'May', 'June', 'July']
['Jan ', 'Feb ', 'March ', 'April ', 'May ', 'June ', 'July']
['so far']
[]
[]
['Negombo']

Index: 121

Index: 122

Index: 123

Index: 124

Index: 125

Index: 126
 �We know the virus has taken a heavy toll in Sri Lanka with close to a 100,000 infected and 250 deaths and we are committed to helping Sri Lanka eradicate dengue once and for all,� Bishop said.
126
2017-07-20
[]
['250 deaths']
[]
[]
[]
[]
[]
[]
['Sri Lanka', 'Sri Lanka']

Index: 127

Index: 128
��The Health Ministry has warned that dengue has now entered a pandemic stage as the resurgence of the mosquito-borne menace has killed over 240 people and affected 80,000 so far in this year compared to a total of 51,000 cases reported in 2016.
128
2017-07-20
['51,000 cases']
[]
['this year', '2016']
[]
[]
['so far']
[]
[]
[]

Index: 129

Index: 130

Index: 131

Index: 132

Index: 133

Index: 134

Index: 135

Index: 136

Index: 137

Index: 138


[]
[]

Index: 205

Index: 206
 �As per statistics collected by the Epidemiology Unit, Ministry of Health Sri Lanka, it is �alarming that during the last 5 months of the year 2017, 52015 suspected dengue cases were reported from all over the island.
206
2017-06-05
['2017, 52015 suspected dengue cases']
[]
['2017']
[]
[]
[]
[]
[]
['Sri Lanka', 'island']

Index: 207

Index: 208

Index: 209

Index: 210

Index: 211
 Though there are six main wards in the Prison with nearly 300 patients, there are only three nursing officers.
211
2017-05-30
[]
['300 patients']
[]
[]
[]
[]
[]
[]
[]

Index: 212
Health Minister Rajitha Senaratne said the US Centre for Disease Control had so far sent out 900 dengue alerts globally with 400 alerts had sent out to countries in the Asian region.
212
2017-05-27
['900 dengue']
[]
[]
[]
[]
['so far']
[]
[]
[]
 Gunawardene had said there were 45,000 dengue patients in Sri Lanka up to now.
212
2017-05-27
['45,000 dengue']
[]
[]
[]
[]
[]
[]
[]
['Sri Lanka']

Index: 213



Index: 280

Index: 281

Index: 282

Index: 283

Index: 284

Index: 285

Index: 286

Index: 287

Index: 288

Index: 289

Index: 290

Index: 291

Index: 292

Index: 293

Index: 294

Index: 295
 The maternal mortality rate, which stood at 92 deaths per 100,000 live births in 1990, plummeted to 33 by 2010.
295
2015-09-30
[]
['92 deaths']
['2010']
[]
[]
[]
[]
[]
[]
 Old diseases like tuberculosis (8,000 cases per year) stubbornly persist.
295
2015-09-30
['8,000 cases']
[]
[]
[]
[]
[]
[]
[]
[]

Index: 296

Index: 297
 The first nine months of this year recorded 20,058 cases; last year, a peak year for dengue, saw close to 29,000 cases over the same period, according to Health Ministrydata.
297
2015-09-23
['20,058 cases', '29,000 cases']
[]
['this year']
[]
[]
[]
[]
[]
[]

Index: 298

Index: 299

Index: 300

Index: 301

Index: 302

Index: 303

Index: 304

Index: 305

Index: 306

Index: 307

Index: 308

Index: 309

Index: 310

Index: 311

Index: 312

Index: 313

Index: 314

Index: 315
 In 2014

Index: 443
 Referring to the situation in the school where 20 dengue cases were reported, he said it was found that stagnated water in the basement of the school auditorium may have caused it.
443
2010-08-08
['20 dengue cases']
[]
[]
[]
[]
[]
[]
[]
[]

Index: 444

Index: 445

Index: 446

Index: 447

Index: 448

Index: 449

Index: 450

Index: 451

Index: 452

Index: 453

Index: 454

Index: 455

Index: 456

Index: 457

Index: 458

Index: 459

Index: 460
 		This year so far, 158 dengue related deaths have been reported while more than 21,000 dengue cases have been reported countrywide.
460
2010-07-21
['158 dengue', '21,000 dengue cases']
[]
[]
[]
[]
['so far']
[]
[]
['country']

Index: 461

Index: 462
Papaya leaf juice mixed with honey will increase the blood platelet level of dengue patients, claimed a doctor who had treated 20 patients�using this method.
462
2010-07-18
[]
['20 patients']
[]
[]
[]
[]
[]
[]
[]
 Ameen, a doctor at the Welipitiya ayurveda hospital has treated 20 patients us