In [1]:
import re

In [2]:
import pandas as pd

In [3]:
import csv

In [4]:
# Read the CSV containing the Dengue articles into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
df = pd.read_csv('new_articles.csv')

In [5]:
# Replace every missing value with an empty string
# (presumably so the regex helpers below always receive str, never NaN).
df = df.fillna("")

In [6]:
# (rows, columns) before duplicate rows are dropped
print(df.shape)

(30, 3)


In [7]:
# Remove rows that are exact duplicates across all columns, keeping the first occurrence
# (the defaults of drop_duplicates: subset=None, keep='first').
df = df.drop_duplicates()

In [8]:
# (rows, columns) after duplicate rows were dropped
print(df.shape)

(30, 3)


In [9]:
# Pattern for the publication date written as ####-##-## (4 digits, 2 digits, 2 digits;
# presumably year-month-day). Kept as a plain pattern string rather than a compiled
# pattern; get_published_date passes it to re.search.
regex_for_publication_date = r"\d{4}-\d{2}-\d{2}"

In [10]:
# Numbers: one or more digit groups joined by a comma and/or a whitespace character,
# e.g. "35,008" or "186 000".
regex_for_numbers = re.compile(
    r'\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*'
)

In [11]:
# A number (digit groups optionally joined by commas/whitespace) followed by a case phrase:
# 'cases', 'dengue cases', 'dengue fever cases', 'suspected cases',
# 'suspected dengue cases' or 'suspected dengue fever cases'. Matching is case-insensitive.
regex_for_cases = re.compile(r'\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue\scases|dengue\sfever\scases|cases|suspected\sdengue\scases|suspected\scases|suspected\sdengue\sfever\scases)', re.IGNORECASE)

In [12]:
# Regex for death counts: a number written in digits (optionally comma/whitespace grouped)
# or in words ('one' .. 'nineteen', or a tens word with an optional unit word), followed by
# a death phrase: 'deaths', 'death', 'dead', '[people|patients] [have|had] died/dead'
# or 'dengue related death'.
# The bare word 'patients' is also accepted here (TODO: move patients to no. of cases).
# Compiled with re.VERBOSE, so whitespace and newlines inside the pattern are ignored,
# and with re.IGNORECASE.
regex_for_deaths = re.compile(r"""
(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|
(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|
(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))
\s*
(?:deaths|death|dead\b|(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead\b)|patients|dengue\srelated\sdeath)

""", re.VERBOSE | re.IGNORECASE)

In [13]:
# Regex to extract sentences that mention a number of cases or deaths.
# A "sentence" is a run of non-period characters that contains a cases/deaths match and
# ends at the next '.'; the embedded alternatives repeat the cases and deaths patterns.
# This pattern is compiled without re.IGNORECASE, so it is case-sensitive.
# TODO: handle abbreviations such as "i.e." (every '.' is treated as a sentence end).
regex_for_sentences = re.compile(r"[^.]*?(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue\scases|dengue\sfever\scases|cases|suspected\sdengue\scases|suspected\scases|suspected\sdengue\sfever\scases)|(?:(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))\s*(?:deaths|death|dead\b|(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead\b)|patients|dengue\srelated\sdeath)))[^.]*\.")

In [14]:
# Regex for dates. Three alternatives, tried in this order:
#   1. 4 digits, then two 1-2 digit groups, separated by '.', '/' or '-'
#   2. two 1-2 digit groups, then 2-4 digits, separated by '.', '/' or '-'
#   3. a month name or abbreviation, optionally followed by whitespace and up to 2 digits
# Compiled with re.VERBOSE, so the '#' comments and the padding inside the pattern are ignored.
regex_for_dates = re.compile(r"""(?:\d{4}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{1,2})|                                                                       # dates with format xxxx.xx.xx or xxxx/xx/xx or xxxx-xx-xx (month and date: 1 or 2 digits)
(?:\d{1,2}(?:\.|/|-)\d{1,2}(?:\.|/|-)\d{2,4})|                                                                                                      # dates with format xx.xx.xx where . or / or - used as the seperator, 1 or 2 digits for date or month, 2 or 4 digits for year
(?:(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s*\d{0,2})     # dates with format: month xx (date with 0-2 digits)
""", re.VERBOSE)

In [15]:
# Years written as 20xx (a whole word: '20' followed by exactly two digits),
# or the literal phrase "this year".
# NOTE(review): the {1} quantifier is redundant; the group already matches exactly once.
regex_for_years = re.compile(r'\b20(?:\d{2}){1}\b|this\syear')

In [16]:
# Regex for month names, full or abbreviated (Jan/January ... Dec/December).
# Matching is case-sensitive and not anchored to word boundaries, so a capitalised
# word that merely starts with an abbreviation (e.g. "Market") also yields a match.
# The 'ember' suffix of December is optional, like every other abbreviation here and
# like the month alternative in regex_for_dates; previously a bare "Dec" was never matched.
regex_for_months = re.compile(r'Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|April|May|June|July|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?')

In [17]:
# Regex for the phrase "so far" (exactly one whitespace character between the two words,
# case-sensitive).
regex_for_so_far = re.compile(r'so\sfar')

In [18]:
# Regex for district names (presumably the districts of Sri Lanka), matched
# case-insensitively. 'Trinco' is accepted as a short form; it is listed after
# 'Trincomalee' so the full name is preferred when present.
regex_for_districts = re.compile(r'Jaffna|Kilinochchi|Mannar|Mullaitivu|Vavuniya|Puttalam|Kurunegala|Gampaha|Colombo|Kalutara|Anuradhapura|Polonnaruwa|Matale|Kandy|Nuwara\sEliya|Kegalle|Ratnapura|Trincomalee|Trinco|Batticaloa|Ampara|Badulla|Monaragala|Hambantota|Matara|Galle', re.IGNORECASE)

In [19]:
# Regex for province names. Compiled without re.IGNORECASE, so matching is case-sensitive
# (unlike the districts pattern).
regex_for_provinces = re.compile(r'Western|Central|Eastern|North\sCentral|Northern|North\sWestern|Sabaragamuwa|Southern|Uva')

In [20]:
# Regex for other places/locations (country-level words and individual towns), case-sensitive.
# 'countrywide' must be listed before 'country': regex alternation takes the first
# alternative that matches, so with 'country' first the longer word could never be returned.
regex_for_places = re.compile(r'Sri Lanka|island|countrywide|country|Meethotamulla|Negombo|Ratmalana|Dangolla|Elpitiya|Beliatte|Hatton|Marakolliya|Gampola|Kalubowila')

In [21]:
def get_sentences(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_sentences``.

    The module-level pattern is looked up at call time, so whichever
    ``regex_for_sentences`` is currently bound in the notebook is used.
    """
    matched_sentences = regex_for_sentences.findall(text)
    return matched_sentences

In [22]:
def get_published_date(text):
    """Return the first substring of ``text`` matching ``regex_for_publication_date``.

    Returns an empty string when ``text`` contains no such date. Previously the
    missing match raised ``AttributeError`` ('NoneType' has no attribute 'group'),
    which aborted the whole extraction loop for a single empty or malformed date
    cell. The empty string mirrors the ``fillna("")`` convention used for missing
    values in the DataFrame.
    """
    match = re.search(regex_for_publication_date, text)
    if match is None:
        return ""
    return match.group()

In [23]:
def get_numbers(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_numbers``."""
    found_numbers = regex_for_numbers.findall(text)
    return found_numbers

In [24]:
def get_cases(text):
    """Return the list of "<number> ... cases" phrases that ``regex_for_cases`` finds in ``text``."""
    case_mentions = regex_for_cases.findall(text)
    return case_mentions

In [25]:
def get_deaths(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_deaths``."""
    death_mentions = regex_for_deaths.findall(text)
    return death_mentions

In [26]:
def get_years(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_years``."""
    year_mentions = regex_for_years.findall(text)
    return year_mentions

In [27]:
def get_months(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_months``."""
    month_mentions = regex_for_months.findall(text)
    return month_mentions

In [28]:
def get_dates(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_dates``."""
    date_mentions = regex_for_dates.findall(text)
    return date_mentions

In [29]:
def get_so_far(text):
    """Return one list entry per occurrence of the ``regex_for_so_far`` phrase in ``text``."""
    so_far_mentions = regex_for_so_far.findall(text)
    return so_far_mentions

In [30]:
def get_provinces(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_provinces``."""
    province_mentions = regex_for_provinces.findall(text)
    return province_mentions

In [31]:
def get_districts(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_districts``."""
    district_mentions = regex_for_districts.findall(text)
    return district_mentions

In [32]:
def get_places(text):
    """Return the list of substrings of ``text`` matched by ``regex_for_places``."""
    place_mentions = regex_for_places.findall(text)
    return place_mentions

In [33]:
# Place names to look for in article text (note: "kandy" is lower-case in this list).
locations = [
    "Meethotamulla",
    "Colombo",
    "kandy",
    "Anuradhapura",
    "Galle",
    "Matara",
    "Ratmalana",
    "Dangolla",
    "Badulla",
    "Elpitiya",
    "Beliatte",
    "Hatton",
    "Marakolliya",
    "Gampola",
    "Kalubowila",
    "Puttalam",
]

In [34]:
# Accumulator for the per-sentence records extracted from the articles.
articles = list()

In [35]:
# Rebinds regex_for_sentences, replacing the earlier definition. This version additionally
# accepts a number followed by the bare word 'dengue' (first alternative of the cases group),
# e.g. "186 000 dengue patients". get_sentences picks up this binding once the cell has run.
regex_for_sentences = re.compile(r"[^.]*?(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue|dengue\scases|dengue\sfever\scases|cases|suspected\sdengue\scases|suspected\scases|suspected\sdengue\sfever\scases)|(?:(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))\s*(?:deaths|death|dead\b|(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead\b)|patients|dengue\srelated\sdeath)))[^.]*\.")

In [36]:
# Alternative sentence pattern with a shorter list of case phrases and no
# 'dengue related death' alternative.
# NOTE(review): in the deaths group, 'dead' is immediately followed by the optional
# people/patients and have/had groups and a mandatory 'died|dead' -- there is no '|' after
# 'dead' as in regex_for_sentences, so that branch only matches text like "dead ... died".
# This looks like a missing '|'; confirm before using. In this notebook the pattern is only
# referenced from commented-out code.
regex_for_sentences1 = re.compile(r"[^.]*?(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*\s*(?:dengue|cases|suspected\sdengue\scases|suspected\scases)|(?:(?:(?:\d+(?:(?:,|\s|(?:,\s)|(?:\s,))\d+)*)|(?:one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|(?:(?:(?:twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety)\s*)(?:one|two|three|four|five|six|seven|eight|nine)?)))\s*(?:deaths|death|dead(?:people\s*|patients\s*)?(?:have\s*|had\s*)?(?:died|dead)|patients)))[^.]*\.")

In [38]:
# Quick visual check: for every article, print the list of sentences matched by
# regex_for_sentences, followed by an empty line.
for _, article_row in df.iterrows():
    matched = regex_for_sentences.findall(article_row['Article_Content'])
    print(matched)
    print()
    

[]

[]

[]

[]

['� �She elaborated on the status of dengue in our country in contrast to the preceding years, she said, �dengue has reduced significantly, from�150,000 (January- September 2017) to 38000 cases this year ( January- September 2018).', '�� �Speaking of dengue management in hospitals in our country she said, �Usually in our country the fatality rate of dengue is very low, last year we had 186 000 dengue patients but 440 deaths.']

[]

[]

[]

[]

[]

[]

[]

[]

[]

[]

[]

['2% in 2007) and with 35,008 cases reported in 2009 with 346 deaths, according to the Epidemiology Unit of the Health Ministry.']

[' In the year 2008 Sri Lanka reported the largest outbreak of leptospirosis with 7423 suspected cases and 204 deaths.', ' Subsequently, 4980 cases and 145 deaths were reported in 2009 and in 2010 the number of cases was 4553 with 121 dying.', ' Theshanthi Welivitiya said they had carried out plasma exchanges for more than 150 patients since 2016 with pulmonary haemorrhage 

In [45]:
# Build one record per matched sentence: the sentence itself plus every value the
# regex helpers extract from it. Records are collected in the 'articles' list.
articles = []
for index, row in df.iterrows():
    print("Index: " + str(index))
    for sentence in get_sentences(row['Article_Content']):
        print(sentence)
        record = {
            'Index': str(index),
            'Publication_date': get_published_date(row['Date']),
            # stored as UTF-8 encoded bytes
            'Sentence': sentence.encode("utf-8"),
            'Numbers': get_numbers(sentence),
            'Cases': get_cases(sentence),
            'Deaths': get_deaths(sentence),
            'Years': get_years(sentence),
            'Months': get_months(sentence),
            'Dates': get_dates(sentence),
            'Time_duration': get_so_far(sentence),
            'Provinces': get_provinces(sentence),
            'Districts': get_districts(sentence),
            'Places': get_places(sentence),
        }
        articles.append(record)
        
#     print()

Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
� �She elaborated on the status of dengue in our country in contrast to the preceding years, she said, �dengue has reduced significantly, from�150,000 (January- September 2017) to 38000 cases this year ( January- September 2018).
�� �Speaking of dengue management in hospitals in our country she said, �Usually in our country the fatality rate of dengue is very low, last year we had 186 000 dengue patients but 440 deaths.
Index: 5
Index: 6
Index: 7
Index: 8
Index: 9
Index: 10
Index: 11
Index: 12
Index: 13
Index: 14
Index: 15
Index: 16
2% in 2007) and with 35,008 cases reported in 2009 with 346 deaths, according to the Epidemiology Unit of the Health Ministry.
Index: 17
 In the year 2008 Sri Lanka reported the largest outbreak of leptospirosis with 7423 suspected cases and 204 deaths.
 Subsequently, 4980 cases and 145 deaths were reported in 2009 and in 2010 the number of cases was 4553 with 121 dying.
 Theshanthi Welivitiya said they had carri

In [46]:
# Write the records collected in the 'articles' list to a CSV file, one row per sentence.
fieldnames = ['Index', 'Publication_date', 'Sentence', 'Numbers', 'Cases', 'Deaths', 'Years', 'Months', 'Dates', 'Time_duration', 'Provinces', 'Districts', 'Places']

# newline='' lets the csv module control line endings (otherwise blank rows can appear on
# Windows); an explicit UTF-8 encoding makes the output independent of the platform default.
with open('Regex_output_new.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for article in articles:
        # Take exactly the declared columns, in the declared order.
        csv_row = {name: article[name] for name in fieldnames}
        # Sentences may have been stored as UTF-8 bytes; decode them so the CSV contains
        # the text itself rather than the b'...' repr of the bytes object.
        if isinstance(csv_row['Sentence'], bytes):
            csv_row['Sentence'] = csv_row['Sentence'].decode('utf-8')
        writer.writerow(csv_row)


In [37]:
# import re

# locations=["Meethotamulla","Colombo", "kandy","Anuradhapura","Galle","Matara","Ratmalana","Dangolla","Badulla","Elpitiya","Beliatte","Hatton","Marakolliya","Gampola","Kalubowila","Puttalam",]

# import pandas as pd
# import csv

# df = pd.read_csv('TestDengueContent.csv')
# df = df.fillna("")
# print(df.shape)

# for index, row in df.iterrows():
#         for line in (row):
#             i=0
#             while i<len(locations):
#                 if locations[i] in line:
#                     print(locations[i])
#                 i+=1

# # for index, row in df.iterrows():
# #         for line in (row):
# #             for location in locations:
# #                 if location in line:
# #                     print(location)


In [40]:
import nltk

In [41]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [42]:
# Sample paragraph for trying out NLTK sentence tokenization. It contains an "i.e."
# abbreviation, a period with no following space ("elsewhere.Recognizing") and a decimal
# number ("12.4").
data = "Contamination of the food chain and waterways continue to pose serious public health risks in many countries including Sri Lanka.		Moreover, the spread of unhealthy food habits, often propagated by big multinationals, has become a public health menace not just in developing countries but in the most developed parts of the world as well, i.e. America, Europe, Australasia and elsewhere.Recognizing the wider background to growing public health risks in different parts of the world, the WHO commissioned and publishedï¿½ several years ago a well-articulated report on Social Determinants of Health (SDH) . But 12.4 percent people died. "


In [43]:
# Split the sample into sentences with NLTK. In the recorded output the tokenizer ends a
# sentence right after "i.e." and does not split "elsewhere.Recognizing".
print(sent_tokenize(data))

['Contamination of the food chain and waterways continue to pose serious public health risks in many countries including Sri Lanka.', 'Moreover, the spread of unhealthy food habits, often propagated by big multinationals, has become a public health menace not just in developing countries but in the most developed parts of the world as well, i.e.', 'America, Europe, Australasia and elsewhere.Recognizing the wider background to growing public health risks in different parts of the world, the WHO commissioned and publishedï¿½ several years ago a well-articulated report on Social Determinants of Health (SDH) .', 'But 12.4 percent people died.']
