In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
from textacy import preprocessing
import pandas as pd
import numpy as np
import re

In [4]:
drivers = pd.read_excel("drivers_of_change.xlsx")

In [5]:
statement = """Ms. Shinen explained that her views would be biased towards water, given that she is a member of the DOS’s Water Team. With respect to water, she felt that the major challenges in the Mekong were the very rapid pace of development; hydropower development; and a lack of coordination. She also raised concern around the profusion of cooperation platforms in the region which, she said, are a hindrance to effective cooperation and coordination between the Mekong countries.

Water issues are political issues as is natural resource management, typically decided at high level. There are major challenges all around natural resources management in the basin. These have, she argued, been politicised, and there is currently no real development path going forwards. As a consequence, Ms. Shinen felt that the introduction of a new cooperation mechanism was not the answer. She suggested that WB make a detailed investigation of existing mechanisms to see what does or does not work, and then use its leverage to strengthen or improve these. There is lack of coherent coordination between cooperative mechanisms. 

Stronger leadership is required, along with coordination from development partners (DPs), rather than creating a new platform. DOS, she said, monitors events in the Mekong through a political lens, looking for what they call ‘conservation crimes’. Actor consultation is also determined through a political lens. She summarised the DOS’s engagement with a variety of international NGOs, including IUCN, WWF and Conservation International. ACMEC, the MRC and Friends of the Mekong and an ASEAN may be the best options and warrant further examination."""

          
statement = re.sub("\n"," ",statement)
statement = preprocessing.normalize_whitespace(statement)
statement

'Ms. Shinen explained that her views would be biased towards water, given that she is a member of the DOS’s Water Team. With respect to water, she felt that the major challenges in the Mekong were the very rapid pace of development; hydropower development; and a lack of coordination. She also raised concern around the profusion of cooperation platforms in the region which, she said, are a hindrance to effective cooperation and coordination between the Mekong countries. Water issues are political issues as is natural resource management, typically decided at high level. There are major challenges all around natural resources management in the basin. These have, she argued, been politicised, and there is currently no real development path going forwards. As a consequence, Ms. Shinen felt that the introduction of a new cooperation mechanism was not the answer. She suggested that WB make a detailed investigation of existing mechanisms to see what does or does not work, and then use its lev

In [6]:
doc = nlp(statement)

In [7]:
import pandas as pd
pst = pd.DataFrame([[w.text,w.tag_,w.pos_, w.dep_] for w in  doc])
pst.columns = ['Text', "Part_of_Speech", "Speech", "Dependency"]
pst

Unnamed: 0,Text,Part_of_Speech,Speech,Dependency
0,Ms.,NNP,PROPN,compound
1,Shinen,NNP,PROPN,nsubj
2,explained,VBD,VERB,ROOT
3,that,IN,SCONJ,mark
4,her,PRP$,DET,poss
...,...,...,...,...
293,and,CC,CCONJ,cc
294,warrant,VBP,VERB,conj
295,further,JJ,ADJ,amod
296,examination,NN,NOUN,dobj


In [8]:
from spacy import displacy

displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [9]:
ent_list = []
for ent in doc.ents:
    ent_list.append(ent.text)

In [10]:
ent_list

['Shinen',
 'DOS’s Water Team',
 'Mekong',
 'Mekong',
 'Shinen',
 'WB',
 'DOS',
 'Mekong',
 'DOS',
 'IUCN',
 'WWF',
 'MRC',
 'Friends of the Mekong',
 'ASEAN']

In [11]:
sentences = []
possible_challenges = []
for sent in doc.sents:
    sentences.append(sent.text)
    for token in sent:
        if token.pos_  == "NOUN":
            possible_challenges.append(token.text)
        if token.pos_ == "PROPN":
            continue 
        if (token.text == "water") | (token.text == "region") :
            continue
        else:
            continue

In [12]:
sentences = []
possible_challenges1 = []
for sent in doc.sents:
    sentences.append(sent.text)
    for chunk in sent.noun_chunks:
        if chunk.root.text in ent_list:
            continue
        if chunk.root.pos_ == "PROPN":
            continue
        if chunk.root.pos_ == "PRON":
            continue
        if chunk.root.dep_ == "nsubjpass":
            continue
        if chunk in possible_challenges1:
            continue
        if chunk.root.text in ["region", "water","challenges"]:
            continue
        else:
            possible_challenges1.append(chunk)


In [13]:
possible_challenges1

[a member,
 respect,
 the very rapid pace,
 development,
 hydropower development,
 a lack,
 coordination,
 concern,
 the profusion,
 cooperation platforms,
 a hindrance,
 cooperation,
 coordination,
 the Mekong countries,
 Water issues,
 political issues,
 natural resource management,
 high level,
 natural resources management,
 the basin,
 no real development path,
 a consequence,
 the introduction,
 a new cooperation mechanism,
 the answer,
 a detailed investigation,
 existing mechanisms,
 its leverage,
 lack,
 coherent coordination,
 cooperative mechanisms,
 coordination,
 development partners,
 a new platform,
 a political lens,
 a political lens,
 engagement,
 a variety,
 the best options,
 further examination]

In [14]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Shinen 4 10 PERSON
DOS’s Water Team 101 117 ORG
Mekong 184 190 GPE
Mekong 455 461 GPE
Shinen 784 790 PERSON
WB 888 890 ORG
DOS 1235 1238 ORG
Mekong 1273 1279 GPE
DOS 1439 1442 ORG
IUCN 1504 1508 ORG
WWF 1510 1513 ORG
MRC 1557 1560 ORG
Friends of the Mekong 1565 1586 ORG
ASEAN 1594 1599 ORG


In [16]:
drivers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Drivers      49 non-null     object
 1   Definitions  49 non-null     object
dtypes: object(2)
memory usage: 912.0+ bytes


In [17]:
drivers['Drivers'].to_list()

['River bank erosion ',
 'River flow (velocity) ',
 'Agro-chemical use ',
 'Change in sediment load  ',
 'Change in wetlands and floodplains ',
 'Endangered fish species ',
 'Forest loss ',
 'Waste water release ',
 'Water quality ',
 'Water diversion for agriculture ',
 'Increase in legal and illegal fishing',
 'Wildlife conservation',
 'Climate Change',
 'Hydropower development',
 'Climate change adaptation',
 'Education ',
 'Climate change adaptation',
 'Gender equity and parity ',
 'Water resource management planning ',
 'Nutritional food security',
 'Urban planning ',
 'More local (& youth) participation in decisions',
 'Public health risk',
 'Community based organisations',
 'Human rights',
 'Local migration',
 'Agricultural production ',
 'Economic growth ',
 'Increasing household income ',
 'Increased mono-plantations ',
 'Mining concession increase',
 'Local employment increase ',
 'More public/foreign investment',
 'Production for export and trade ',
 'Tourism services and si