In [1]:
import urllib.request, json

# Download the "Raw_Data" storage from the Stein API and parse it as JSON.
RAW_DATA_URL = "https://api.steinhq.com/v1/storages/5e736c1db88d3d04ae0815b3/Raw_Data"
with urllib.request.urlopen(RAW_DATA_URL) as url:
    raw_payload = url.read()
# bytes.decode() with no argument decodes as UTF-8.
data = json.loads(raw_payload.decode())
    

In [2]:
import pandas as pd
import re
from tqdm import tqdm
# Register tqdm's pandas integration so Series gain `progress_apply`,
# which find_travelled uses to show a progress bar per extractor.
tqdm.pandas()

  from pandas import Panel


In [3]:
# Load the parsed JSON payload into a DataFrame; later cells read its "Notes" column.
df = pd.DataFrame(data)

In [4]:
# Preview a few rows (30-34) of the free-text "Notes" column that the extractors parse.
df["Notes"][30:35]

30               Travelled from Thailand and Malaysia
31                                Travelled from Iran
32                                Travelled from Iran
33                                Travelled from Oman
34    Travelled from Italy on 29/02/2020 through Doha
Name: Notes, dtype: object

In [5]:
import spacy
# Load spaCy's small English pipeline; the getters in this notebook key off
# its entity labels ("GPE" for places, "NORP" for nationalities).
nlp = spacy.load("en_core_web_sm")

In [6]:
from spacy.tokens import Span

def get_travel_status(span):
    """Classify a place entity as a travel origin or a travel destination.

    Used as the getter of the ``travel_status`` Span extension.

    Args:
        span: An entity span exposing ``label_``, ``start`` (index of its
            first token) and ``doc`` (indexable by token position).

    Returns:
        ``"from"`` when the token immediately before the span is "from",
        "through" or "via" (case-insensitive); ``"to"`` for every other GPE
        span, including one that opens the text; ``None`` when the span is
        not labelled "GPE".
    """
    if span.label_ != "GPE":
        return None
    # A span at position 0 has no preceding token. Indexing with -1 would
    # wrap around to the LAST token of the doc and could mislabel the place.
    if span.start == 0:
        return "to"
    preceding = span.doc[span.start - 1].text.lower()
    if preceding in ("from", "through", "via"):
        return "from"
    # Any other context ("to", "and", ...) is treated as a destination.
    return "to"

# Register the Span extension as 'travel_status' (read as `span._.travel_status`).
# force=True overwrites an earlier registration, so re-running this cell does not raise.
Span.set_extension("travel_status", getter=get_travel_status, force=True)

In [7]:
from spacy.tokens import Span, Token

def get_nat(span):
    """Getter for the ``nationality`` Span extension.

    Returns the span's text when the span is labelled "NORP"; otherwise
    returns ``None``.
    """
    is_norp = span.label_ == "NORP"
    return span.text if is_norp else None

# Register the Span extension as 'nationality' (read as `span._.nationality`).
# force=True overwrites an earlier registration, so re-running this cell does not raise.
Span.set_extension("nationality", getter=get_nat, force=True)

# Quick check: print each entity's text next to its nationality value.
doc = nlp("Indian Tourist")
print([(ent.text, ent._.nationality) for ent in doc.ents])

[('Indian', 'Indian')]


In [8]:
from spacy.matcher import Matcher

In [9]:
def get_rel(token):
    """Derive the relationship phrase that precedes the word "of".

    Used as the getter of the ``relationship`` Token extension. Only a token
    whose text is exactly "of" can yield a value.

    Rules, applied to the tokens immediately before "of":
      * "<a> and <b> of" -> "<a> and <b>" (needs three tokens before "of")
      * "member of" / "members of" (any case) -> "Family Member"
      * "<word> of" -> "<word>"

    Args:
        token: A token exposing ``text``, ``i`` (its index in the doc) and
            ``doc`` (indexable by token position).

    Returns:
        The relationship string, or ``None`` when the token is not "of" or
        when "of" is the first token (nothing precedes it).
    """
    if token.text != "of":
        return None
    idx, doc = token.i, token.doc
    # With "of" at position 0, doc[idx - 1] would be doc[-1], i.e. the LAST
    # token of the text, which is not a relationship word.
    if idx == 0:
        return None
    # "friend and family of ..." -> keep the whole three-token phrase.
    if idx > 2 and doc[idx - 2].text.lower() == "and":
        return " ".join(doc[pos].text for pos in range(idx - 3, idx))
    previous_text = doc[idx - 1].text
    if previous_text.lower() in ("members", "member"):
        return "Family Member"
    return previous_text


# Register the Token extension as 'relationship' (read as `token._.relationship`).
# force=True overwrites an earlier registration, so re-running this cell does not raise.
Token.set_extension("relationship", getter=get_rel, force=True)

# Quick check: iterate over the tokens of the doc (not entities) and print
# each token's text next to its relationship value.
doc = nlp("friend and family of p23")
print([(ent.text, ent._.relationship) for ent in doc])

[('friend', None), ('and', None), ('family', None), ('of', 'friend and family'), ('p23', None)]


In [10]:
def extract_relationship(sent):
    """Return the first relationship phrase found in a note.

    Every character that is neither a word character nor whitespace is
    replaced by a space before the text is parsed with ``nlp``.

    Args:
        sent: The note text.

    Returns:
        ``[]`` when ``sent`` is falsy; otherwise the first truthy
        ``relationship`` value among the parsed tokens, or ``None`` when no
        token carries one.
    """
    if not sent:
        return []
    cleaned = re.sub(r'[^\w\s]', ' ', sent)
    candidates = (tok._.relationship for tok in nlp(cleaned))
    return next((rel for rel in candidates if rel), None)

In [11]:
def extract_travel_place(sent):
    """List the text of every entity in a note that has a travel status.

    Every character that is neither a word character nor whitespace is
    replaced by a space before the text is parsed with ``nlp``.

    Args:
        sent: The note text.

    Returns:
        A list of entity texts whose ``travel_status`` extension is truthy,
        in document order; ``[]`` when ``sent`` is falsy.
    """
    if not sent:
        return []
    cleaned = re.sub(r'[^\w\s]', ' ', sent)
    return [ent.text for ent in nlp(cleaned).ents if ent._.travel_status]

In [12]:
def extract_nationality(sent):
    """Collect the nationality values of the entities found in a note.

    Every character that is neither a word character nor whitespace is
    replaced by a space before the text is parsed with ``nlp``.

    Args:
        sent: The note text.

    Returns:
        A list of the truthy ``nationality`` extension values of the parsed
        entities, in document order; ``[]`` when ``sent`` is falsy.
    """
    if not sent:
        return []
    cleaned = re.sub(r'[^\w\s]', ' ', sent)
    values = (ent._.nationality for ent in nlp(cleaned).ents)
    # filter(None, ...) keeps only truthy values.
    return list(filter(None, values))

In [13]:
import urllib.request, json

# Download the state/city JSON file; the next cell iterates it as a mapping.
STATE_CITY_URL = "https://raw.githubusercontent.com/bhanuc/indian-list/master/state-city.json"
with urllib.request.urlopen(STATE_CITY_URL) as url:
    state_city_payload = url.read()
# bytes.decode() with no argument decodes as UTF-8.
state_city = json.loads(state_city_payload.decode())

    

In [14]:
# Flat list of names that extract_foreign treats as domestic: two seed
# entries plus every key of `state_city` followed by that key's values.
l = ["India", "Mumbai"]
for state, cities in state_city.items():
    l.append(state)
    # extend() appends in place; `l = l + cities` rebuilt the whole list on
    # every iteration (quadratic) for the same end result.
    l.extend(cities)

In [15]:
# Remove every "*" character from each name so plain entity text can match it.
# Presumably the source list marks some entries with asterisks — TODO confirm.
l= [ele.replace("*", "") for ele in l]

In [16]:
def extract_foreign(sent):
    """Flag each place entity in a note as foreign or not.

    Every character that is neither a word character nor whitespace is
    replaced by a space before the text is parsed with ``nlp``.

    Args:
        sent: The note text.

    Returns:
        One boolean per "GPE" entity, in document order: ``True`` when the
        entity text is absent from the module-level list ``l``, ``False``
        when it is present. ``[]`` when ``sent`` is falsy.
    """
    if not sent:
        return []
    cleaned = re.sub(r'[^\w\s]', ' ', sent)
    return [ent.text not in l for ent in nlp(cleaned).ents if ent.label_ == "GPE"]

# Smoke test: the result holds one flag per entity labelled GPE, so words
# that are not recognised as places contribute no entry.
extract_foreign("India, China Italy, Japan, Pune, 1989 mountains Apple Meghana")

[False, True, True, True, False]

In [17]:
import re
def find_travelled(data):
    """Add the four extracted columns to a copy of the input frame.

    Each extractor is applied to the "Notes" column with ``progress_apply``,
    which is only available after ``tqdm.pandas()`` has been called.

    Args:
        data: A DataFrame with a "Notes" column. It is not modified.

    Returns:
        A copy of ``data`` with the columns "Relationship", "Travel Place",
        "Nationality" and "is_foreign" added, in that order.
    """
    column_extractors = (
        ("Relationship", extract_relationship),
        ("Travel Place", extract_travel_place),
        ("Nationality", extract_nationality),
        ("is_foreign", extract_foreign),
    )
    enriched = data.copy()
    notes = enriched["Notes"]
    for column, extractor in column_extractors:
        enriched[column] = notes.progress_apply(extractor)
    return enriched
    

In [18]:
# Run the four extractors over the notes and write the enriched table to
# "rel.csv" (a relative path, so it lands in the current working directory).
find_travelled(df).to_csv("rel.csv")

100%|██████████| 1040/1040 [00:03<00:00, 327.28it/s]
100%|██████████| 1040/1040 [00:03<00:00, 330.57it/s]
100%|██████████| 1040/1040 [00:03<00:00, 334.87it/s]
100%|██████████| 1040/1040 [00:03<00:00, 338.28it/s]
