In [None]:
import requests
import pandas as pd
import re
from collections import OrderedDict
from googletrans import Translator
from mechanize import Browser
from nltk.chunk.regexp import RegexpParser
from nltk.tree import Tree
from nltk import pos_tag, word_tokenize
from gensim.models import KeyedVectors
from bs4 import BeautifulSoup

# Pre-trained word2vec vectors in the binary word2vec format; loaded once and
# used later for the predicate/topic similarity lookups.
WORD2VEC_PATH = "~/GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

# Wikidata item the article is generated for (Q668 is presumably India — confirm).
entityID = "Q668"


In [None]:
def makeSparqlRequest(query, timeout=60):
    """Run a SPARQL query against the Wikidata Query Service.

    Args:
        query: SPARQL query text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response (a dict; the rest of this notebook reads
        ``["results"]["bindings"]`` from it).

    Raises:
        requests.HTTPError: if the endpoint answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    url = 'https://query.wikidata.org/sparql'
    # Wikimedia asks clients to send a descriptive User-Agent; the default
    # python-requests one may be throttled or rejected.
    headers = {'User-Agent': 'hindi-wikidata-article-notebook/0.1 (python-requests)'}
    r = requests.get(
        url,
        params={'format': 'json', 'query': query},
        headers=headers,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of tripping over a non-JSON error page.
    r.raise_for_status()
    return r.json()

# SPARQL query listing the direct claims of `entityID`. For every claim it
# selects:
#   ?p      - English label of the property,
#   ?pHi    - Hindi label of the property,
#   ?pred   - the direct-claim predicate itself,
#   ?object - Hindi label of the claim's value.
# The trailing FILTER keeps only rows whose object label is tagged "hi".
# NOTE(review): the code consuming ?pHi treats a value starting with "P" as
# "no Hindi label available" — presumably the label service falls back to the
# property id; confirm against the Wikidata Query Service documentation.
spoQuery = """

SELECT ?p ?pHi ?pred ?object WHERE {
  wd:""" + entityID + """ ?pred ?o .

  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "hi" .
    ?o rdfs:label ?object .
  }
  ?prop wikibase:directClaim ?pred .
   SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en" .
    ?prop rdfs:label ?p .
  }
  
  ?propHi wikibase:directClaim ?pred .
   SERVICE wikibase:label {
    bd:serviceParam wikibase:language "hi" .
    ?propHi rdfs:label ?pHi .
  }
  FILTER(LANG(?object) = "hi")

}
"""

# Raw JSON response; rows are read from results["results"]["bindings"].
results = makeSparqlRequest(spoQuery)

In [None]:
# Look up the Hindi label of the entity; it is used as the subject of every
# generated sentence.
entityIDquery = """
SELECT * WHERE {
  wd:""" + entityID + """ rdfs:label ?label.
  FILTER(LANGMATCHES(LANG(?label), "hi"))
}
LIMIT 1
"""

results2 = makeSparqlRequest(entityIDquery)
labelBindings = results2["results"]["bindings"]
if not labelBindings:
    # Without this guard a missing label surfaces as a bare IndexError.
    raise ValueError("No Hindi label found on Wikidata for entity " + entityID)
entityName = labelBindings[0]["label"]["value"]
entityName

In [None]:
# Build one (subject, predicate, object) record per SPARQL result row.
triplets = []

# A single translator client is reused for all rows, and each English
# predicate is translated at most once (every translation is a network call).
translator = Translator()
translationCache = {}

for item in results['results']['bindings']:
    obj = item['object']['value']
    pred = item['p']["value"]
    predHi = item['pHi']["value"]
    # A value that is just a property id ("P31", "P1082", ...) means no Hindi
    # label came back, so fall back to machine-translating the English label.
    # Matching the full id pattern avoids misclassifying real labels that
    # merely start with "P", and does not crash on an empty label.
    if re.fullmatch(r"P\d+", predHi):
        if pred not in translationCache:
            translationCache[pred] = translator.translate(pred, dest="hi").text
        predHi = translationCache[pred]
    triplets.append(OrderedDict({
        'subject': entityName,
        'predicate': predHi,
        "predicateEn": pred,
        "propertyID": item['pred']["value"],
        'object': obj}))

# Tabular view of the triples for inspection.
df = pd.DataFrame(triplets)


In [None]:
def posTagHindi(text):
    """POS-tag Hindi text by submitting it to the taghindi web form.

    Args:
        text: Hindi text to tag.

    Returns:
        A list of ``(token, tag)`` tuples, pairing the whitespace-separated
        tokens of ``text`` with the tags scraped from the response page.
        ``zip`` truncates to the shorter sequence, so if the page yields fewer
        tags than tokens the trailing tokens are dropped (and the list is
        empty when no tags are found).
    """
    br = Browser()
    br.open("http://taghindi.herokuapp.com/")
    # Select the first form on the page.
    br.select_form(nr=0)
    br["text"] = text
    response = br.submit().read()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no warning is emitted).
    soup = BeautifulSoup(response.decode("utf-8"), "html.parser")

    # The tags are read from the spans styled exactly "color:blue".
    tags = [el.text for el in soup.find_all('span', {'style': 'color:blue'})]
    taggedText = list(zip(text.split(), tags))

    return taggedText

# Distinct Hindi predicates, each mapped to its own (initially empty) list
# that will collect (object, section index) pairs.
predicates = list({record["predicate"] for record in triplets})
wikiDict = {name: list() for name in predicates}



In [None]:
def getSim(a,b):
    """Return the word2vec similarity of two words, or 0 if either is unknown.

    Args:
        a: First word (multi-word phrases are passed joined by "_").
        b: Second word.

    Returns:
        ``model.similarity(a, b)``, or ``0`` when a word is missing from the
        model's vocabulary.
    """
    try:
        return model.similarity(a, b)
    except KeyError:
        # Out-of-vocabulary word. Only this case is treated as "no
        # similarity"; a bare except would also hide interrupts and real bugs.
        return 0
    
def getInstances(text):
    """Extract the INSTANCE chunks (noun/adjective runs) from English text.

    The text is tokenized and POS-tagged with NLTK, chunked with the regexp
    grammar below, and every top-level chunk labelled ``INSTANCE`` is returned
    as a space-joined string of its tokens, in order of appearance.

    Args:
        text: English text to chunk.

    Returns:
        List of chunk strings; empty when nothing matches.
    """
    grammar = """
        PRE:   {<NNS|NNP|NN|NP|JJ|UH>+}
        INSTANCE:   {(<JJ+>)?<PRE>}
    """
    parsed = RegexpParser(grammar).parse(pos_tag(word_tokenize(text)))
    return [
        " ".join(token for token, _tag in node.leaves())
        for node in parsed
        if type(node) is Tree and node.label() == "INSTANCE"
    ]

# Assign each triple to an article section by comparing the last noun phrase
# of its English predicate with each topic word.
topics = ["history", "geography", "politics"]
for triplet in triplets:
    # 0 means "no topic matched"; topic k (0-based) maps to section k + 1.
    sectionIdx = 0
    instances = getInstances(triplet["predicateEn"])
    if instances:
        # Multi-word phrases are looked up in the model joined by "_".
        instance = instances[-1].replace(" ", "_")
        scores = [getSim(instance, topic) for topic in topics]
        # NOTE(review): this takes the FIRST topic whose similarity exceeds
        # 0.14, in list order — not the highest-scoring one. Confirm intent.
        sectionIdx = next(
            (pos + 1 for pos, score in enumerate(scores) if score > 0.14),
            0,
        )
    wikiDict[triplet["predicate"]].append((triplet["object"], sectionIdx))


In [None]:
# Assemble one Hindi sentence per predicate and append it to the section the
# predicate was assigned to.
article = ""
articleSections = ["", "इतिहास\n\n", "भूगोल\n\n", "राजनीति\n\n"]
for verb_str in wikiDict:
    entries = wikiDict[verb_str]
    if not entries:
        # Nothing was collected for this predicate; there is no sentence to build.
        continue

    taggedElem = posTagHindi(verb_str)
    # posTagHindi returns (token, tag) tuples, so the POS tag of the last word
    # is the second field of the last tuple. Comparing the tuple itself to a
    # tag string (as the code previously did) never matches. Empty tagger
    # output falls through to the default word order.
    lastTag = taggedElem[-1][1] if taggedElem else ""

    subject_str = entityName
    # The section recorded with the first object decides where the sentence goes.
    sectionNum = entries[0][1]
    objects = [el[0] for el in entries]
    if len(objects) > 1:
        object_str = ", ".join(objects[:-1]) + " और " + objects[-1]
    else:
        object_str = objects[0]

    # Word order depends on the tag of the predicate's last word.
    if lastTag == "VM":
        sentence = " ".join([subject_str, object_str, "का", verb_str])
    elif lastTag == "VAUX":
        sentence = " ".join([subject_str, object_str, "के साथ", verb_str])
    elif "NN" in lastTag or lastTag == "XC":
        # Substring test so that every NN* tag is covered.
        sentence = " ".join([subject_str, "का", verb_str,  object_str])
    else:
        sentence = " ".join([subject_str, object_str, verb_str])

    # Close the sentence with the copula unless it already ends with it.
    if sentence.split(" ")[-1] != "है":
        sentence += " है"

    sentence += "| "
    articleSections[sectionNum] += sentence

# NOTE(review): this drops the "भूगोल" section (index 2) together with any
# sentences assigned to it — confirm this is intentional.
del articleSections[2]


In [None]:
# Join the remaining sections and write the article to disk.
article = "\n\n\n\n".join(articleSections)
# The context manager closes the handle deterministically; explicit UTF-8
# keeps the Devanagari text writable regardless of the platform's default
# encoding.
with open("article.txt", "w", encoding="utf-8") as articleFile:
    articleFile.write(article)