## Preprocess data

In [122]:
import pandas as pd
import json

# Freebase relations this notebook works with, named once so the list and the
# mapping below cannot drift apart.
GENRE_ALBUMS = 'www.freebase.com/music/genre/albums'
EVENT_LOCATIONS = 'www.freebase.com/time/event/locations'
LOCATION_TIME_ZONES = 'www.freebase.com/location/location/time_zones'

# Relations kept when filtering the SimpleQuestions splits.
top_relations = [GENRE_ALBUMS, EVENT_LOCATIONS, LOCATION_TIME_ZONES]

# Freebase relation -> candidate DBpedia predicates (first entry is the one used later).
rel_mappings = {
    GENRE_ALBUMS: ['http://dbpedia.org/ontology/genre'],
    EVENT_LOCATIONS: ['http://dbpedia.org/ontology/place'],
    LOCATION_TIME_ZONES: [
        'http://dbpedia.org/ontology/timeZone',
        "http://dbpedia.org/property/timezone",
    ],
}

In [123]:
# Read the three SimpleQuestions splits as headerless, tab-separated tables.
train, test, val = (
    pd.read_csv(split_path, sep='\t', header=None)
    for split_path in (
        "../data/SimpleQuestions/annotated_fb_data_train.txt",
        "../data/SimpleQuestions/annotated_fb_data_test.txt",
        "../data/SimpleQuestions/annotated_fb_data_valid.txt",
    )
)

In [124]:
# Keep only the rows whose relation (column 1) is one of the selected relations.
# .copy() turns each filtered slice into an independent frame, so adding a column
# to it later cannot trigger pandas' SettingWithCopyWarning.
train = train[train[1].isin(top_relations)].copy()
test = test[test[1].isin(top_relations)].copy()
val = val[val[1].isin(top_relations)].copy()

print(train.shape, test.shape, val.shape)

(1256, 4) (376, 4) (182, 4)


## Train a classifier

In [125]:
import nltk

# Fetch the corpora used below; nltk reports "already up-to-date" when they are present.
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer
# Imported under an alias so the `stopwords` list defined below does not
# overwrite the corpus module it was loaded from.
from nltk.corpus import stopwords as stopwords_corpus
from nltk.stem import WordNetLemmatizer

# Splits text into runs of word characters, dropping punctuation.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
# English stop-word list consulted by preprocess_text(..., remove_stopwords=True).
stopwords = stopwords_corpus.words('english')

[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [126]:
def preprocess_text(text, remove_stopwords=False):
    """Normalise a question into a space-separated string of lowercase tokens.

    Characters with a code point of 128 or above are dropped, the remainder is
    lower-cased and split with the module-level ``tokenizer``. When
    ``remove_stopwords`` is true, tokens found in the module-level
    ``stopwords`` collection are discarded.

    Args:
        text: the raw question string.
        remove_stopwords: whether to filter out stop words (default: keep them).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Strip non-ASCII characters before lower-casing and tokenizing.
    ascii_only = "".join(filter(lambda ch: ord(ch) < 128, text))
    words = tokenizer.tokenize(ascii_only.lower())

    if remove_stopwords:
        words = [word for word in words if word not in stopwords]

    return ' '.join(words)

In [127]:
%%time

# Add a normalised 'clean text' column (stop words kept) built from the
# question text in column 3 of every split.
for split in (train, test, val):
    split['clean text'] = split[3].apply(preprocess_text)

CPU times: user 203 ms, sys: 85 µs, total: 203 ms
Wall time: 201 ms


In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Fixed seed, passed as random_state to the LogisticRegression classifier.
seed = 42

In [129]:
# Unigram TF-IDF features; terms appearing in fewer than 3 questions are ignored.
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
# NOTE(review): the vocabulary/IDF weights are fitted on train + test + val, so
# test-set text influences the features — consider fitting on train only.
vectorizer.fit(pd.concat([train, test, val])['clean text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [130]:
# Map relation strings (column 1) to integer class ids. The encoder is fitted on
# all three splits so every relation present in any split has an id.
encoder = LabelEncoder()
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
encoder.fit(pd.concat([train, test, val])[1])

LabelEncoder()

In [131]:
# Size of the vocabulary learned by the fitted vectorizer.
vocabulary_size = len(vectorizer.vocabulary_)
print("Total number of unique words in data = {0}".format(vocabulary_size))

Total number of unique words in data = 261


In [132]:
# Training features (TF-IDF of the cleaned questions) and encoded relation labels.
train_questions = train['clean text']
train_relations = train[1]
X_train = vectorizer.transform(train_questions)
y_train = encoder.transform(train_relations)

In [133]:
# Build the seeded model, then train it; fit() hands back the estimator that
# is used for prediction afterwards.
classifier = LogisticRegression(random_state=seed)
classifier = classifier.fit(X_train, y_train)

In [134]:
# Test features and encoded labels, produced with the already-fitted vectorizer/encoder.
test_questions = test['clean text']
test_relations = test[1]
X_test = vectorizer.transform(test_questions)
y_test = encoder.transform(test_relations)

In [135]:
# Predict an encoded relation label for every test question.
y_pred = classifier.predict(X_test)

In [136]:
# encoder.classes_ lists the relation names in class-id order, so the report rows
# are labelled correctly for any number of relations (no hard-coded [0, 1, 2]).
print(classification_report(y_test, y_pred, target_names=encoder.classes_))

                                               precision    recall  f1-score   support

www.freebase.com/location/location/time_zones       0.99      1.00      1.00       117
          www.freebase.com/music/genre/albums       0.99      1.00      1.00       221
        www.freebase.com/time/event/locations       1.00      0.92      0.96        38

                                  avg / total       0.99      0.99      0.99       376



  if diff:


In [137]:
# Preview a few rows instead of rendering the whole training frame.
train.head(10)

Unnamed: 0,0,1,2,3,clean text
96,www.freebase.com/m/054rw,www.freebase.com/location/location/time_zones,www.freebase.com/m/02q_y6,what time zone is marrakech in?,what time zone is marrakech in
164,www.freebase.com/m/02w4v,www.freebase.com/music/genre/albums,www.freebase.com/m/0b6lpfh,what album has folk music?,what album has folk music
301,www.freebase.com/m/05w3f,www.freebase.com/music/genre/albums,www.freebase.com/m/0v3bbn7,What's an example of a psychedelic rock album,what s an example of a psychedelic rock album
311,www.freebase.com/m/02x8m,www.freebase.com/music/genre/albums,www.freebase.com/m/0wjxyxt,What is an example of a funk album?,what is an example of a funk album
454,www.freebase.com/m/02psjmw,www.freebase.com/location/location/time_zones,www.freebase.com/m/02fqwt,What time zone does the chebanse township use,what time zone does the chebanse township use
567,www.freebase.com/m/016clz,www.freebase.com/music/genre/albums,www.freebase.com/m/01kzr4n,what is an alternative rock album?,what is an alternative rock album
601,www.freebase.com/m/01lyv,www.freebase.com/music/genre/albums,www.freebase.com/m/01hj4dx,What's a country album by johnny cash,what s a country album by johnny cash
648,www.freebase.com/m/01fh36,www.freebase.com/music/genre/albums,www.freebase.com/m/02r1746,what's a jazz fusion by pierre moerlen's gong,what s a jazz fusion by pierre moerlen s gong
651,www.freebase.com/m/036b_,www.freebase.com/location/location/time_zones,www.freebase.com/m/0frmgl,which time zone is guinea-bissau in?,which time zone is guinea bissau in
674,www.freebase.com/m/0155w,www.freebase.com/music/genre/albums,www.freebase.com/m/043q9jm,What is the name of a blues album,what is the name of a blues album


In [142]:
# Run one ad-hoc question through the same pipeline used for training:
# normalise -> TF-IDF -> classifier -> relation name.
text = preprocess_text("what can i see in berlin?")
vector = vectorizer.transform([text])
prediction = classifier.predict(vector)
class_ = encoder.inverse_transform(prediction)

# A relation can map to several DBpedia predicates; only the first one is used.
predicted_relation = class_[0]
candidate_predicates = rel_mappings[predicted_relation]
predicate = candidate_predicates[0]
print(predicate)

http://dbpedia.org/ontology/genre


  if diff:


In [87]:
import requests

# Ask DBpedia Spotlight to annotate entities in the question (stop words removed).
# A timeout keeps the cell from hanging forever if the service does not answer.
spotlight_reply = requests.get(url="https://api.dbpedia-spotlight.org/en/annotate",
                               params={"text": preprocess_text(text, True), "confidence": "0.3"},
                               headers={'accept': 'application/json'},
                               timeout=30)
# Surface HTTP errors directly instead of failing later while decoding an error page.
spotlight_reply.raise_for_status()
response = spotlight_reply.json()

In [88]:
# Display the entity annotations from the Spotlight JSON reply.
response['Resources']

[{'@URI': 'http://dbpedia.org/resource/List_of_WWE_pay-per-view_events',
  '@support': '3709',
  '@types': '',
  '@surfaceForm': 'event',
  '@offset': '0',
  '@similarityScore': '0.6635543720903484',
  '@percentageOfSecondRank': '0.30828768305323'},
 {'@URI': 'http://dbpedia.org/resource/Italy',
  '@support': '150965',
  '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
  '@surfaceForm': 'italia',
  '@offset': '17',
  '@similarityScore': '0.9999997860977687',
  '@percentageOfSecondRank': '0.0'},
 {'@URI': 'http://dbpedia.org/resource/Roman_Empire',
  '@support': '30520',
  '@types': 'Wikidata:Q6256,Schema:Place,Schema:Country,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location,DBpedia:Country',
  '@surfaceForm': 'roman empire',
  '@offset': '24',
  '@similarityScore': '0.9990581887699139',
  '@percentageOfSecondRank': '4.644933348495679E-4'}]

In [89]:
# Use the first annotated resource that carries type information (non-empty '@types').
# .get() tolerates a reply without a 'Resources' entry — presumably what Spotlight
# sends when nothing is annotated; verify against the API.
typed_uris = (res['@URI'] for res in response.get('Resources', []) if res['@types'])
subject = next(typed_uris, None)
if subject is None:
    # Fail with a clear message rather than an IndexError on an empty list.
    raise ValueError("DBpedia Spotlight returned no typed entity for the question")
print(subject)

http://dbpedia.org/resource/Italy


In [90]:
# Query template: selects every ?a linked to the subject through the predicate
# (?a <predicate> <subject>); the two %s slots are filled in that order.
# NOTE(review): the linked entity sits in the object position; for questions that
# ask for a property of the entity itself (e.g. "what time zone is X in") the
# triple may need to be reversed — confirm the intended direction per relation.
sparql_template = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dbo: <http://dbpedia.org/ontology/>

    SELECT ?a
    WHERE
    { 
      OPTIONAL
      {
       ?a <%s> <%s> .
      }
    }
"""
sparql = sparql_template % (predicate, subject)

In [91]:
# Display the final query string with predicate and subject substituted in.
sparql

'\n    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n    PREFIX dbo: <http://dbpedia.org/ontology/>\n\n    SELECT ?a\n    WHERE\n    { \n      OPTIONAL\n      {\n       ?a <http://dbpedia.org/ontology/place> <http://dbpedia.org/resource/Italy> .\n      }\n    }\n'

In [92]:
# Send the query to the public DBpedia SPARQL endpoint and ask for JSON results.
# A timeout keeps the cell from hanging forever if the endpoint does not answer.
sparql_reply = requests.post(url="http://dbpedia.org/sparql",
                             params={"query": sparql},
                             headers={'accept': 'application/sparql-results+json'},
                             timeout=30)
# Surface HTTP errors (e.g. a rejected query) instead of failing in .json().
sparql_reply.raise_for_status()
response = sparql_reply.json()

In [93]:
# Display the parsed JSON result returned by the SPARQL endpoint.
response

{'head': {'link': [], 'vars': ['a']},
 'results': {'distinct': False,
  'ordered': True,
  'bindings': [{'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/War_of_the_Polish_Succession'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Gothic_War_(535–554)'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Siege_of_Gaeta_(1815)'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Battle_of_Placentia_(194_BC)'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Siege_of_Mirandola_(1551)'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Battle_of_Borghetto'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Battle_of_Magnano'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Battle_of_Tornavento'}},
   {'a': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Battle_of_Carpi_(1815)'}},
   {'a': {'type': 'uri',
     'value': 'ht