In [3]:
import pandas as pd
import json
import unidecode
import random
import pickle

# Whitelist of DBpedia predicate URIs. The train/test/validation frames are
# later filtered down to questions whose predicate appears in this list.
# Note that both the `ontology` and the `property` namespace occur, and that
# "timezone" / "timeZone" and "location" are listed once per namespace.
relation_list = ["http://dbpedia.org/ontology/populationTotal",
                "http://dbpedia.org/ontology/genre",
                "http://dbpedia.org/property/timezone",
                "http://dbpedia.org/ontology/timeZone",
                "http://dbpedia.org/ontology/birthPlace",
                "http://dbpedia.org/ontology/location",
                "http://dbpedia.org/property/mapCaption",
                "http://dbpedia.org/property/location",
                "http://dbpedia.org/ontology/isPartOf",
                "http://dbpedia.org/ontology/position",
                "http://dbpedia.org/ontology/deathPlace",
                "http://dbpedia.org/ontology/writer",
                "http://dbpedia.org/ontology/artist",
                "http://dbpedia.org/ontology/country",
                "http://dbpedia.org/ontology/recordLabel",
                "http://dbpedia.org/ontology/literaryGenre",
                "http://dbpedia.org/ontology/type",
                "http://dbpedia.org/ontology/director",
                "http://dbpedia.org/ontology/language",
                "http://dbpedia.org/ontology/hometown",
                "http://dbpedia.org/ontology/producer",
                "http://dbpedia.org/ontology/author"]

In [4]:
#train_df[train_df.predicate == 'http://dbpedia.org/ontology/producer']

In [5]:
# Load the three SimpleQuestionsDBpedia splits. The encoding is passed
# explicitly so that decoding does not depend on the platform's default
# locale (JSON interchange text is UTF-8).
with open('../data/SimpleQuestionsDBpedia/train.json', encoding='utf-8') as json_file:
    train = json.load(json_file)

with open('../data/SimpleQuestionsDBpedia/test.json', encoding='utf-8') as json_file:
    test = json.load(json_file)

with open('../data/SimpleQuestionsDBpedia/valid.json', encoding='utf-8') as json_file:
    valid = json.load(json_file)

In [6]:
def get_questions_and_predicates(data):
    """Extract parallel lists of question texts and predicate URIs.

    Args:
        data: Parsed dataset dict. Its 'Questions' entry is iterated; every
            item supplies a 'Query' string and a 'PredicateList' whose first
            element carries the 'Predicate'.

    Returns:
        A ``(questions, predicates)`` tuple of two equally long lists. Each
        question text is passed through ``unidecode.unidecode``; only the
        first predicate of a question is used.
    """
    # Single pass over the entries, collecting (text, predicate) pairs.
    pairs = [
        (unidecode.unidecode(entry['Query']),
         entry['PredicateList'][0]['Predicate'])
        for entry in data['Questions']
    ]

    questions = [text for text, _ in pairs]
    predicates = [predicate for _, predicate in pairs]
    return questions, predicates

In [7]:
# Extract question texts and predicate labels for the train, test and
# validation splits, in that order.
(
    (train_questions, train_predicates),
    (test_questions, test_predicates),
    (val_questions, val_predicates),
) = map(get_questions_and_predicates, (train, test, valid))

In [8]:
# One frame per split, with a 'question' and a 'predicate' column.
train_df = pd.DataFrame({'question': train_questions, 'predicate': train_predicates})
test_df = pd.DataFrame({'question': test_questions, 'predicate': test_predicates})
val_df = pd.DataFrame({'question': val_questions, 'predicate': val_predicates})

In [9]:
# Draw 600 training questions each for the genre and birthPlace predicates.
# A fixed random_state makes the draw repeatable across kernel restarts.
genre = train_df[train_df['predicate'] == 'http://dbpedia.org/ontology/genre'].sample(600, random_state=42)
birth_place = train_df[train_df['predicate'] == 'http://dbpedia.org/ontology/birthPlace'].sample(600, random_state=42)

In [10]:
genre.shape

(600, 2)

In [11]:
train_df.head()

Unnamed: 0,predicate,question
0,http://dbpedia.org/ontology/distributor,what movie is produced by warner bros.
1,http://purl.org/linguistics/gold/hypernym,What is don graham known as?
2,http://dbpedia.org/ontology/location,what's there to see in columbus
3,http://dbpedia.org/ontology/birthPlace,who is a musician born in detroit
4,http://dbpedia.org/ontology/hometown,Which city did the artist ryna originate in


In [12]:
# Keep only questions whose predicate is in relation_list. .copy() turns each
# filtered slice into an independent frame, so columns can be added to it
# later without pandas raising SettingWithCopyWarning.
train_df = train_df[train_df['predicate'].isin(relation_list)].copy()
test_df = test_df[test_df['predicate'].isin(relation_list)].copy()
#train_df = train_df.append(test_df).append(genre).append(birth_place)

val_df = val_df[val_df['predicate'].isin(relation_list)].copy()

In [13]:
train_df['predicate'].value_counts()

http://dbpedia.org/ontology/genre            4919
http://dbpedia.org/ontology/birthPlace       3577
http://dbpedia.org/ontology/isPartOf         1730
http://dbpedia.org/ontology/position         1254
http://dbpedia.org/ontology/deathPlace       1045
http://dbpedia.org/ontology/writer            996
http://dbpedia.org/ontology/artist            994
http://dbpedia.org/ontology/country           889
http://dbpedia.org/ontology/recordLabel       776
http://dbpedia.org/ontology/literaryGenre     685
http://dbpedia.org/ontology/type              590
http://dbpedia.org/ontology/director          494
http://dbpedia.org/ontology/language          464
http://dbpedia.org/ontology/timeZone          411
http://dbpedia.org/ontology/location          411
http://dbpedia.org/ontology/hometown          407
http://dbpedia.org/ontology/producer          403
http://dbpedia.org/ontology/author            347
Name: predicate, dtype: int64

In [14]:
len(val_df['predicate'].value_counts())

18

In [15]:
train_df.shape

(20392, 2)

In [16]:
test_df.shape

(5816, 2)

In [31]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tokens are the runs of word characters matched by the pattern \w+.
tokenizer = RegexpTokenizer(r'\w+')
# NOTE(review): the lemmatizer is created here but is not used by any cell
# visible in this notebook.
lemmatizer = WordNetLemmatizer()
# Rebinds the name `stopwords` from the imported corpus module to the list of
# English stop words; preprocess_text reads this list.
stopwords = stopwords.words('english')

[nltk_data] Downloading package wordnet to /home/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
def preprocess_text(text, remove_stopwords=False):
    """Normalise a question into a lowercase, space-separated token string.

    Args:
        text: Raw question string.
        remove_stopwords: When true, tokens found in the module-level
            ``stopwords`` collection are dropped.

    Returns:
        The tokens produced by the module-level ``tokenizer``, joined by
        single spaces. Non-ASCII characters are discarded (not
        transliterated) before tokenising.
    """
    # Drop every character outside the 7-bit ASCII range.
    ascii_only = text.encode('ascii', errors='ignore').decode('ascii')

    words = tokenizer.tokenize(ascii_only.lower())

    if remove_stopwords:
        words = [word for word in words if word not in stopwords]

    return ' '.join(words)

In [142]:
%%time

# Store the normalised question text next to the raw question.
train_df['clean text'] = train_df['question'].apply(preprocess_text)
#test_df['clean text'] = test_df['question'].apply(lambda x: preprocess_text(x))
val_df['clean text'] = val_df['question'].apply(preprocess_text)

CPU times: user 467 ms, sys: 0 ns, total: 467 ms
Wall time: 465 ms


In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

seed = 42

In [144]:
# Unigram TF-IDF features; terms must occur in at least 3 questions.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE(review): the fitting corpus includes val_df, so validation text
# influences the vocabulary/IDF weights that are later evaluated on val_df.
# Confirm this is intended.
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))
vectorizer.fit(pd.concat([train_df, val_df])['clean text'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("relation_vectorizer.model", 'wb') as model_file:
    pickle.dump(vectorizer, model_file)

In [145]:
# Map predicate URIs to integer class ids. The encoder is fitted on train and
# validation labels together so every label seen in either split has an id.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
encoder = LabelEncoder()
encoder.fit(pd.concat([train_df, val_df])['predicate'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("relation_encoder.model", 'wb') as model_file:
    pickle.dump(encoder, model_file)

In [146]:
# vocabulary_ holds one entry per retained n-gram, so its length is the feature count.
print("Total number of unique n_grams in data = {0}".format(len(vectorizer.vocabulary_)))

Total number of unique n_grams in data = 3402


In [147]:
# Vectorise the cleaned training questions and encode their predicate URIs
# with the fitted encoder.
X_train = vectorizer.transform(train_df['clean text'])
y_train = encoder.transform(train_df['predicate'])

In [148]:
# Logistic regression over the TF-IDF features; random_state is fixed for
# repeatable fits.
classifier = LogisticRegression(random_state=seed).fit(X_train, y_train)

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("relation_classifier.model", 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [149]:
# Despite the *_test names, these hold the validation split (val_df); test_df
# is not used for evaluation in this section.
X_test = vectorizer.transform(val_df['clean text'])
y_test = encoder.transform(val_df['predicate'])

In [150]:
# Predicted class ids for the evaluation features built from val_df.
y_pred = classifier.predict(X_test)

In [151]:
# Take the class ids and names from the fitted encoder instead of a hard-coded
# count of 18, so the report stays aligned when the set of predicates changes.
# Passing `labels` keeps names and rows matched even if a class is missing
# from the evaluated split.
class_ids = list(range(len(encoder.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=list(encoder.classes_)))

                                           precision    recall  f1-score   support

       http://dbpedia.org/ontology/artist       0.90      0.89      0.89       137
       http://dbpedia.org/ontology/author       0.93      0.63      0.75        60
   http://dbpedia.org/ontology/birthPlace       0.92      0.97      0.94       482
      http://dbpedia.org/ontology/country       0.78      0.40      0.53       134
   http://dbpedia.org/ontology/deathPlace       0.99      0.98      0.99       143
     http://dbpedia.org/ontology/director       0.99      0.97      0.98        77
        http://dbpedia.org/ontology/genre       0.93      0.99      0.96       705
     http://dbpedia.org/ontology/hometown       0.77      0.57      0.65        63
     http://dbpedia.org/ontology/isPartOf       0.71      0.90      0.79       242
     http://dbpedia.org/ontology/language       1.00      0.93      0.96        59
http://dbpedia.org/ontology/literaryGenre       0.98      0.91      0.94       100
   

  if diff:


In [152]:
# Smoke test: run one hand-written question through preprocessing, the
# vectorizer and the classifier.
text = preprocess_text("was angela merkel born in hamburg?")
vector = vectorizer.transform([text])
prediction = classifier.predict(vector)
# Per-class probabilities for the same vector; displayed in the next cell.
probas = classifier.predict_proba(vector)
# Map the predicted class id back to its predicate URI.
class_ = encoder.inverse_transform(prediction)

print(class_)

['http://dbpedia.org/ontology/birthPlace']


  if diff:


In [35]:
probas

array([[0.98535167, 0.00199158, 0.00365895, 0.00899779]])

## Query template Classifier (BOOL, DISTANCE, FORWARD/BACKWARD)

In [17]:
# Fill the hand-written distance templates with random city pairs to create
# synthetic questions for the 'distance' class.
distance = pd.read_csv("../data/ManuallyGeneratedData/distance.intent", sep='\t')
cities = pd.read_csv("../data/ManuallyGeneratedData/world-cities.csv")

distance_templates = distance.question.values
cities = list(cities.name.values)

# Every template is instantiated this many times.
rounds_per_template = 15

# Dedicated, seeded generator so the generated questions are reproducible.
city_rng = random.Random(42)

# Two cities per question, drawn without replacement so that no list entry is
# used twice. One sample() call replaces the repeated random.choice() +
# list.remove() pairs and leaves `cities` itself untouched.
# Raises ValueError if the city list is too short for the requested number
# of questions.
city_picks = iter(city_rng.sample(cities, 2 * rounds_per_template * len(distance_templates)))

distance_questions = list()

for _ in range(rounds_per_template):
    for template in distance_templates:
        city_1 = next(city_picks)
        city_2 = next(city_picks)

        # Substitute both placeholders in ONE pass. Chaining two str.replace()
        # calls would also rewrite any "Y" inside the city name that was just
        # inserted for "X" (e.g. "New York").
        placeholders = str.maketrans({
            "X": unidecode.unidecode(city_1),
            "Y": unidecode.unidecode(city_2),
        })

        distance_questions.append(template.translate(placeholders))

In [18]:
lc_quad_train = pd.read_csv("../data/LC-QuAD/train-data.csv", sep=';')
lc_quad_test = pd.read_csv("../data/LC-QuAD/test-data.csv", sep=';')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# rows come out in the same order (train first, then test).
lc_quad = pd.concat([lc_quad_train, lc_quad_test])
# Only the rows labelled 'BOOL' are kept for the template classifier.
lc_quad = lc_quad[lc_quad['class'] == 'BOOL']

In [19]:
lc_quad['class'].value_counts()

BOOL    368
Name: class, dtype: int64

In [20]:
qald_train = pd.read_csv("../data/QALD/QALD-train.csv", sep=';')
qald_test = pd.read_csv("../data/QALD/QALD-test.csv", sep=';')

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# rows come out in the same order (train first, then test).
qald = pd.concat([qald_train, qald_test])
# Only the rows labelled 'BOOL' are kept for the template classifier.
qald = qald[qald['class'] == 'BOOL']

In [21]:
qald['class'].value_counts()

BOOL    41
Name: class, dtype: int64

In [22]:
# Pool the boolean questions of both corpora, LC-QuAD first, then QALD.
boolean_questions = list(lc_quad['question'].values)
boolean_questions.extend(qald['question'].values)

In [23]:
len(boolean_questions)

409

In [24]:
# Bucket the SimpleQuestions training queries by the direction of their first
# predicate; entries with any other direction value are skipped.
forward_questions = list()
backward_questions = list()

for question in train['Questions']:
    direction = question['PredicateList'][0]['Direction']
    if direction == 'backward':
        backward_questions.append(question['Query'])
    elif direction == 'forward':
        forward_questions.append(question['Query'])

# 500 mixed examples represent the FWD_BWD class in the template classifier.
# A dedicated seeded generator keeps the draw reproducible across runs.
fwd_bwd_questions = random.Random(42).sample(forward_questions + backward_questions, 500)

In [25]:
len(fwd_bwd_questions)

500

In [26]:
# Stack the three question pools and build one label per question, in the
# same order as the pools.
X = distance_questions + boolean_questions + fwd_bwd_questions
y = (
    ['distance'] * len(distance_questions)
    + ['boolean'] * len(boolean_questions)
    + ['FWD_BWD'] * len(fwd_bwd_questions)
)

In [27]:
# Questions and their template class side by side.
dataframe = pd.DataFrame({'question': X, 'class': y})

In [32]:
# Normalised question text used as classifier input.
dataframe['clean text'] = dataframe['question'].apply(preprocess_text)

In [44]:
# Unigram TF-IDF features; terms must occur in at least 3 questions.
# NOTE(review): the vectorizer is fitted on the whole frame before the
# train/test split below, so held-out text influences the vocabulary/IDF
# weights. Confirm this is intended.
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))
vectorizer.fit(dataframe['clean text'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("template_vectorizer.model", 'wb') as model_file:
    pickle.dump(vectorizer, model_file)

NameError: name 'TfidfVectorizer' is not defined

In [45]:
# Map the template class names to integer ids.
encoder = LabelEncoder()
encoder.fit(dataframe['class'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("template_encoder.model", 'wb') as model_file:
    pickle.dump(encoder, model_file)

NameError: name 'LabelEncoder' is not defined

In [33]:
# vocabulary_ holds one entry per retained n-gram, so its length is the feature count.
print("Total number of unique n_grams in data = {0}".format(len(vectorizer.vocabulary_)))

NameError: name 'vectorizer' is not defined

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 33% of the rows for evaluation; random_state is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(dataframe['clean text'], dataframe['class'], test_size=0.33, random_state=42)

In [37]:
X_train.shape

(810,)

In [38]:
X_test.shape

(399,)

In [48]:
# NOTE(review): X_train / y_train are overwritten with their transformed
# versions, so this cell is not re-runnable on its own: a second run would
# pass already-transformed data to transform(). Re-run the split cell first.
X_train = vectorizer.transform(X_train)
y_train = encoder.transform(y_train)

NameError: name 'vectorizer' is not defined

In [234]:
# Logistic regression over the TF-IDF features; random_state is fixed for
# repeatable fits.
classifier = LogisticRegression(random_state=seed).fit(X_train, y_train)
# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("template_classifier.model", 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [235]:
# NOTE(review): X_test / y_test are overwritten with their transformed
# versions, so this cell is not re-runnable on its own. Re-run the split cell
# first.
X_test = vectorizer.transform(X_test)
y_test = encoder.transform(y_test)

In [236]:
# Predicted class ids for the held-out template questions.
y_pred = classifier.predict(X_test)

In [237]:
# Take the class ids and names from the fitted encoder instead of a hard-coded
# [0, 1, 2], so the report stays aligned if a template class is added or removed.
class_ids = list(range(len(encoder.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=list(encoder.classes_)))

             precision    recall  f1-score   support

    FWD_BWD       0.98      1.00      0.99       158
    boolean       1.00      0.97      0.99       141
   distance       1.00      1.00      1.00       100

avg / total       0.99      0.99      0.99       399



  if diff:


In [238]:
# Smoke test: run one hand-written question through preprocessing, the
# vectorizer and the classifier.
text = preprocess_text("was angela merkel born in hamburg")
print(text)
vector = vectorizer.transform([text])
prediction = classifier.predict(vector)
# Per-class probabilities for the same vector; inspected in the next cells.
probas = classifier.predict_proba(vector)
# Map the predicted class id back to its class name.
class_ = encoder.inverse_transform(prediction)

print(class_)

was angela merkel born in hamburg
['boolean']


  if diff:


In [176]:
probas

array([[0.6223405 , 0.26254598, 0.11511353]])

In [177]:
max(probas[0])

0.6223404984596098

## FWD-BWD Classifier

In [39]:
# All forward questions followed by all backward questions, with one matching
# label per question.
X = forward_questions + backward_questions
y = ['forward'] * len(forward_questions) + ['backward'] * len(backward_questions)

In [40]:
# Questions and their direction class side by side.
dataframe = pd.DataFrame({'question': X, 'class': y})

In [41]:
# Normalised question text used as classifier input.
dataframe['clean text'] = dataframe['question'].apply(preprocess_text)

In [181]:
# Unigram TF-IDF features; terms must occur in at least 3 questions.
# NOTE(review): the vectorizer is fitted on the whole frame before the
# train/test split below, so held-out text influences the vocabulary/IDF
# weights. Confirm this is intended.
vectorizer = TfidfVectorizer(min_df=3,ngram_range=(1,1))
vectorizer.fit(dataframe['clean text'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("fwd_bwd_vectorizer.model", 'wb') as model_file:
    pickle.dump(vectorizer, model_file)

In [182]:
# Map the 'forward' / 'backward' class names to integer ids.
encoder = LabelEncoder()
encoder.fit(dataframe['class'])

# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("fwd_bwd_encoder.model", 'wb') as model_file:
    pickle.dump(encoder, model_file)

In [42]:
# Hold out 33% of the rows for evaluation; random_state is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(dataframe['clean text'], dataframe['class'], test_size=0.33, random_state=42)

In [43]:
X_train.shape

(20224,)

In [44]:
X_test.shape

(9962,)

In [184]:
# NOTE(review): X_train / y_train are overwritten with their transformed
# versions, so this cell is not re-runnable on its own. Re-run the split cell
# first.
X_train = vectorizer.transform(X_train)
y_train = encoder.transform(y_train)

In [185]:
# Logistic regression over the TF-IDF features; random_state is fixed for
# repeatable fits.
classifier = LogisticRegression(random_state=seed).fit(X_train, y_train)
# The context manager closes the file; a bare open() inside pickle.dump()
# leaves the handle dangling.
with open("fwd_bwd_classifier.model", 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [186]:
# NOTE(review): X_test / y_test are overwritten with their transformed
# versions, so this cell is not re-runnable on its own. Re-run the split cell
# first.
X_test = vectorizer.transform(X_test)
y_test = encoder.transform(y_test)

In [187]:
# Predicted class ids for the held-out forward/backward questions.
y_pred = classifier.predict(X_test)

In [188]:
# Take the class ids and names from the fitted encoder instead of a
# hard-coded [0, 1].
class_ids = list(range(len(encoder.classes_)))
print(classification_report(y_test, y_pred, labels=class_ids, target_names=list(encoder.classes_)))

             precision    recall  f1-score   support

   backward       0.93      0.88      0.91      3366
    forward       0.94      0.97      0.95      6596

avg / total       0.94      0.94      0.94      9962



  if diff:
