Download the JSON data from the DBpedia SNORQL endpoint: http://dbpedia.org/snorql

I used the following SPARQL query for actors (for the other classes, replace "Actor" with "City", etc.):

PREFIX dbpedia0: <http://dbpedia.org/ontology/>

PREFIX dbpedia2: <http://dbpedia.org/property/>

PREFIX dbpedia: <http://dbpedia.org/resource/>

select distinct ?name ?abstract where {

         ?instance a dbpedia0:Actor.
         
         ?instance foaf:name ?name.
         
         ?instance dbpedia0:abstract ?abstract.
         
         filter(langMatches(lang(?abstract),"en"))
         }

#### Load json files:

In [1]:
import json
#from pprint import pprint

def _load_json(path):
    """Parse the JSON file at `path` and return the decoded object.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes, and it is read as UTF-8 instead of the platform default
    encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# NOTE(review): the "actor" data is read from musicalArtist.json — confirm this
# is the intended source file.
data_actor = _load_json('musicalArtist.json')
data_city = _load_json('city.json')
data_celestialbody = _load_json('celestialbody.json')
data_education = _load_json('educationalInstitution.json')
data_lake = _load_json('lake.json')

## Create training and testing dataframes:

### 1. Use Data to create Lists

In [2]:
def _names_and_abstracts(data):
    """Return parallel lists (names, abstracts) taken from one query result.

    `data` is indexed exactly as the rest of this cell needs it:
    data['results']['bindings'] is a sequence of rows, and each row provides
    row['name']['value'] and row['abstract']['value'].
    A row lacking one of those keys raises KeyError.
    """
    bindings = data['results']['bindings']
    names = [row['name']['value'] for row in bindings]
    texts = [row['abstract']['value'] for row in bindings]
    return names, texts


# Person (taken from data_actor)
person_names, person_abstracts = _names_and_abstracts(data_actor)
length_p = len(person_names)

# City
city_name, city_abstract = _names_and_abstracts(data_city)
length_c = len(city_name)

# CelestialBody
cb_name, cb_abstract = _names_and_abstracts(data_celestialbody)
length_cb = len(cb_name)

# EducationalInstitution
ei_name, ei_abstract = _names_and_abstracts(data_education)
length_ei = len(ei_name)

# Lake
lake_name, lake_abstract = _names_and_abstracts(data_lake)
length_lake = len(lake_name)


### 2. Create dataframes:

In [3]:
import pandas as pd


def _class_frame(names, abstracts, labels):
    """Build one per-class frame with the columns Names, Abstract and Class."""
    return pd.DataFrame({'Names': names, 'Abstract': abstracts, 'Class': labels})


# Each label list repeats the class name once per extracted row, so it lines up
# with the name and abstract lists of that class.
person = ['Person'] * length_p
person_df = _class_frame(person_names, person_abstracts, person)

city = ['City'] * length_c
city_df = _class_frame(city_name, city_abstract, city)

celestialbody = ['CelestialBody'] * length_cb
cb_df = _class_frame(cb_name, cb_abstract, celestialbody)

education = ['EducationalInstitution'] * length_ei
ei_df = _class_frame(ei_name, ei_abstract, education)

lake = ['Lake'] * length_lake
lake_df = _class_frame(lake_name, lake_abstract, lake)

### 3. Merge dataframes:

In [4]:
# Stack the per-class frames into a single dataset.
frames = [cb_df, person_df, ei_df, lake_df, city_df]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each per-class frame contributes its own 0-based labels, so index labels
# repeat and label-based lookups on whole_df become ambiguous.
whole_df = pd.concat(frames, ignore_index=True)

### 4. Split dataframe into testing and training: (random split)

In [5]:
from sklearn.model_selection import train_test_split

# Random 80/20 split; random_state fixes the shuffle so the same rows land in
# train and test on every run.
train, test = train_test_split(whole_df, test_size=0.2, random_state=42)


### 5. Separate dataframes into data(X) and class labels(y).

In [6]:
# Select the columns by label instead of by position. The position of 'Class'
# and 'Abstract' depends on how pandas orders the columns of a dict-built frame
# (sorted keys in old releases, insertion order since pandas 0.23 on
# Python 3.6+), so iloc[:, [1]] / iloc[:, [0]] only pick the intended columns
# on some versions.
# Double brackets keep every selection a one-column DataFrame, so later cells
# can still use X_train.Abstract and y_test.Class.
y_train = train[['Class']]
y_test = test[['Class']]

X_train = train[['Abstract']]
X_test = test[['Abstract']]


## Preprocessing


In [24]:
from sklearn.feature_extraction.text import HashingVectorizer

# Turn the abstract texts into hashed feature matrices, ignoring English stop words.
hash_vectorizer = HashingVectorizer(stop_words='english')
hash_train = hash_vectorizer.fit_transform(X_train['Abstract'])
hash_test = hash_vectorizer.transform(X_test['Abstract'])

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features of the abstracts: the vocabulary and weights are fitted on the
# training abstracts only, then applied unchanged to the test abstracts.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train['Abstract'])
tfidf_test = tfidf_vectorizer.transform(X_test['Abstract'])

## Training and Testing

### 1. Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import itertools

# Random forest on the hashed features.
# Flatten the label container to a 1-D array before fitting.
y_train = np.ravel(y_train)

# random_state pins the forest's internal randomness so the reported accuracy
# does not change from run to run.
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(hash_train, y_train)

# `pred` is overwritten by every classifier cell; it always holds the
# predictions of the cell that ran most recently.
pred = rfclf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   1.000


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import numpy as np
import itertools

# Random forest on the TF-IDF features.
# Flatten the label container to a 1-D array before fitting.
y_train = np.ravel(y_train)

# random_state pins the forest's internal randomness so the reported accuracy
# does not change from run to run.
rfclf = RandomForestClassifier(random_state=42)
rfclf.fit(tfidf_train, y_train)

# `pred` is overwritten by every classifier cell; it always holds the
# predictions of the cell that ran most recently.
pred = rfclf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.986


### 2. K-nearest neighbors

In [136]:
from sklearn import neighbors
from sklearn import metrics
import numpy as np

# k-nearest neighbours (k=10, distance-weighted votes) on the hashed features.
y_train = np.ravel(y_train)  # 1-D label array for fit()

clf = neighbors.KNeighborsClassifier(n_neighbors=10, weights='distance').fit(hash_train, y_train)

# Score the predictions for the held-out abstracts against their true labels.
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

  after removing the cwd from sys.path.


accuracy:   0.993


In [8]:
from sklearn import neighbors
from sklearn import metrics
import numpy as np

# k-nearest neighbours (k=100, uniform votes) on the TF-IDF features.
y_train = np.ravel(y_train)  # 1-D label array for fit()

clf = neighbors.KNeighborsClassifier(n_neighbors=100, weights='uniform').fit(tfidf_train, y_train)

# Score the predictions for the held-out abstracts against their true labels.
pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.976


### 3. Stochastic Gradient Descent

In [22]:
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import numpy as np

# Linear model trained with stochastic gradient descent on the hashed features.
# Flatten the label container to a 1-D array before fitting.
y_train = np.ravel(y_train)

# random_state fixes the order in which SGD visits the training samples, so the
# fitted model and its accuracy are reproducible.
sgdclf = SGDClassifier(loss="modified_huber", penalty="l2", random_state=42)
sgdclf.fit(hash_train, y_train)

# `pred` is overwritten by every classifier cell; it always holds the
# predictions of the cell that ran most recently.
pred = sgdclf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)



accuracy:   0.994


In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import numpy as np

# Linear model trained with stochastic gradient descent on the TF-IDF features.
# Flatten the label container to a 1-D array before fitting.
y_train = np.ravel(y_train)

# random_state fixes the order in which SGD visits the training samples, so the
# fitted model and its accuracy are reproducible.
sgdclf = SGDClassifier(loss="hinge", penalty="elasticnet", random_state=42)
sgdclf.fit(tfidf_train, y_train)

# `pred` is overwritten by every classifier cell; it always holds the
# predictions of the cell that ran most recently.
pred = sgdclf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)



accuracy:   0.994


## Analysis

### Look for the examples that were classified incorrectly (5 datasets: Actor/Person, City, CelestialBody, EducationalInstitution, Lake)

In [9]:
# Copy the true test labels into a plain list so they can be looked up by position.
classes = list(y_test['Class'])

In [10]:
# Plain list of the test abstracts, in the same row order as the test frame.
abstracts = list(test['Abstract'])

In [11]:
# Tally, for every misclassified test abstract, which true class was confused
# with which predicted class. `pred` is whatever the most recently executed
# classifier cell produced.
known_classes = ('Person', 'City', 'CelestialBody', 'EducationalInstitution', 'Lake')

# One counter per ordered (true class, predicted class) pair of distinct classes.
confusions = {
    (real, guess): 0
    for real in known_classes
    for guess in known_classes
    if real != guess
}

false = []  # positions of all test examples that were classified wrong
for position, actual in enumerate(classes):
    guessed = pred[position]
    if actual == guessed:
        continue

    false.append(position)
    if actual not in known_classes:
        # The true label is none of the five expected classes.
        print('Wrong Class')
    elif (actual, guessed) in confusions:
        # Predictions outside the five known classes are not tallied.
        confusions[(actual, guessed)] += 1

# Expose the tallies under the short names used by the reporting cell:
# <true class>_<predicted class>, with p=Person, c=City, cb=CelestialBody,
# ei=EducationalInstitution, l=Lake.
p_c = confusions[('Person', 'City')]
p_cb = confusions[('Person', 'CelestialBody')]
p_ei = confusions[('Person', 'EducationalInstitution')]
p_l = confusions[('Person', 'Lake')]
c_p = confusions[('City', 'Person')]
c_cb = confusions[('City', 'CelestialBody')]
c_ei = confusions[('City', 'EducationalInstitution')]
c_l = confusions[('City', 'Lake')]
cb_c = confusions[('CelestialBody', 'City')]
cb_p = confusions[('CelestialBody', 'Person')]
cb_ei = confusions[('CelestialBody', 'EducationalInstitution')]
cb_l = confusions[('CelestialBody', 'Lake')]
ei_c = confusions[('EducationalInstitution', 'City')]
ei_p = confusions[('EducationalInstitution', 'Person')]
ei_cb = confusions[('EducationalInstitution', 'CelestialBody')]
ei_l = confusions[('EducationalInstitution', 'Lake')]
l_p = confusions[('Lake', 'Person')]
l_c = confusions[('Lake', 'City')]
l_cb = confusions[('Lake', 'CelestialBody')]
l_ei = confusions[('Lake', 'EducationalInstitution')]

In [21]:
# look for explicit example
# for a in range(len(abstracts)):
#    if abstracts[a] == "Heather Leigh West is a New York City based American recording artist best known for her work in house music.":
#        print(a)
    

In [12]:
# Total number of misclassified test examples (`false` holds their positions).
print(len(false))

231


In [30]:
# Show one misclassified abstract together with its true and its predicted label.
# `false` holds test-set positions, so false[5] is the sixth misclassification.
ex = false[5]
print('Abstrcat: \n', abstracts[ex], '\n', sep='')
print('is: ', classes[ex], sep='')
print('labeled as: ', pred[ex], sep='')

Abstrcat: 
Colico is a city in the province of Lecco, Lombardy, Italy. It is situated on the northern arm of Lake Como, where the river Adda enters the lake. Colico is the most important city in the northern part of Lake Como, which is often identified as its Colico branch. Colico is a local transport hub, with boats to Como and Lecco, as well as trains and roads to Milan (via the eastern shore of the lake, Lecco and Brianza), to Chiavenna, and eastwards to Bolzano, via Passo dello Stelvio. The Piona Abbey is located in the communal territory, in the Olgiasca peninsula.

is: City
labeled as: Lake


In [16]:

# Per true class: how often it was mistaken for each of the other classes.
# The counts are printed unlabelled, in the order listed in each tuple.
per_class_counts = [
    ('Class = Person', (p_c, p_cb, p_ei, p_l)),
    ('Class = City', (c_p, c_cb, c_ei, c_l)),
    ('Class = CelestialBody', (cb_p, cb_c, cb_ei, cb_l)),
    ('Class = EducationalInstitute', (ei_p, ei_c, ei_cb, ei_l)),
    ('Class = lake', (l_p, l_c, l_cb, l_ei)),
]
for heading, counts in per_class_counts:
    print(heading)
    for count in counts:
        print(count)
    print('\n')

# Per unordered pair of classes: confusions summed over both directions.
pair_totals = [
    ('City-Person', p_c + c_p),
    ('Person-CB', p_cb + cb_p),
    ('Person-ei', ei_p + p_ei),
    ('Person-Lake', p_l + l_p),
    ('City_cb', c_cb + cb_c),
    ('city-ei', c_ei + ei_c),
    ('city-lake', c_l + l_c),
    ('cb-lake', cb_l + l_cb),
    ('ei-cb', cb_ei + ei_cb),
    ('ei-lake', ei_l + l_ei),
]
for label, total in pair_totals:
    print(label)
    print(total)


Class = Person
3
1
3
0


Class = City
5
0
15
3


Class = CelestialBody
0
0
0
0


Class = EducationalInstitute
1
2
0
0


Class = lake
2
3
0
0


City-Person
8
Person-CB
1
Person-ei
4
Person-Lake
2
City_cb
0
city-ei
17
city-lake
6
cb-lake
0
ei-cb
0
ei-lake
0
