In [1]:
import pandas as pd
import spacy
import numpy as np
import syntok.segmenter as segmenter
from flair.models import SequenceTagger
from flair.data import Sentence

flairTagger = SequenceTagger.load("de-ner")

  return torch._C._cuda_getDeviceCount() > 0


2021-01-24 20:28:41,530 loading file /home/center/.flair/models/de-ner-conll03-v0.4.pt


In [2]:
# ## Prepare
# ### Load Spacy model

nlp = spacy.load("de_core_news_sm")

# ### Read reports

pol_df = pd.read_json("policereports.json")

In [3]:

# ## Process
# ### Tokenize with Spacy

pol_df_extended = pol_df.copy()

pol_df_extended["Token"] = pol_df["Content"].apply(lambda doc: nlp(doc))

In [4]:
# ### Named entity recognition
# Only locations


pol_df_extended["LocationsFromNER"] = pol_df_extended["Token"].apply(
    lambda doc: {ent for ent in doc.ents if ent.label_ == "LOC"}
)

In [5]:
# ### Sentence offsets

pol_df_extended["SentenceOffsets"] = pol_df_extended["Content"].apply(
    lambda content: np.asarray(
        [list(sent)[0].offset for parag in segmenter.analyze(content) for sent in parag]
        + [len(content)]
    )
)


In [6]:
# ### NE offsets


pol_df_extended["LocationsOffsets"] = pol_df_extended["LocationsFromNER"].apply(
    lambda locations: np.asarray([location.start_char for location in locations])
)


In [7]:
# ### Relevant sentence indices


def getRelevantSentenceIndexes(row):
    sentenceOffsets = row[0]
    locationsOffsets = row[1]
    return set((np.searchsorted(sentenceOffsets, locationsOffsets) - 1).clip(min=0))


pol_df_extended["RelevantSentenceIndexes"] = pol_df_extended[
    ["SentenceOffsets", "LocationsOffsets"]
].apply(getRelevantSentenceIndexes, axis=1)



In [8]:
# ### Relevant sentences


def pairwise(sequence):
    return np.asarray(list(zip(sequence[:-1], sequence[1:])))


def getRelevantSentences(row):
    contentString = row[0]
    sentenceOffsets = row[1]
    relevantSentenceIndexes = row[2]
    sentenceOffsetPairs = pairwise(sentenceOffsets)
    relevantSentences = []
    for idx in relevantSentenceIndexes:
        start, end = sentenceOffsetPairs[idx]
        relevantSentences.append(Sentence(contentString[start:end], use_tokenizer=True))
    return relevantSentences


pol_df_extended["RelevantSentences"] = pol_df_extended[
    ["Content", "SentenceOffsets", "RelevantSentenceIndexes"]
].apply(getRelevantSentences, axis=1)


def getLocationsFromFlair(relevantSentences):
    predictions = flairTagger.predict(relevantSentences)
    named_entities = []
    for p in predictions:
        current_spans = p.get_spans("ner")
        for span in current_spans:
            if span.tag == "LOC":
                named_entities.append(span.text)
    return set(named_entities)


In [12]:
pol_df.head().to_csv("testsample_policereports.csv")

In [9]:
pol_df_extended.head()

Unnamed: 0,Header,IsLocationInHeader,Title,URL,CreatedAt,Content,Token,LocationsFromNER,SentenceOffsets,LocationsOffsets,RelevantSentenceIndexes,RelevantSentences
0,Polizeimeldung vom 31.12.2014 Neukölln,True,"Geschossen, gebissen, geflüchtet",https://www.berlin.de/polizei/polizeimeldungen...,2018-11-24,Nr. 3099 Zu einem Überfall auf eine Spielothe...,"(Nr., 3099, , Zu, einem, Überfall, auf, eine,...","{(Britzer, Damm), (Neukölln), (Jahnstraße)}","[0, 93, 263, 356, 389, 490, 609, 664, 740]","[181, 83, 597]","{0, 1, 5}","[(Token: 1 Nr., Token: 2 3099, Token: 3 Zu, To..."
1,Polizeimeldung vom 31.12.2014 Reinickendorf,True,Seniorin beim Unfall schwer verletzt,https://www.berlin.de/polizei/polizeimeldungen...,2018-11-24,Nr. 3098 Ein Fußgängerin wurde gestern Nachmi...,"(Nr., 3098, , Ein, Fußgängerin, wurde, gester...","{(Hermsdorfer, Damm), (Hermsdorf), (Straße), (...","[0, 98, 286, 413, 492]","[174, 71, 267, 203]","{0, 1}","[(Token: 1 Nr., Token: 2 3098, Token: 3 Ein, T..."
2,Polizeimeldung vom 31.12.2014 Charlottenburg -...,True,Einbruch in Pfandleihhaus,https://www.berlin.de/polizei/polizeimeldungen...,2018-11-24,Nr. 3097 Unbekannte brachen heute früh in ein...,"(Nr., 3097, , Unbekannte, brachen, heute, frü...","{(Krummestraße), (Charlottenburg), (Einbrecher...","[0, 84, 318, 401, 485, 537, 616]","[471, 64, 511, 568, 285, 501, 262]","{0, 1, 3, 4, 5}","[(Token: 1 Nr., Token: 2 3097, Token: 3 Unbeka..."
3,Polizeimeldung vom 30.12.2014 Tempelhof - Schö...,True,Lokal überfallen,https://www.berlin.de/polizei/polizeimeldungen...,2018-11-24,Nr. 3094 Heute früh wurden Polizisten nach Te...,"(Nr., 3094, , Heute, früh, wurden, Polizisten...","{(Tempelhof), (Feurigstraße)}","[0, 95, 224, 304, 444, 482, 513, 589]","[44, 157]","{0, 1}","[(Token: 1 Nr., Token: 2 3094, Token: 3 Heute,..."
4,Polizeimeldung vom 30.12.2014 Mitte,True,960 Böller beschlagnahmt,https://www.berlin.de/polizei/polizeimeldungen...,2018-11-24,Nr. 3093 Zivilfahnder des Polizeiabschnitts 3...,"(Nr., 3093, , Zivilfahnder, des, Polizeiabsch...","{(Tüte), (Müller-, Ecke), (Wedding), (Lindower...","[0, 132, 291, 374, 417, 524, 570, 693]","[363, 171, 81, 184]","{0, 1, 2}","[(Token: 1 Nr., Token: 2 3093, Token: 3 Zivilf..."


In [10]:
# ## Mini-Batch processing

start = 400
for end in range(500, pol_df.shape[0], 100):
    pol_df_extended["LocationFromNERflair"] = pol_df_extended["RelevantSentences"][
        start:end
    ].apply(getLocationsFromFlair)
    pol_df_extended.to_csv(f"Output/output_{end}.csv")
    start = end

end = pol_df.shape[0]
pol_df_extended["LocationFromNERflair"] = pol_df_extended["RelevantSentences"][
    start:end
].apply(getLocationsFromFlair)
pol_df_extended.to_csv(f"Output/output_{end}.csv")


TypeError: 'NoneType' object is not iterable

In [None]:
# ## Result concatenation


frames = []
start = 0
for end in range(100, pol_df.shape[0], 100):
    current_frame = pd.read_csv(f"Output/output_{end}.csv")
    frames.append(current_frame[:][start:end])
    start = end

end = pol_df.shape[0]
current_frame = pd.read_csv(f"Output/output_{end}.csv")
frames.append(current_frame[:][start:end])


result = pd.concat(frames)
