In [1]:
#importing required libraries

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import spacy 
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json
from nltk.stem.porter import PorterStemmer
import random
from spacy.training.example import Example
from spacy.tokens import Doc

In [2]:
# Load the large pretrained English pipeline.
# NOTE(review): `nlp` is reassigned further down when the 'crime_ner' pipeline is
# loaded, and no visible cell uses this large model in between — confirm whether
# this load is still needed.
nlp = spacy.load("en_core_web_lg")
# Fetch the NLTK resources used by stopwords.words() and word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VARTUL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VARTUL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#importing data set

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
DATA_CSV = "C:\\Users\\VARTUL\\Desktop\\DA07.csv"
raw_df = pd.read_csv(DATA_CSV)
# Report the size of the raw table before rows with missing values are dropped.
print(raw_df.shape)
df = raw_df.dropna()

(5128, 10)


In [4]:
#Stopwords removal and Tokenization

df['Event Type'] = df['Event Type'].apply(str)
ls =df['Event Type'].to_list()
n = len(ls)
ps = ls.copy()
for i in range(0,n):
    s = ls[i]
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(s)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    ls[i].join(filtered_sentence)
    ls[i] = ls[i].lower()

In [5]:
#Removing unwanted words (not related to crime)

unique = list(set(ls))

        
words = []    
for i in unique:
    j = i.replace('(', " ").replace(')', " ").replace('/', " ").replace('-', " ")
    a=j.split(" ")
    for word in a:
        words.append(word.strip())
words = list(set(words))
words.sort()
rem = ['by', 'code', 'abled', 'act', 'corona', 'for', 'against', 'in', 'jam', 'media','information',
 'internet','government','mobile','model','on','pocket','phone','other','people','small','pick','person',
 'medium', 'mining','police','unclaimed','help','object','attempt','cyber','departments','fire','found',
 'differently','social','safety','personally','escort','missing','major','election','conduct','attempted','of','human','unknown','pollution','property','related','required','']
for i in rem:
    words.remove(i)

In [6]:
#classifying key words according to their severity 

# Keyword lists by severity level; they are labelled 'L1', 'L2' and 'L3' when the
# entity patterns are built from them.
# The event text is lowercased before matching, and the entity ruler's phrase
# patterns match exact token text by default, so every keyword must be lowercase
# ('election Offences' could never match).
# NOTE(review): 'smugling' and 'harrassment' are left as spelled — verify against
# the spellings that occur in the data before correcting them.
# NOTE(review): some keywords appear in more than one level (e.g. 'assault',
# 'offenses', 'crime', 'accident'); confirm which level should win.
l1 = ['trafficking', 'election offences', 'explosive',
      'murder', 'assault', 'riot',
      'deadbody', 'fire', 'rape', 'ndps']
# A missing comma after 'sexual' used to fuse it with 'domestic' into the single
# bogus keyword 'sexualdomestic' (implicit string-literal concatenation).
l2 = ['suicide', 'accident', 'smugling', 'violation', 'offenses',
      'crime', 'kidnap', 'harrassment', 'sexual',
      'domestic', 'assault']
# Same defect after 'violence', which produced 'violenceaccident'.
l3 = ['gambling', 'offenses', 'forgery', 'threat', 'suspicious',
      'illegal', 'robbery', 'abuse', 'female', 'sos', 'theft', 'violence',
      'accident', 'animals', 'child', 'commotion', 'crimes', 'crime', 'dispute',
      'disputes', 'dacoity', 'dowry', 'encroachment', 'excise', 'illegal', '108',
      '1090']

In [7]:
#function to create patterns

def create_patterns(t, ls):
    """Build one pattern dict per keyword.

    Args:
        t: Label stored under the "label" key of every pattern.
        ls: Iterable of keywords; each is stored under the "pattern" key.

    Returns:
        A list of ``{"label": t, "pattern": keyword}`` dicts, in the order of ``ls``.
    """
    return [{"label": t, "pattern": keyword} for keyword in ls]

In [8]:
# One pattern list per severity level, then a single combined list in L1, L2, L3 order.
pattern1, pattern2, pattern3 = (
    create_patterns(label, keywords)
    for label, keywords in (('L1', l1), ('L2', l2), ('L3', l3))
)
patterns = [*pattern1, *pattern2, *pattern3]

In [9]:
#generating rules with created patterns

def gen_rules(patterns):
    """Create a blank English pipeline with an entity ruler and save it to disk.

    Args:
        patterns: Pattern dicts (as produced by ``create_patterns``) to register
            on the entity ruler.

    Side effects:
        Writes the pipeline to the ``crime_ner`` directory; returns nothing.
    """
    rule_pipeline = English()
    rule_pipeline.add_pipe("entity_ruler").add_patterns(patterns)
    rule_pipeline.to_disk("crime_ner")

In [10]:
# Build the rule-based pipeline from the keyword patterns and write it to the 'crime_ner' directory.
gen_rules(patterns)

In [11]:
# Reload the rule-based pipeline saved by gen_rules; this replaces the `nlp` loaded earlier.
nlp = spacy.load('crime_ner')

In [12]:
#creating Train-dataset 

def test_model(model, text):
    doc = nlp(text)
    results = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
    if(len(entities) > 0):
        results = (text, {'entities': entities})
        return results

In [13]:
def save_data(file, data):
    """Write ``data`` to ``file`` as indented UTF-8 JSON.

    Args:
        file: Destination path. Missing parent directories are created first,
            so saving into e.g. ``data/...`` works on a fresh checkout.
        data: Any JSON-serialisable object.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

In [14]:
# Annotate every event string with the rule-based pipeline; `res` keeps one
# entry per input (None where nothing matched).
res = [test_model(nlp, text) for text in ls]
# Only strings with at least one matched entity become training examples.
TRAIN_DATA = [result for result in res if result is not None]
save_data('data/crime_training_data.json', TRAIN_DATA)

In [15]:
def load_data(file):
    """Read the UTF-8 JSON document at ``file`` and return the parsed object."""
    with open(file, "r", encoding='utf-8') as handle:
        return json.load(handle)

In [16]:
# Reload the training examples from disk, replacing the in-memory list built above.
# After the JSON round trip the entity tuples come back as lists.
TRAIN_DATA = load_data('data/crime_training_data.json')

In [17]:
#Model Training process(Takes around 1hr)

def train_spacy(data, iterations):
    """Train a blank English NER pipeline on ``data``.

    Args:
        data: Sequence of ``(text, {'entities': [(start, end, label), ...]})``
            training examples. It is not modified.
        iterations: Number of passes over the training data.

    Returns:
        The trained spaCy ``Language`` object.
    """
    # Shuffle a copy so the caller's list is not reordered in place.
    TRAIN_DATA = list(data)
    nlp = spacy.blank('en')
    # add_pipe returns the component that is actually in the pipeline. The
    # previous code added labels to a separate object made by create_pipe, so
    # the pipeline's own NER component never received them.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.add_pipe('ner', last=True)
    else:
        ner = nlp.get_pipe('ner')
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    # select_pipes / initialize are the spaCy v3 replacements for the
    # deprecated disable_pipes / begin_training.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print('Starting Itertions ' + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # NOTE(review): examples are still updated one at a time, so the
            # minibatch size has no effect on training; kept to preserve the
            # existing training behaviour.
            for batch in spacy.util.minibatch(TRAIN_DATA, size=2):
                for text, annotations in batch:
                    doc = nlp.make_doc(text)
                    example = Example.from_dict(doc, annotations)
                    nlp.update(
                            [example],
                            drop = 0.2,
                            sgd = optimizer,
                            losses = losses
                    )
            print(losses)
    return nlp

In [None]:


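# Training is left commented out, presumably because it is slow (see the "around 1hr"
# note on the training cell). Uncomment both lines to retrain and save the model into
# the 'crime_ner_model' directory, which the next cell loads.
# nlp = train_spacy(TRAIN_DATA, 30)
# nlp.to_disk('crime_ner_model')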

In [18]:
# Load the trained NER model from the 'crime_ner_model' directory.
# This fails unless that directory already exists on disk (it is written by the
# commented-out training cell above).
tlp = spacy.load('crime_ner_model')