In [34]:
# Mount Google Drive inside the Colab VM so the dataset stored under
# MyDrive can be read with ordinary file paths.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [5]:
! pip install --upgrade gensim



In [6]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents
from gensim import corpora, models
from gensim import similarities

# Load the data

In [7]:
# Location of the controls dataset on the mounted Drive.
controls_csv = "/content/drive/MyDrive/Colab Notebooks/Information Retrieval/Controls_data.csv"

# Read the CSV into the frame used by every approach below, then show it.
df = pd.read_csv(controls_csv)

df

Unnamed: 0,control,description,control_id,label
0,Alert Police,"Armed robbery, in criminal law, aggravated for...",Armed Robbery,Crime
1,Alert Police,A biological attack is the intentional release...,Biochemical Attack,Terrorism
2,Alert Police,an attack or attacks on a place or area using ...,Bomb Attack,Terrorism
3,Alert Police,The theft of an automobile from its driver by ...,Carjacking,Crime
4,Alert Police,ATM fraud refers to fraud with the use of an A...,ATM Fraud,Crime


# Data Preprocessing

In [111]:
# Run every control description through gensim's preprocessing so that
# `docs` holds one token list per description.
docs = preprocess_documents(df["description"].tolist())

# Vocabulary (token <-> id) built from the tokenised descriptions, and the
# bag-of-words representation of each description.
dictionary = corpora.Dictionary(docs)
corpus = list(map(dictionary.doc2bow, docs))

# Number of documents in the corpus.
len(corpus)

5

# Topic Modelling Approach

In [112]:
# Fit the topic model on the bag-of-words corpus. LSI is the active choice;
# the commented-out alternatives below bind the same `model` name.
# NOTE(review): the corpus holds only a handful of documents, far fewer than
# num_topics=20 — confirm whether the topic count should be lowered.
model = models.LsiModel(corpus=corpus, num_topics=20, id2word=dictionary)

# Alternative: LDA
# model = models.LdaModel(corpus, id2word=dictionary, num_topics=20)

# Alternative: Ensemble LDA
# model = models.EnsembleLda(
#     corpus=corpus,
#     id2word=dictionary,
#     num_topics=20,
#     num_models=5,
#     topic_model_class=models.LdaModel
# )

In [113]:
# Transform the whole corpus with the fitted model and build the similarity
# index that queries are compared against.
index = similarities.MatrixSimilarity(corpus=model[corpus])

# Optional persistence of the index between sessions:
# index.save('model')
# index = similarities.MatrixSimilarity.load('model')

## Inference

In [114]:
doc = "Seven men have been arrested for allegedly duping several people at ATM kiosks across Delhi-NCR."

# Apply the same preprocessing as the corpus, map the tokens to
# bag-of-words with the shared dictionary, then project into the model's
# topic space.
doc = dictionary.doc2bow(preprocess_documents([doc])[0])
doc_vec = model[doc]

print(doc_vec)

[(0, 0.11791125732667256), (1, -0.5121795415170113), (2, 0.14342593280390473), (3, -0.13811709867390598), (4, -0.007245045659168044)]


In [115]:
# Score the query vector against every indexed document.
result = index[doc_vec]

# (position, score) pairs ordered from highest to lowest score.
result = sorted(enumerate(result), key=lambda item: -item[1])

for doc_position, doc_score in result:
    # enumerate() yields row positions, so look the control up positionally
    # with .iloc; a plain [] lookup is label-based and only matches when
    # the frame still has its default 0..n-1 index.
    print(doc_score, df.control_id.iloc[doc_position])

0.9322567 ATM Fraud
0.3377323 Bomb Attack
0.15725304 Biochemical Attack
-1.6075401e-09 Carjacking
-2.164143e-09 Armed Robbery


# Word Embeddings Approach

In [118]:
! pip install -q spacy

In [123]:
# Download the `en_core_web_lg` pipeline package through spaCy's CLI helper.
# This must run before the package import below, which would otherwise fail
# on an environment where the package is not yet installed.
import spacy.cli
spacy.cli.download("en_core_web_lg")

# Import the freshly installed package and load it; `nlp` is the pipeline
# object used to parse text in the cells that follow.
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [137]:
doc = "Seven men have been arrested for allegedly duping several people at ATM kiosks across Delhi-NCR."

# Apply the same gensim preprocessing that produced `docs`, then re-join the
# tokens into a single string that spaCy can parse.
# NOTE(review): gensim's default filters stem tokens, which may hurt
# word-vector lookups in spaCy — confirm this is intended.
doc = ' '.join(preprocess_documents([doc])[0])

# The query is identical for every comparison, so parse it once instead of
# once per corpus document.
doc2 = nlp(doc)

# Similarity of each corpus document to the query, in corpus order.
result = [nlp(" ".join(tokens)).similarity(doc2) for tokens in docs]

print(result)

[0.5890487623316997, 0.4947408615022601, 0.5740411488516963, 0.4884656125452332, 0.6235829108370134]


In [142]:
# Pair every control with its similarity score from the previous cell.
output = (
    df[['control_id']]
    .rename(columns={'control_id': 'Control'})
    .assign(Scores=result)
)

# Show the best-matching controls first.
output.sort_values(by='Scores', ascending=False)

Unnamed: 0,Control,Scores
4,ATM Fraud,0.623583
0,Armed Robbery,0.589049
2,Bomb Attack,0.574041
1,Biochemical Attack,0.494741
3,Carjacking,0.488466


# Zero-Shot Classification Approach

In [1]:
! pip install -q transformers

from transformers import pipeline

deberta_classifier = pipeline("zero-shot-classification",
                              model="Narsil/deberta-large-mnli-zero-cls")

Downloading:   0%|          | 0.00/729 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Some weights of the model checkpoint at Narsil/deberta-large-mnli-zero-cls were not used when initializing DebertaForSequenceClassification: ['config']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [9]:
text = "Seven men have been arrested for allegedly duping several people at ATM kiosks across Delhi-NCR."

# Candidate labels are the control identifiers from the dataset.
labels = df["control_id"].tolist()

# Single-label mode: the classifier is called with multi_label=False.
output = deberta_classifier(text, labels, multi_label=False)

# Keep only the label/score fields of the classifier result, as a table.
output = pd.DataFrame({field: output[field] for field in ('labels', 'scores')})

output

Unnamed: 0,labels,scores
0,ATM Fraud,0.878206
1,Armed Robbery,0.038311
2,Biochemical Attack,0.033892
3,Carjacking,0.031222
4,Bomb Attack,0.018369
