In [None]:
!pip install pycaret

from pycaret.utils import enable_colab 
enable_colab()

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/33/4d/792832e86c34eb7f8c06f1805f19ef72a2d38b11435502b69fca3409b84c/pycaret-2.2.2-py3-none-any.whl (249kB)
[K     |█▎                              | 10kB 16.9MB/s eta 0:00:01[K     |██▋                             | 20kB 22.0MB/s eta 0:00:01[K     |████                            | 30kB 11.5MB/s eta 0:00:01[K     |█████▎                          | 40kB 8.8MB/s eta 0:00:01[K     |██████▋                         | 51kB 4.7MB/s eta 0:00:01[K     |███████▉                        | 61kB 5.3MB/s eta 0:00:01[K     |█████████▏                      | 71kB 5.5MB/s eta 0:00:01[K     |██████████▌                     | 81kB 6.0MB/s eta 0:00:01[K     |███████████▉                    | 92kB 5.9MB/s eta 0:00:01[K     |█████████████▏                  | 102kB 6.3MB/s eta 0:00:01[K     |██████████████▍                 | 112kB 6.3MB/s eta 0:00:01[K     |███████████████▊                | 122kB 6.3MB/s eta 

In [None]:
!pip install autokeras
!pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc4

# Load the data

In [56]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Psioninsights/riskClassificationDataset.csv')

df = df.sample(frac = 1, random_state=2020)

df.reset_index(drop=False, inplace=True)

df

Unnamed: 0,index,Control,Description,RiskType01,RiskType02,Source,Title,pubDate
0,31684,Terrorism 28,The October 2019 issue features a brief on the...,Terrorism,Terrorism,EMM,Militant Leadership Monitor – October 2019,"Sun, 23 Aug 2020 17:50:00 +0200"
1,7299,,"Rwanda National Police (RNP) on Thursday, Octo...",Crime,Human Trafficking,EMM,Three arrested for selling banned bleaching cr...,"Thu, 22 Oct 2020 19:12:00 +0200"
2,7977,Environment 102,Three ministers have bought cows by using an o...,Environment,Disease,EMM,Ministers buy Eid cattle from Digital Haat,"Sat, 11 Jul 2020 17:15:00 +0200"
3,10289,Environment 170,Hong Kong’s privacy chief criticized the U.S. ...,Environment,Disease,EMM,Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...,"Sun, 09 Aug 2020 09:19:00 +0200"
4,6084,Crime 65,The disappeared suffer crimes from kidnapping ...,Crime,Human Trafficking,EMM,"From children to young men, more than 73,000 a...","Tue, 14 Jul 2020 10:56:00 +0200"
...,...,...,...,...,...,...,...,...
33672,30648,Terrorism 6,Terrorism U.S. Efforts to Deal Islamic State “...,Terrorism,Terrorism,EMM,U.S. Efforts to Deal Islamic State “Enduring D...,"Fri, 17 Jul 2020 18:28:00 +0200"
33673,20039,,"LOUISVILLE, Ky—U.S. Customs and Border Protect...",Financial Crime,Financial Crime,EMM,"Three Days, $3.7 Million of Counterfeits Seize...","Fri, 11 Sep 2020 17:56:00 +0200"
33674,32387,,At least one person died in the capital Conakr...,Terrorism,Terrorism,EMM,Post-election violence breaks out in Guinea,"Thu, 22 Oct 2020 10:21:00 +0200"
33675,1661,,At least seven people are still missing in Bei...,Armed Conflict,Interstate Conflict,EMM,Seven people still missing one month after the...,"Sun, 30 Aug 2020 10:29:00 +0200"


In [57]:
df.RiskType01.value_counts(normalize=True)

Environment                0.340559
Crime                      0.102741
Terrorism                  0.100751
Armed Conflict             0.090240
Manmade Disaster           0.065920
Natural Disaster           0.064465
Operations                 0.058883
Financial Crime            0.046768
Project                    0.042047
Internal/External Fraud    0.031030
Civil                      0.030258
Technology                 0.026338
Name: RiskType01, dtype: float64

In [58]:
df['text'] = df.Title + df.Description

riskTypes = ['Operations', 'Environment', 'Natural Disaster', 'Crime', 'Armed Conflict', 'Terrorism']

df['label'] = ['Other' if label not in riskTypes else label for label in df.RiskType01]

In [59]:
df = df[['label', 'text']]

df.dropna(inplace=True)

df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,label,text
0,Terrorism,Militant Leadership Monitor – October 2019The ...
1,Crime,Three arrested for selling banned bleaching cr...
2,Environment,Ministers buy Eid cattle from Digital HaatThre...
3,Environment,Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...
4,Crime,"From children to young men, more than 73,000 a..."
...,...,...
33670,Terrorism,U.S. Efforts to Deal Islamic State “Enduring D...
33671,Other,"Three Days, $3.7 Million of Counterfeits Seize..."
33672,Terrorism,Post-election violence breaks out in GuineaAt ...
33673,Armed Conflict,Seven people still missing one month after the...


# Text Cleaning

In [60]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def clean_text(data):

  nltk.download('wordnet')
  nltk.download('stopwords')

  stop_words = stopwords.words('english') #collate stopwords

  # Remove HTTP tags
  data = data.map(lambda x : ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))

  #Lower Case and remove leading/trailing spaces
  data = data.map(lambda x: x.lower().strip())

  #Remove punctuations
  data = data.map(lambda x: re.sub(r'[^\w\s]', '', x))

  #Remove unicodes
  data = data.map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

  #Remove numbers
  data = data.map(lambda x : ' '.join(re.sub(r'\w*\d+\w*', '', x).split()))

  # Remove stopwords
  data = data.map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

  # Lemmatize the text
  lemmer = WordNetLemmatizer()

  data = data.map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))

  # Remove stopwords
  data = data.map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

  return data

df['text'] = clean_text(df['text'])

df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,label,text
0,Terrorism,militant leadership monitor october october is...
1,Crime,three arrested selling banned bleaching creams...
2,Environment,minister buy eid cattle digital haatthree mini...
3,Environment,hong kong privacy chief say u doxxed sanctione...
4,Crime,child young men missing mexicothe disappeared ...
...,...,...
33670,Terrorism,u effort deal islamic state enduring defeat ho...
33671,Other,three day million counterfeit seized cbp louis...
33672,Terrorism,post election violence break guineaat least on...
33673,Armed Conflict,seven people still missing one month explosion...


# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

 ## Feature Engineering - Create tf-idf ##
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=500, strip_accents='ascii')

df_tf =  tfidf_vect.fit_transform(df['text'])

# Save the tf-idf

joblib.dump(tfidf_vect.vocabulary_, 'tfidf.joblib') 

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264697,0.0,0.0,0.0,0.0,0.0,0.247832,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.250475,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.467658,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.252692,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.240902,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.257343,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.294465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33670,0.0,0.0,0.169812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33671,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.236657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33672,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33673,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.162562,0.172788,0.0,0.0,0.0,0.0,0.0,0.197885,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Perform PCA (Dimensionality Reduction)

In [None]:
# from sklearn.decomposition import PCA
# import numpy as np

# pca = PCA(n_components=500, whiten=True)

# df_pca = pca.fit_transform(df_tf)

# np.sum(pca.explained_variance_ratio_) # Performance is bad. We should just forego PCA

Combine with label

In [None]:
df_train = pd.concat([df, df_tf], axis = 1)

df_train.drop(columns=['text'], inplace = True)

df_train

Unnamed: 0,label,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,Terrorism,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264697,0.0,0.0,0.0,0.0,0.0,0.247832,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.250475,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Crime,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.252692,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Environment,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.240902,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.257343,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Environment,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Crime,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.294465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33670,Terrorism,0.0,0.0,0.169812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33671,Other,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.236657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33672,Terrorism,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33673,Armed Conflict,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.162562,0.172788,0.0,0.0,0.0,0.0,0.0,0.197885,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Building

In [None]:
from pycaret.classification import *

exp_clf = setup(df_train, target = 'label', train_size=0.99, silent = True) 

Unnamed: 0,Description,Value
0,session_id,1615
1,Target,label
2,Target Type,Multiclass
3,Label Encoded,"Armed Conflict: 0, Crime: 1, Environment: 2, N..."
4,Original Data,"(33675, 501)"
5,Missing Values,False
6,Numeric Features,500
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
lr = create_model('lr', max_iter = 10000)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6671,0.9004,0.6361,0.66,0.6597,0.5692,0.5711
1,0.6815,0.904,0.6434,0.678,0.6746,0.5863,0.5891
2,0.6902,0.9128,0.6594,0.6819,0.6824,0.5994,0.6014
3,0.6905,0.9095,0.6618,0.6867,0.6847,0.5997,0.6016
4,0.6782,0.9076,0.6379,0.6744,0.6711,0.5824,0.5849
5,0.6818,0.9081,0.6392,0.6794,0.676,0.588,0.5899
6,0.6884,0.9133,0.6564,0.6847,0.6818,0.5962,0.5987
7,0.686,0.9086,0.6553,0.6841,0.6802,0.5929,0.5953
8,0.682,0.9068,0.645,0.6787,0.6755,0.5877,0.5899
9,0.6907,0.9101,0.6609,0.6883,0.6849,0.5994,0.6014


In [None]:
lr = finalize_model(lr)

Save the model

In [None]:
save_model(lr, 'Text_Classifier')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='label',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                 

# Benchmarking and ensembling

In [None]:
# best = compare_models(sort="Accuracy", fold=5) # 500 features

In [None]:
# mlp = create_model('mlp')

# catboost = create_model('catboost', task_type='GPU')

# lr = create_model('lr', max_iter = 10000)

# ridge = create_model('ridge')

# svm = create_model('svm')

# lda = create_model('lda')

# lgb = create_model('lightgbm')

In [None]:
# blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, lgb], method = 'hard')

In [None]:
# blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, catboost], method = 'soft')

# Testing unseen data

In [None]:
new_text = "Three ministers have bought cows by using an online marketplace, Digital Haat, for the Eid-ul-Azha. Dhaka North City Corporation, ICT Division, e-Commerce Association of Bangladesh or e-CAB, and Bangladesh Dairy Farm Association launched the platform on Saturday as part of efforts to keep people at home amid the coronavirus pandemic."

data = {'input_col': [new_text]}

df_data = pd.DataFrame(data)

df_data

Unnamed: 0,input_col
0,Three ministers have bought cows by using an o...


In [None]:
tf1 = joblib.load('tfidf.joblib') 

# Create new tfidfVectorizer with old vocabulary

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=500, strip_accents='ascii', 
                             stop_words = "english", lowercase = True,
                             vocabulary = tf1)

df_tf = tfidf_vect.fit_transform(df_data['input_col'])

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
lr = load_model('Text_Classifier')

predictions = predict_model(lr, data = df_tf)

Transformation Pipeline and Model Successfully Loaded


In [None]:
print(predictions.Label.values[0], predictions.Score.values[0])

Environment 0.9375


# Neural Network

Train-test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df_train = df_train.sample(frac=1, random_state=2020)

features = [f for f in df_train.columns if f != 'label']
target = 'label'

X, y = df_train[features], df_train[target]

y = LabelEncoder().fit_transform(y)

# separate into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2020)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(21423, 500) (5356, 500) (21423,) (5356,)


Train Model

In [None]:
from autokeras import StructuredDataClassifier

# define the search
search = StructuredDataClassifier(max_trials=15)

# perform the search
search.fit(x=X_train, y=y_train, verbose=True, epochs=5)

Trial 13 Complete [00h 06m 14s]
val_accuracy: 0.36501988768577576

Best val_accuracy So Far: 0.45188480615615845
Total elapsed time: 01h 23m 24s
INFO:tensorflow:Oracle triggered exit


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


In [None]:
# evaluate the model
loss, acc = search.evaluate(X_val, y_val, verbose=2)
print('Accuracy: %.3f' % acc)

# Recommender System (Content Based Filtering)

Load Controls data

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Psioninsights/Controls_data.csv")

df['description'] = df['description'].str.lower()

df

Unnamed: 0,control,description,control_id,label
0,Alert Police,"armed robbery, in criminal law, aggravated for...",Armed Robbery,Crime
1,Alert Police,a biological attack is the intentional release...,Biochemical Attack,Terrorism
2,Alert Police,an attack or attacks on a place or area using ...,Bomb Attack,Terrorism
3,Alert Police,the theft of an automobile from its driver by ...,Carjacking,Crime
4,Alert Police,atm fraud refers to fraud with the use of an a...,ATM Fraud,Crime


Predict the Category (label)

In [None]:
new_text = 'NOIDA: Seven men have been arrested for allegedly duping several people at ATM kiosks across Delhi-NCR, police said on Thursday. While one of the accused worked as a pickpocket and stole ATM cards from users, two others used to sell the same to two others who were directly engaged in duping ATM users at kiosks located at desolate places and those without guards.'

data = {'input_col': [new_text.lower()]}

df_data = pd.DataFrame(data)

df_data

Unnamed: 0,input_col
0,noida: seven men have been arrested for allege...


In [None]:
tf1 = joblib.load('tfidf.joblib') 

# Create new tfidfVectorizer with old vocabulary

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=500, strip_accents='ascii', 
                             stop_words = "english", lowercase = True,
                             vocabulary = tf1)

df_tf = tfidf_vect.fit_transform(df_data['input_col'])

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
lr = load_model('Text_Classifier')

predictions = predict_model(lr, data = df_tf)

category, score = predictions.Label.values[0], predictions.Score.values[0]

Transformation Pipeline and Model Successfully Loaded


In [None]:
print(category, score)

Crime 0.4636


## Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# filter
df_control = df[df['label'] == category]

# initialise new tfidf
tfidf_vect = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}',
                             max_features=1000, 
                             strip_accents='ascii',
                             stop_words = "english", 
                             lowercase = True)

tfidf_control = tfidf_vect.fit_transform(df_control['description'])

tfidf_input = tfidf_vect.transform(df_data['input_col'])


def get_similarity(input, control):

  scores = cosine_similarity(input, control)[0]

  scores = [float(i) for i in scores]

  return scores

In [None]:
def recommend(df_control):

  # top n results
  top_n = 1

  df_control['scores'] = get_similarity(tfidf_input, tfidf_control)

  df_control.sort_values('scores', ascending=False, inplace= True)

  return df_control[['control', 'control_id', 'label']][:top_n]

recommend(df_control)

Unnamed: 0,control,control_id,label
4,Alert Police,ATM Fraud,Crime


## Euclidean Distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# filter
df_control = df[df['label'] == category]

def get_similarity(input, control):

  scores = euclidean_distances(input, control)[0]

  scores = [float(i) for i in scores]

  return scores

def recommend(df_control):

  # top n results
  top_n = 1

  df_control['scores'] = get_similarity(tfidf_input, tfidf_control)

  df_control.sort_values('scores', ascending=True, inplace= True)

  return df_control[['control', 'control_id', 'label']][:top_n]

recommend(df_control)

Unnamed: 0,control,control_id,label
4,Alert Police,ATM Fraud,Crime


## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors

n_neighbors = 3
KNN = NearestNeighbors(n_neighbors, p=2)

KNN.fit(tfidf_control)

result = KNN.kneighbors(tfidf_input, return_distance=True)

result = [i for i in result[0][0]]

result

[1.0250029054115573, 1.3899780890898723, 1.4142135623730951]

In [None]:
# filter
df_control = df[df['label'] == category]

def recommend(df_control):

  # top n results
  top_n = 1

  df_control['scores'] = result

  df_control.sort_values('scores', ascending=False, inplace=True)

  return df_control[['control', 'control_id', 'label']][:top_n]

recommend(df_control)

Unnamed: 0,control,control_id,label
4,Alert Police,ATM Fraud,Crime


## Spacy Word Vectors

In [None]:
import spacy.cli
spacy.cli.download("en_core_web_lg")

import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# filter
df_control = df[df['label'] == category]

# clean text
df_control['description'] = clean_text(df_control['description'])
df_data['input_col'] = clean_text(df_data['input_col'])

result = []

for i in df_control['description']:
  doc1 = nlp(i)
  doc2 = nlp(df_data['input_col'][0])
  result.append(doc1.similarity(doc2))

print(result)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[0.8071099741430994, 0.719386392668716, 0.8362859538227402]


In [None]:
def recommend(df_control):

  # top n results
  top_n = 1

  df_control['scores'] = result

  df_control.sort_values('scores', ascending=False, inplace=True)

  return df_control[['control', 'control_id', 'label']][:top_n]

recommend(df_control)

Unnamed: 0,control,control_id,label
4,Alert Police,ATM Fraud,Crime
