In [None]:
!pip install pycaret

from pycaret.utils import enable_colab 
enable_colab()

In [None]:
!pip install autokeras
!pip install git+https://github.com/keras-team/keras-tuner.git@1.0.2rc4

# Load the data

In [29]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Psioninsights/riskClassificationDataset.csv')

df = df.sample(frac = 1, random_state=2020)

df.reset_index(drop=False, inplace=True)

df

Unnamed: 0,index,Control,Description,RiskType01,RiskType02,Source,Title,pubDate
0,31684,Terrorism 28,The October 2019 issue features a brief on the...,Terrorism,Terrorism,EMM,Militant Leadership Monitor – October 2019,"Sun, 23 Aug 2020 17:50:00 +0200"
1,7299,,"Rwanda National Police (RNP) on Thursday, Octo...",Crime,Human Trafficking,EMM,Three arrested for selling banned bleaching cr...,"Thu, 22 Oct 2020 19:12:00 +0200"
2,7977,Environment 102,Three ministers have bought cows by using an o...,Environment,Disease,EMM,Ministers buy Eid cattle from Digital Haat,"Sat, 11 Jul 2020 17:15:00 +0200"
3,10289,Environment 170,Hong Kong’s privacy chief criticized the U.S. ...,Environment,Disease,EMM,Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...,"Sun, 09 Aug 2020 09:19:00 +0200"
4,6084,Crime 65,The disappeared suffer crimes from kidnapping ...,Crime,Human Trafficking,EMM,"From children to young men, more than 73,000 a...","Tue, 14 Jul 2020 10:56:00 +0200"
...,...,...,...,...,...,...,...,...
33672,30648,Terrorism 6,Terrorism U.S. Efforts to Deal Islamic State “...,Terrorism,Terrorism,EMM,U.S. Efforts to Deal Islamic State “Enduring D...,"Fri, 17 Jul 2020 18:28:00 +0200"
33673,20039,,"LOUISVILLE, Ky—U.S. Customs and Border Protect...",Financial Crime,Financial Crime,EMM,"Three Days, $3.7 Million of Counterfeits Seize...","Fri, 11 Sep 2020 17:56:00 +0200"
33674,32387,,At least one person died in the capital Conakr...,Terrorism,Terrorism,EMM,Post-election violence breaks out in Guinea,"Thu, 22 Oct 2020 10:21:00 +0200"
33675,1661,,At least seven people are still missing in Bei...,Armed Conflict,Interstate Conflict,EMM,Seven people still missing one month after the...,"Sun, 30 Aug 2020 10:29:00 +0200"


In [30]:
df.RiskType01.value_counts(normalize=True)

Environment                0.340559
Crime                      0.102741
Terrorism                  0.100751
Armed Conflict             0.090240
Manmade Disaster           0.065920
Natural Disaster           0.064465
Operations                 0.058883
Financial Crime            0.046768
Project                    0.042047
Internal/External Fraud    0.031030
Civil                      0.030258
Technology                 0.026338
Name: RiskType01, dtype: float64

In [31]:
df['text'] = df.Title + df.Description

riskTypes = ['Operations', 'Environment', 'Natural Disaster', 'Crime', 'Armed Conflict', 'Terrorism']

df['label'] = ['Other' if label not in riskTypes else label for label in df.RiskType01]

In [32]:
df = df[['label', 'text']]

df.dropna(inplace=True)

df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,label,text
0,Terrorism,Militant Leadership Monitor – October 2019The ...
1,Crime,Three arrested for selling banned bleaching cr...
2,Environment,Ministers buy Eid cattle from Digital HaatThre...
3,Environment,Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...
4,Crime,"From children to young men, more than 73,000 a..."
...,...,...
33670,Terrorism,U.S. Efforts to Deal Islamic State “Enduring D...
33671,Other,"Three Days, $3.7 Million of Counterfeits Seize..."
33672,Terrorism,Post-election violence breaks out in GuineaAt ...
33673,Armed Conflict,Seven people still missing one month after the...


# Text Cleaning

In [5]:
import re
from bs4 import BeautifulSoup
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def clean_text(data):

  nltk.download('wordnet')
  nltk.download('stopwords')

  stop_words = stopwords.words('english') #collate stopwords

  # Remove HTTP tags
  data = data.map(lambda x : ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))

  #Lower Case and remove leading/trailing spaces
  data = data.map(lambda x: x.lower().strip())

  #Remove punctuations
  data = data.map(lambda x: re.sub(r'[^\w\s]', '', x))

  #Remove unicodes
  data = data.map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

  #Remove numbers
  data = data.map(lambda x : ' '.join(re.sub(r'\w*\d+\w*', '', x).split()))

  # Remove stopwords
  data = data.map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

  # Lemmatize the text
  lemmer = WordNetLemmatizer()

  data = data.map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))

  # Remove stopwords
  data = data.map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

  return data

df['text'] = clean_text(df['text'])

df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,text
0,Terrorism,militant leadership monitor october october is...
1,Crime,three arrested selling banned bleaching creams...
2,Environment,minister buy eid cattle digital haatthree mini...
3,Environment,hong kong privacy chief say u doxxed sanctione...
4,Crime,child young men missing mexicothe disappeared ...
...,...,...
33670,Terrorism,u effort deal islamic state enduring defeat ho...
33671,Other,three day million counterfeit seized cbp louis...
33672,Terrorism,post election violence break guineaat least on...
33673,Armed Conflict,seven people still missing one month explosion...


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

 ## Feature Engineering - Create tf-idf ##
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=500, strip_accents='ascii')

df_tf =  tfidf_vect.fit_transform(df['text'])

# Save the tf-idf

joblib.dump(tfidf_vect.vocabulary_, 'tfidf.joblib') 

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264697,0.0,0.0,0.0,0.0,0.0,0.247832,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.250475,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.467658,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.252692,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.240902,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.257343,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.294465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33670,0.0,0.0,0.169812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33671,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.236657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33672,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33673,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.162562,0.172788,0.0,0.0,0.0,0.0,0.0,0.197885,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Perform PCA (Dimensionality Reduction)

In [7]:
# from sklearn.decomposition import PCA
# import numpy as np

# pca = PCA(n_components=500, whiten=True)

# df_pca = pca.fit_transform(df_tf)

# np.sum(pca.explained_variance_ratio_) # Performance is bad. We should just forego PCA

Combine with label

In [8]:
df_train = pd.concat([df, df_tf], axis = 1)

df_train.drop(columns=['text'], inplace = True)

df_train

Unnamed: 0,label,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,Terrorism,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264697,0.0,0.0,0.0,0.0,0.0,0.247832,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.250475,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Crime,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.252692,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Environment,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.240902,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.257343,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Environment,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Crime,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.294465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33670,Terrorism,0.0,0.0,0.169812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33671,Other,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.236657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33672,Terrorism,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33673,Armed Conflict,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.162562,0.172788,0.0,0.0,0.0,0.0,0.0,0.197885,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Building

In [9]:
from pycaret.classification import *

exp_clf = setup(df_train, target = 'label', train_size=0.99, silent = True) 

Unnamed: 0,Description,Value
0,session_id,634
1,Target,label
2,Target Type,Multiclass
3,Label Encoded,"Armed Conflict: 0, Crime: 1, Environment: 2, N..."
4,Original Data,"(33675, 501)"
5,Missing Values,False
6,Numeric Features,500
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
lr = create_model('lr', max_iter = 10000)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6806,0.9083,0.6433,0.6757,0.6726,0.585,0.588
1,0.6992,0.9164,0.6594,0.6927,0.6919,0.6106,0.6125
2,0.6908,0.9104,0.6575,0.6887,0.685,0.5992,0.6013
3,0.6728,0.9028,0.6385,0.6677,0.6657,0.5758,0.5783
4,0.6845,0.9074,0.6524,0.6812,0.6784,0.591,0.5932
5,0.6854,0.9062,0.6508,0.683,0.6794,0.5922,0.5943
6,0.6842,0.9068,0.6452,0.6802,0.6769,0.5898,0.5927
7,0.683,0.9071,0.6547,0.6815,0.6781,0.5899,0.5919
8,0.6721,0.9054,0.6408,0.668,0.6661,0.5756,0.5774
9,0.6823,0.908,0.658,0.6801,0.6779,0.5897,0.5912


In [11]:
lr = finalize_model(lr)

Save the model

In [12]:
save_model(lr, 'Text_Classifier')

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='label',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                 

# Benchmarking and ensembling

In [None]:
# best = compare_models(sort="Accuracy", fold=5) # 500 features

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6675,0.9161,0.5921,0.6746,0.655,0.5876,0.5972,1005.854
lr,Logistic Regression,0.6643,0.9145,0.594,0.666,0.6532,0.5861,0.5928,8.972
et,Extra Trees Classifier,0.6621,0.8763,0.5878,0.6621,0.6484,0.583,0.5901,20.386
ridge,Ridge Classifier,0.6608,0.0,0.582,0.6662,0.6463,0.5785,0.5889,0.292
lightgbm,Light Gradient Boosting Machine,0.6603,0.9143,0.5944,0.6592,0.6505,0.583,0.588,24.894
xgboost,Extreme Gradient Boosting,0.6596,0.9129,0.5854,0.6641,0.647,0.5785,0.5872,272.164
svm,SVM - Linear Kernel,0.6579,0.0,0.5921,0.6572,0.6447,0.5795,0.5861,1.742
lda,Linear Discriminant Analysis,0.6544,0.9028,0.5957,0.6628,0.6483,0.5755,0.5812,2.868
rf,Random Forest Classifier,0.6534,0.9041,0.5784,0.6542,0.6398,0.5719,0.5792,15.976
gbc,Gradient Boosting Classifier,0.6439,0.904,0.562,0.659,0.6282,0.5538,0.5692,153.812


In [None]:
# mlp = create_model('mlp')

# catboost = create_model('catboost', task_type='GPU')

# lr = create_model('lr', max_iter = 10000)

# ridge = create_model('ridge')

# svm = create_model('svm')

# lda = create_model('lda')

# lgb = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.6882,0.9148,0.6567,0.6842,0.6817,0.6062,0.608
1,0.6711,0.9117,0.6449,0.6644,0.6647,0.5857,0.5871
2,0.6714,0.9113,0.6456,0.6697,0.6664,0.5853,0.587
3,0.6922,0.9124,0.6672,0.6923,0.6878,0.6115,0.6134
4,0.6752,0.9116,0.6467,0.6709,0.6698,0.5911,0.5925
5,0.6888,0.9162,0.6531,0.6841,0.6826,0.6076,0.6092
6,0.6888,0.9161,0.6529,0.6843,0.6821,0.6072,0.609
7,0.6948,0.9129,0.6627,0.6881,0.6876,0.6152,0.6169
8,0.6873,0.9182,0.66,0.6858,0.6833,0.6062,0.6075
9,0.6858,0.9127,0.6546,0.6825,0.6803,0.6039,0.6054


In [None]:
# blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, lgb], method = 'hard')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7225,0.0,0.6987,0.7235,0.7178,0.6495,0.6517
1,0.7156,0.0,0.7052,0.7165,0.7132,0.6426,0.6436
2,0.7231,0.0,0.7023,0.7234,0.7195,0.651,0.6525
3,0.719,0.0,0.7006,0.7222,0.7157,0.6452,0.6474
4,0.716,0.0,0.6973,0.7179,0.7136,0.6424,0.6438
5,0.7246,0.0,0.7067,0.7264,0.723,0.654,0.655
6,0.7246,0.0,0.7062,0.7251,0.721,0.6531,0.6548
7,0.7224,0.0,0.7018,0.721,0.7188,0.6509,0.652
8,0.731,0.0,0.7162,0.7367,0.7299,0.6614,0.6631
9,0.7269,0.0,0.7093,0.7283,0.7239,0.656,0.6575


In [None]:
# blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, catboost], method = 'soft')

# Testing unseen data

In [46]:
new_text = "Three ministers have bought cows by using an online marketplace, Digital Haat, for the Eid-ul-Azha. Dhaka North City Corporation, ICT Division, e-Commerce Association of Bangladesh or e-CAB, and Bangladesh Dairy Farm Association launched the platform on Saturday as part of efforts to keep people at home amid the coronavirus pandemic."

data = {'input_col': [new_text]}

df_data = pd.DataFrame(data)

df_data

Unnamed: 0,input_col
0,Three ministers have bought cows by using an o...


In [47]:
tf1 = joblib.load('tfidf.joblib') 

# Create new tfidfVectorizer with old vocabulary

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=500, strip_accents='ascii', 
                             stop_words = "english", lowercase = True,
                             vocabulary = tf1)

df_tf = tfidf_vect.fit_transform(df_data['input_col'])

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.267261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
lr = load_model('Text_Classifier')

predictions = predict_model(lr, data = df_tf)

Transformation Pipeline and Model Successfully Loaded


In [50]:
print(predictions.Label.values[0], predictions.Score.values[0])

Environment 0.9375


# Neural Network

Train-test split

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df_train = df_train.sample(frac=1, random_state=2020)

features = [f for f in df_train.columns if f != 'label']
target = 'label'

X, y = df_train[features], df_train[target]

y = LabelEncoder().fit_transform(y)

# separate into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=2020)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(21423, 500) (5356, 500) (21423,) (5356,)


Train Model

In [None]:
from autokeras import StructuredDataClassifier

# define the search
search = StructuredDataClassifier(max_trials=15)

# perform the search
search.fit(x=X_train, y=y_train, verbose=True, epochs=5)

Trial 13 Complete [00h 06m 14s]
val_accuracy: 0.36501988768577576

Best val_accuracy So Far: 0.45188480615615845
Total elapsed time: 01h 23m 24s
INFO:tensorflow:Oracle triggered exit


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


In [None]:
# evaluate the model
loss, acc = search.evaluate(X_val, y_val, verbose=2)
print('Accuracy: %.3f' % acc)

# Recommender System (Content Based Filtering)

Load Controls data

In [277]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Psioninsights/riskClassificationDataset.csv')

df = df.sample(frac = 1, random_state=2020)

df.reset_index(drop=True, inplace=True)

df['text'] = df.Title + df.Description

riskTypes = ['Operations', 'Environment', 'Natural Disaster', 'Crime', 'Armed Conflict', 'Terrorism']

df['label'] = ['Other' if label not in riskTypes else label for label in df.RiskType01]

df = df[df.Control.notnull()]

df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Control,Description,RiskType01,RiskType02,Source,Title,pubDate,text,label
0,Terrorism 28,The October 2019 issue features a brief on the...,Terrorism,Terrorism,EMM,Militant Leadership Monitor – October 2019,"Sun, 23 Aug 2020 17:50:00 +0200",Militant Leadership Monitor – October 2019The ...,Terrorism
1,Environment 102,Three ministers have bought cows by using an o...,Environment,Disease,EMM,Ministers buy Eid cattle from Digital Haat,"Sat, 11 Jul 2020 17:15:00 +0200",Ministers buy Eid cattle from Digital HaatThre...,Environment
2,Environment 170,Hong Kong’s privacy chief criticized the U.S. ...,Environment,Disease,EMM,Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...,"Sun, 09 Aug 2020 09:19:00 +0200",Hong Kong Privacy Chief Says U.S. ‘Doxxed’ San...,Environment
3,Crime 65,The disappeared suffer crimes from kidnapping ...,Crime,Human Trafficking,EMM,"From children to young men, more than 73,000 a...","Tue, 14 Jul 2020 10:56:00 +0200","From children to young men, more than 73,000 a...",Crime
4,Natural Disaster 22,Headlights from a line of cars shine at dusk a...,Natural Disaster,Natural Disaster,EMM,"Powerful quake jolts Alaska towns, produces sm...","Wed, 22 Jul 2020 19:01:00 +0200","Powerful quake jolts Alaska towns, produces sm...",Natural Disaster
...,...,...,...,...,...,...,...,...,...
17643,Terrorism 31,"In a major breakthrough, the Karachi police on...",Terrorism,Terrorism,EMM,Six terrorists of 'RAW-backed separatist group...,"Sun, 19 Jul 2020 03:46:00 +0200",Six terrorists of 'RAW-backed separatist group...,Terrorism
17644,Technology 7,The US Cybersecurity and Infrastructure Securi...,Technology,General,EMM,U.S. Government Agencies Instructed to Patch W...,"Fri, 17 Jul 2020 17:56:00 +0200",U.S. Government Agencies Instructed to Patch W...,Other
17645,Environment 133,"MANILA, Philippines — President Rodrigo Dutert...",Environment,Disease,EMM,Duterte signs law on rescheduling opening of c...,"Mon, 20 Jul 2020 05:43:00 +0200",Duterte signs law on rescheduling opening of c...,Environment
17646,Internal/External Fraud 7,A man reads a message on data bundles from mob...,Internal/External Fraud,Internal/External Fraud,EMM,Econet appeals against Police’s search warrant,"Mon, 20 Jul 2020 22:51:00 +0200",Econet appeals against Police’s search warrant...,Other


Predict the Category (label)

In [278]:
new_text = 'Global Warming is causing more extreme weather conditions.'

data = {'input_col': [new_text]}

df_data = pd.DataFrame(data)

df_data

Unnamed: 0,input_col
0,Global Warming is causing more extreme weather...


In [279]:
tf1 = joblib.load('tfidf.joblib') 

# Create new tfidfVectorizer with old vocabulary

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                             max_features=500, strip_accents='ascii', 
                             stop_words = "english", lowercase = True,
                             vocabulary = tf1)

df_tf = tfidf_vect.fit_transform(df_data['input_col'])

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_tf

Unnamed: 0,access,accident,according,account,accused,across,act,action,activity,address,administration,afghanistan,africa,agency,agent,ago,aid,air,al,alleged,along,already,also,america,american,amid,among,amount,analysis,announced,another,anti,ap,application,applied,area,army,around,arrest,arrested,...,union,unit,united,university,update,use,used,using,vaccine,vehicle,via,victim,video,virus,want,war,warning,water,way,wednesday,week,well,went,west,wildfire,within,without,woman,work,worker,working,world,worth,would,xx,xxxx,year,yesterday,yet,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [280]:
lr = load_model('Text_Classifier')

predictions = predict_model(lr, data = df_tf)

category, score = predictions.Label.values[0], predictions.Score.values[0]

Transformation Pipeline and Model Successfully Loaded


In [281]:
print(category, score)

Environment 0.5579


Calculate similarity

In [282]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def recommend(text, label):
    
    global rec
    # Matching the genre with the dataset and reset the index
    df_new = df.append({'text': text, 'label': label}, ignore_index=True)
    data = df_new.loc[df_new['label'] == label]  
    data.reset_index(level = 0, inplace = True) 
  
    # Convert the index into series
    indices = pd.Series(data.index, index = data['text'])

    # load the tf-idf vocab
    tf1 = joblib.load('tfidf.joblib') 

    # Create new tfidfVectorizer with old vocabulary
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                                max_features=500, strip_accents='ascii', 
                                stop_words = "english", lowercase = True,
                                vocabulary = tf1)
    
    tfidf_matrix = tfidf_vect.fit_transform(data['text'])
    
    # Calculating the similarity measures based on Cosine Similarity
    similiarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Get the index corresponding to original_text
    idx = indices[text]
    # Get the pairwsie similarity scores 
    sim_scores = list(enumerate(similiarity[idx]))
    # Sort the controls
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Scores of the 5 most similar controls
    sim_scores = sim_scores[:6]
    # control indicies
    control_indices = [i[0] for i in sim_scores]
   
    # Top 5 controls
    rec = data[['text', 'Control']].iloc[control_indices]
    rec = rec[rec.text != text] # prevent the original text from showing
    if len(rec) > 5:
      rec = rec.iloc[:5]
    
    return rec

In [283]:
recommend(new_text, category)

Unnamed: 0,text,Control
570,Deep-sea misconceptions cause underestimation ...,Environment 164
1733,Global targets that reveal the social–ecologic...,Environment 103
15,CDP pioneers new temperature rating of compani...,Environment 98
733,IUCN launches Global Standard to boost impact ...,Environment 115
2289,Trees Are Still Our Best Defense Against Globa...,Environment 174


Using Eudilean_distance instead of cosine

In [284]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

def recommend(text, label):
    
    global rec
    # Matching the genre with the dataset and reset the index
    df_new = df.append({'text': text, 'label': label}, ignore_index=True)
    data = df_new.loc[df_new['label'] == label]  
    data.reset_index(level = 0, inplace = True) 
  
    # Convert the index into series
    indices = pd.Series(data.index, index = data['text'])

    # load the tf-idf vocab
    tf1 = joblib.load('tfidf.joblib') 

    # Create new tfidfVectorizer with old vocabulary
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', 
                                max_features=500, strip_accents='ascii', 
                                stop_words = "english", lowercase = True,
                                vocabulary = tf1)
    
    tfidf_matrix = tfidf_vect.fit_transform(data['text'])
    
    # Calculating the similarity measures based on Euclidean_distances
    similiarity = euclidean_distances(tfidf_matrix, tfidf_matrix)
    
    # Get the index corresponding to original_text
    idx = indices[text]
    # Get the pairwsie similarity scores 
    sim_scores = list(enumerate(similiarity[idx]))
    # Sort the controls
    sim_scores = sorted(sim_scores, key=lambda x: x[1])
    # Scores of the 5 most similar controls
    sim_scores = sim_scores[:6]
    # control indicies
    control_indices = [i[0] for i in sim_scores]
   
    # Top 5 controls
    rec = data[['text', 'Control']].iloc[control_indices]
    rec = rec[rec.text != text] # prevent the original text from showing
    if len(rec) > 5:
      rec = rec.iloc[:5]
    
    return rec

In [285]:
recommend(new_text, category)

Unnamed: 0,text,Control
570,Deep-sea misconceptions cause underestimation ...,Environment 164
1733,Global targets that reveal the social–ecologic...,Environment 103
15,CDP pioneers new temperature rating of compani...,Environment 98
733,IUCN launches Global Standard to boost impact ...,Environment 115
2289,Trees Are Still Our Best Defense Against Globa...,Environment 174
