In [None]:
!pip install pycaret

from pycaret.utils import enable_colab 
enable_colab()

# Load the data

In [25]:
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Psioninsights/riskClassificationDataset.csv')

df = df.sample(frac = 1, random_state=2020)

df.reset_index(drop=False, inplace=True)

df

Unnamed: 0,index,Control,Description,RiskType01,RiskType02,Source,Title,pubDate
0,17635,,A motorcyclist has been rushed to hospital aft...,Manmade Disaster,Manmade Disaster,EMM,The Met Police are carrying out enquiries,"Fri, 28 Aug 2020 17:06:00 +0200"
1,4986,Crime 6,A case has been filed against Mohammad Shahid ...,Crime,Human Trafficking,EMM,"Case filed against MP Papul, associates with M...","Thu, 09 Jul 2020 11:25:00 +0200"
2,24165,Terrorism 25,Principal Findings. What’s new? Lake Chad basi...,Terrorism,Terrorism,EMM,Nigeria: What role for the Multinational Joint...,"Tue, 07 Jul 2020 16:39:00 +0200"
3,13112,Environment 172,Both the US Food and Drug Administration and U...,Environment,General,EMM,FDA and consumers might go for ‘cell-based sea...,"Wed, 29 Jul 2020 16:24:00 +0200"
4,11340,,Country becomes first in Western Europe to hit...,Environment,Disease,EMM,Coronavirus cases in Spain top one million as ...,"Thu, 22 Oct 2020 11:59:00 +0200"
...,...,...,...,...,...,...,...,...
26776,18523,Natural Disaster 36,City reports no casualties; temblor is result ...,Natural Disaster,Natural Disaster,EMM,Tangshan earthquake aftershock of 1976 tremblo...,"Mon, 13 Jul 2020 03:01:00 +0200"
26777,11971,,It is also reported that the government will i...,Environment,Disease,EMM,National system to inform Ukrainians about sta...,"Wed, 18 Nov 2020 14:28:00 +0100"
26778,14966,Financial Crime 1,continues to be one of the acclaimed internati...,Financial Crime,Financial Crime,EMM,"Judge Malcolm Simmons, respected international...","Sat, 08 Aug 2020 16:34:00 +0200"
26779,7491,Environment 149,"By JEFF AMY and BEN NADLER, Associated Press. ...",Environment,Disease,EMM,Georgia gov sues to end cities' defiance on ma...,"Fri, 17 Jul 2020 10:11:00 +0200"


In [26]:
df.RiskType01.value_counts(normalize=True)

Environment                0.289646
Crime                      0.113215
Terrorism                  0.098129
Armed Conflict             0.091931
Operations                 0.074045
Natural Disaster           0.069676
Manmade Disaster           0.065532
Financial Crime            0.051940
Project                    0.045891
Civil                      0.035137
Internal/External Fraud    0.035100
Technology                 0.029760
Name: RiskType01, dtype: float64

In [27]:
df['text'] = df.Title + df.Description

riskTypes = ['Operations', 'Environment', 'Natural Disaster', 'Crime', 'Armed Conflict', 'Terrorism']

df['label'] = ['Other' if label not in riskTypes else label for label in df.RiskType01]

df = df[['label', 'text']]

df.dropna(inplace=True)

df.reset_index(drop=True, inplace=True)

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,label,text
0,Other,The Met Police are carrying out enquiriesA mot...
1,Crime,"Case filed against MP Papul, associates with M..."
2,Terrorism,Nigeria: What role for the Multinational Joint...
3,Environment,FDA and consumers might go for ‘cell-based sea...
4,Environment,Coronavirus cases in Spain top one million as ...
...,...,...
26774,Natural Disaster,Tangshan earthquake aftershock of 1976 tremblo...
26775,Environment,National system to inform Ukrainians about sta...
26776,Other,"Judge Malcolm Simmons, respected international..."
26777,Environment,Georgia gov sues to end cities' defiance on ma...


# Text Cleaning

In [28]:
import re
from bs4 import BeautifulSoup

# Remove HTTP tags
df['text'] = df['text'].map(lambda x : ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x).split()))

#Lower Case and remove leading/trailing spaces
df['text']= df['text'].map(lambda x: x.lower().strip())

#Remove punctuations
df['text']= df['text'].map(lambda x: re.sub(r'[^\w\s]', '', x))

#Remove unicodes
df['text']= df['text'].map(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

#Remove numbers
df['text'] = df['text'].map(lambda x : ' '.join(re.sub(r'\w*\d+\w*', '', x).split()))

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

Unnamed: 0,label,text
0,Other,the met police are carrying out enquiriesa mot...
1,Crime,case filed against mp papul associates with mo...
2,Terrorism,nigeria what role for the multinational joint ...
3,Environment,fda and consumers might go for cell based seaf...
4,Environment,coronavirus cases in spain top one million as ...
...,...,...
26774,Natural Disaster,tangshan earthquake aftershock of tremblor exp...
26775,Environment,national system to inform ukrainians about sta...
26776,Other,judge malcolm simmons respected international ...
26777,Environment,georgia gov sues to end cities defiance on mas...


In [29]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

stop_words = stopwords.words('english') #collate stopwords

# Remove stopwords
df['text']= df['text'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

# Lemmatize the text
lemmer = WordNetLemmatizer()

df['text']= df['text'].map(lambda x : ' '.join([lemmer.lemmatize(w) for w in x.split() if w not in stop_words]))

# Remove stopwords
df['text']= df['text'].map(lambda x : ' '.join([w for w in x.split() if w not in stop_words]))

df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,text
0,Other,met police carrying enquiriesa motorcyclist ru...
1,Crime,case filed mp papul associate motijheel police...
2,Terrorism,nigeria role multinational joint task force fi...
3,Environment,fda consumer might go cell based seafood label...
4,Environment,coronavirus case spain top one million pandemi...
...,...,...
26774,Natural Disaster,tangshan earthquake aftershock tremblor expert...
26775,Environment,national system inform ukrainian staying room ...
26776,Other,judge malcolm simmons respected international ...
26777,Environment,georgia gov sue end city defiance mask rulesby...


TF-IDF

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

 ## Feature Engineering - Create tf-idf ##
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1000, strip_accents='ascii')

df_tf =  tfidf_vect.fit_transform(df['text'])

df_tf = pd.DataFrame(df_tf.toarray(), columns=tfidf_vect.get_feature_names())

df_train = pd.concat([df, df_tf], axis = 1)

df_train.drop(columns=['text'], inplace = True)

df_train

Unnamed: 0,label,able,access,accident,according,account,accused,across,act,action,activity,actor,added,additional,address,administration,advised,affair,affected,afghan,afghanistan,afp,africa,african,afternoon,agency,agent,ago,agreed,agreement,ahead,aid,air,airport,al,alert,alleged,allegedly,allowed,almost,...,warning,washington,watch,water,wave,way,weapon,weather,website,wednesday,week,weekend,well,went,west,western,whether,white,wife,wildfire,wildlife,wind,within,without,woman,work,worker,working,world,worth,would,written,xinhua,xx,xxxx,year,yesterday,yet,york,young
0,Other,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.225704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Crime,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Terrorism,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.181873,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Environment,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.218562,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Environment,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26774,Natural Disaster,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26775,Environment,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333562,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26776,Other,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.161834,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26777,Environment,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Benchmarking

In [35]:
from pycaret.classification import * 

exp_clf = setup(df_train, target = 'label', train_size=0.99, pca = True) 

Unnamed: 0,Description,Value
0,session_id,7895
1,Target,label
2,Target Type,Multiclass
3,Label Encoded,"Armed Conflict: 0, Crime: 1, Environment: 2, N..."
4,Original Data,"(26779, 1001)"
5,Missing Values,False
6,Numeric Features,1000
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [None]:
# best = compare_models(sort="Accuracy", fold=5) # 500 features

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.6675,0.9161,0.5921,0.6746,0.655,0.5876,0.5972,1005.854
lr,Logistic Regression,0.6643,0.9145,0.594,0.666,0.6532,0.5861,0.5928,8.972
et,Extra Trees Classifier,0.6621,0.8763,0.5878,0.6621,0.6484,0.583,0.5901,20.386
ridge,Ridge Classifier,0.6608,0.0,0.582,0.6662,0.6463,0.5785,0.5889,0.292
lightgbm,Light Gradient Boosting Machine,0.6603,0.9143,0.5944,0.6592,0.6505,0.583,0.588,24.894
xgboost,Extreme Gradient Boosting,0.6596,0.9129,0.5854,0.6641,0.647,0.5785,0.5872,272.164
svm,SVM - Linear Kernel,0.6579,0.0,0.5921,0.6572,0.6447,0.5795,0.5861,1.742
lda,Linear Discriminant Analysis,0.6544,0.9028,0.5957,0.6628,0.6483,0.5755,0.5812,2.868
rf,Random Forest Classifier,0.6534,0.9041,0.5784,0.6542,0.6398,0.5719,0.5792,15.976
gbc,Gradient Boosting Classifier,0.6439,0.904,0.562,0.659,0.6282,0.5538,0.5692,153.812


In [None]:
best = compare_models(sort="Accuracy", fold=5) # 1000 features

In [None]:
# mlp = create_model('mlp')

# catboost = create_model('catboost', task_type='GPU')

lr = create_model('lr', max_iter = 10000)

ridge = create_model('ridge')

svm = create_model('svm')

lda = create_model('lda')

lgb = create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7202,0.916,0.7044,0.7248,0.7185,0.648,0.6494
1,0.7073,0.9128,0.6993,0.714,0.7078,0.6328,0.6338
2,0.7201,0.9184,0.6989,0.7255,0.7189,0.6475,0.6491
3,0.716,0.9167,0.7001,0.7219,0.7148,0.642,0.6435
4,0.7046,0.9133,0.6876,0.7102,0.7043,0.6288,0.6299
5,0.7231,0.9188,0.7058,0.7292,0.7232,0.6524,0.6534
6,0.7246,0.9199,0.7088,0.733,0.7237,0.6531,0.6549
7,0.7141,0.9158,0.6962,0.7188,0.7134,0.6408,0.6419
8,0.7261,0.9212,0.7159,0.7343,0.7263,0.6558,0.6576
9,0.7261,0.9192,0.7147,0.7327,0.726,0.656,0.6573


IntProgress(value=0, description='Processing: ', max=4)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


In [None]:
blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, lgb], method = 'hard')

In [None]:
blender_specific = blend_models(estimator_list = [lr, ridge, svm, lda, catboost], method = 'soft')

# Neural Network