In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Data/model_data.csv')

In [3]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, random_state=777, test_size=0.20, shuffle=True)

In [4]:
train_text = train['Processed_Text']
test_text = test['Processed_Text']

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,1), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['Processed_Text'], axis=1)
x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['Processed_Text'], axis=1)

In [6]:
def exact_match(y_test, prediction):
    return accuracy_score(y_test, prediction)

In [7]:
from sklearn.metrics import hamming_loss
def hamming_score(y_test, prediction):
    return hamming_loss(y_test, prediction)

In [16]:
from sklearn.metrics import jaccard_score
def jaccard(y_test, prediction):
    return jaccard_score(y_test, prediction, average = 'samples')

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier

In [19]:
lr_classifier = LogisticRegression(solver='sag', random_state = 7777, max_iter = 4000)
multi_target_lr = MultiOutputClassifier(lr_classifier, n_jobs=1)
lr_prediction = multi_target_lr.fit(x_train, y_train).predict(x_test);

In [20]:
exact_match(y_test, lr_prediction)

0.48211243611584326

In [21]:
hamming_score(y_test, lr_prediction)

0.014255631270111679

In [22]:
jaccard(y_test, lr_prediction)

0.5611395040696574

In [23]:
rf_classifier = RandomForestClassifier(random_state=7777)
multi_target_rf = MultiOutputClassifier(rf_classifier, n_jobs=1)
rf_prediction = multi_target_rf.fit(x_train, y_train).predict(x_test)

In [24]:
exact_match(y_test, rf_prediction)

0.5411697898921067

In [25]:
hamming_score(y_test, rf_prediction)

0.012611205754306265

In [26]:
jaccard(y_test, rf_prediction)

0.6257334847624455

In [47]:
nb_classifier = MultinomialNB(alpha = 1.0e-10)
multi_target_nb = MultiOutputClassifier(nb_classifier, n_jobs=1)
nb_prediction = multi_target_nb.fit(x_train, y_train).predict(x_test);

In [48]:
exact_match(y_test, nb_prediction)

0.49233390119250425

In [49]:
hamming_score(y_test, nb_prediction)

0.014149157675563127

In [50]:
jaccard(y_test, nb_prediction)

0.5981071360969146

In [55]:
from sklearn.multioutput import ClassifierChain
chain_lr = ClassifierChain(lr_classifier)
chain_lr_prediction = chain_lr.fit(x_train, y_train).predict(x_test)

In [56]:
exact_match(y_test, chain_lr_prediction)

0.5212947189097104

In [57]:
hamming_score(y_test, chain_lr_prediction)

0.01339201211432898

In [58]:
jaccard(y_test, chain_lr_prediction)

0.5972932046185879

In [59]:
chain_rf = ClassifierChain(rf_classifier)
chain_rf_prediction = chain_rf.fit(x_train, y_train).predict(x_test)

In [60]:
exact_match(y_test, chain_rf_prediction)

0.5406019307211811

In [61]:
hamming_score(y_test, chain_rf_prediction)

0.012552053757334848

In [62]:
jaccard(y_test, chain_rf_prediction)

0.6247870528109029

In [63]:
chain_nb = ClassifierChain(nb_classifier)
chain_nb_prediction = chain_nb.fit(x_train, y_train).predict(x_test)

In [64]:
exact_match(y_test, chain_nb_prediction)

0.48779102782509937

In [65]:
hamming_score(y_test, chain_nb_prediction)

0.014314783267083096

In [66]:
jaccard(y_test, chain_nb_prediction)

0.6002366079878857

In [67]:
key_list = list(vectorizer.vocabulary_.keys())
val_list = list(vectorizer.vocabulary_.values())

In [92]:
for (feature_index, feature) in enumerate (list(df.columns[1:])):
    coefficients = chain_lr.estimators_[feature_index].coef_[0]
    important_words_indexes = np.argpartition(coefficients, -10)[-10:]
    print(feature + ": ")
    for index in important_words_indexes:
        if (index < len(val_list)):
            print(key_list[val_list.index(index)])
    print()

'Tax': 
zuckerman
ciraolo
deputi
refund
return
incom
tax
ir
account
bank

'Coronavirus': 
ncdf
sba
ppp
fraud
pandem
coronavirus
loan
covid
19
vaccin

'False Claims Act': 
patient
unit
fha
contract
govern
claim
alleg
settlement
resolv
fals

'Health Care Fraud': 
strike
care
fraud
medic
medicar
hhs
oig
patient
medicaid
health

'Civil Rights': 
correct
disabl
constitut
hous
eric
dreiband
civil
ada
right
discrimin

'Hate Crimes': 
fire
motiv
cross
man
burn
crime
civil
african
right
hate

'Voting and Elections': 
poll
candid
committe
elect
registr
voter
polit
campaign
vote
contribut

'Environment': 
fish
pollut
natur
clean
epa
water
wildlif
environment
environ
resourc

'Human Trafficking': 
women
harbor
defend
human
minor
victim
prostitut
traffick
labor
sex

'Consumer Protection': 
product
protect
odomet
fda
consum
branch
civil
drug
food
lotteri

'Cyber Crime': 
network
cyberstalk
onlin
malwar
darknet
victim
cyber
comput
cryptocurr
hack

'Disability Rights': 
peopl
lambert
mental
right
paul