In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset

In [2]:
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the CSV named processed_news_dataset.csv (relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='processed_news_dataset.csv')

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled split repeatable.
train, test = train_test_split(
    df,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)
# Pull the 'text' column out of each split as a NumPy array of unicode strings.
train_text, test_text = (
    part['text'].values.astype('U') for part in (train, test)
)

In [5]:
# Training targets: every column that remains once the text/metadata columns are removed.
y_train = train.drop(columns=['text', 'Article', 'Headline', 'Category'])

In [6]:
# Test targets: same column removal as for the training targets.
y_test = test.drop(columns=['text', 'Article', 'Headline', 'Category'])

In [15]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

# LabelPowerset, CountVectorizer

In [8]:
# Two-step pipeline: raw term counts, then a LabelPowerset multi-label wrapper.
# The wrapper's base estimator is left unset here; it is chosen via the
# "classifier__classifier" entry of the parameter grid.
pipe = Pipeline(steps=[
    ("vectorization", CountVectorizer()),
    ("classifier", LabelPowerset()),
])

In [9]:
# Search space for the pipeline: "vectorization__*" keys tune the vectorizer step,
# "classifier__classifier" swaps the base estimator inside the multi-label wrapper.
param_grid = {
    "vectorization__strip_accents": ['ascii'],
    "vectorization__analyzer": ['word'],
    # upper n-gram bound of 1, 2 or 3 words
    "vectorization__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorization__max_features": [2000, 3000, 4000, 5000],
    "classifier__classifier": [
        LogisticRegression(),
        MultinomialNB(),
        LinearSVC(),
    ],
}

In [10]:
# Exhaustive grid search over param_grid, using GridSearchCV's default CV and scoring.
clf1 = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [None]:
# Run the search: every grid combination is cross-validated on the training split.
clf1.fit(X=train_text, y=y_train)

# LabelPowerset, TfidfVectorizer

In [11]:
# Two-step pipeline: TF-IDF features, then a LabelPowerset multi-label wrapper.
# The wrapper's base estimator is left unset here; it is chosen via the
# "classifier__classifier" entry of the parameter grid.
pipe = Pipeline(steps=[
    ("vectorization", TfidfVectorizer()),
    ("classifier", LabelPowerset()),
])

In [12]:
# Search space for the pipeline: "vectorization__*" keys tune the vectorizer step,
# "classifier__classifier" swaps the base estimator inside the multi-label wrapper.
param_grid = {
    "vectorization__strip_accents": ['ascii'],
    "vectorization__analyzer": ['word'],
    # upper n-gram bound of 1, 2 or 3 words
    "vectorization__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorization__max_features": [2000, 3000, 4000, 5000],
    "classifier__classifier": [
        LogisticRegression(),
        MultinomialNB(),
        LinearSVC(),
    ],
}

In [13]:
# Exhaustive grid search over param_grid, using GridSearchCV's default CV and scoring.
clf2 = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [None]:
# Run the search: every grid combination is cross-validated on the training split.
clf2.fit(X=train_text, y=y_train)

# BinaryRelevance, TfidfVectorizer

In [17]:
# Two-step pipeline: TF-IDF features, then a BinaryRelevance multi-label wrapper.
# The wrapper's base estimator is left unset here; it is chosen via the
# "classifier__classifier" entry of the parameter grid.
pipe = Pipeline(steps=[
    ("vectorization", TfidfVectorizer()),
    ("classifier", BinaryRelevance()),
])

In [18]:
# Search space for the pipeline: "vectorization__*" keys tune the vectorizer step,
# "classifier__classifier" swaps the base estimator inside the multi-label wrapper.
param_grid = {
    "vectorization__strip_accents": ['ascii'],
    "vectorization__analyzer": ['word'],
    # upper n-gram bound of 1, 2 or 3 words
    "vectorization__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorization__max_features": [2000, 3000, 4000, 5000],
    "classifier__classifier": [
        LogisticRegression(),
        MultinomialNB(),
        LinearSVC(),
    ],
}

In [19]:
# Exhaustive grid search over param_grid, using GridSearchCV's default CV and scoring.
clf3 = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [None]:
# Run the search: every grid combination is cross-validated on the training split.
clf3.fit(X=train_text, y=y_train)

# BinaryRelevance, CountVectorizer

In [None]:
# Two-step pipeline for the BinaryRelevance + count-vector experiment:
# raw term counts, then a BinaryRelevance multi-label wrapper.
# This section is headed "BinaryRelevance", so the wrapper must be BinaryRelevance;
# using LabelPowerset here would simply repeat the first experiment (clf1).
# The wrapper's base estimator is left unset; it is chosen via the
# "classifier__classifier" entry of the parameter grid.
pipe = Pipeline(steps=[
    ("vectorization", CountVectorizer()),
    ("classifier", BinaryRelevance()),
])

In [None]:
# Search space for the pipeline: "vectorization__*" keys tune the vectorizer step,
# "classifier__classifier" swaps the base estimator inside the multi-label wrapper.
param_grid = {
    "vectorization__strip_accents": ['ascii'],
    "vectorization__analyzer": ['word'],
    # upper n-gram bound of 1, 2 or 3 words
    "vectorization__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorization__max_features": [2000, 3000, 4000, 5000],
    "classifier__classifier": [
        LogisticRegression(),
        MultinomialNB(),
        LinearSVC(),
    ],
}

In [None]:
# Exhaustive grid search over param_grid, using GridSearchCV's default CV and scoring.
clf4 = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [None]:
# Run the search: every grid combination is cross-validated on the training split.
clf4.fit(X=train_text, y=y_train)

# ClassifierChain

In [20]:
from skmultilearn.problem_transform import ClassifierChain

In [None]:
# TF-IDF over word n-grams of length 1 to 3, with accents folded to ASCII,
# L2 normalisation, and the feature count capped at 1500.
vec = TfidfVectorizer(
    analyzer='word',
    strip_accents='ascii',
    ngram_range=(1, 3),
    max_features=1500,
    norm='l2',
)

In [None]:
# Keep only the label columns whose sum over the training rows is positive,
# then restrict both target frames to that same column list.
selected_labels = y_train.columns[y_train.sum(axis=0, skipna=True).gt(0)].tolist()
y_train = y_train.filter(items=selected_labels, axis=1)
y_test = y_test.filter(items=selected_labels, axis=1)

In [None]:
# Learn the vocabulary and IDF weights from the training split only.
# A second fit() on test_text would replace everything learned here (fit() starts
# from scratch each time) and would build the features from the evaluation data,
# so the test split is deliberately not used for fitting.
vec.fit(train_text)

In [None]:
# Project both splits through the already-fitted vectorizer.
x_train, x_test = map(vec.transform, (train_text, test_text))

In [None]:
# Classifier chain with a SAG-solved logistic regression as the per-label model;
# fit on the training features, then score the test features as probabilities.
clf13 = ClassifierChain(classifier=LogisticRegression(solver='sag'))
clf13.fit(X=x_train, y=y_train)
clf13_proba = clf13.predict_proba(X=x_test)

In [None]:
# Display the dimensions of the predicted-probability matrix as a sanity check
# (presumably rows = test documents, columns = retained labels — confirm against output).
clf13_proba.shape

In [None]:
# Sweep the decision threshold from 0.05 to 0.59 in steps of 0.01 and record three
# metrics on the test split at each step.
# f1_score and hamming_loss must be imported from sklearn.metrics alongside
# accuracy_score (see the notebook's import cell); otherwise this cell raises NameError.
th = []   # thresholds as integer percentages: entry t stands for a cut-off of t/100
f = []    # micro-averaged F1 at each threshold
ham = []  # Hamming loss at each threshold
ac = []   # accuracy_score at each threshold
for t in range(5, 60):
    # Binarise the predicted probabilities at the current cut-off.
    y_pred_new = (clf13_proba >= t / 100).astype(int)
    th.append(t)
    ac.append(accuracy_score(y_test, y_pred_new))
    f.append(f1_score(y_test, y_pred_new, average="micro"))
    ham.append(hamming_loss(y_test, y_pred_new))

In [None]:
# Plot the three metric curves from the threshold sweep on one set of axes.
# NOTE(review): `plt` is not imported in any visible cell; this cell needs
# `import matplotlib.pyplot as plt` in the import cell to run on a fresh kernel.
plt.rcParams["figure.figsize"] = (12, 6)
with plt.style.context('ggplot'):
    plt.plot(th, f)
    plt.plot(th, ham)
    plt.plot(th, ac)
    # Legend entries follow the order of the plot() calls above.
    plt.legend(['F1', 'Hamming loss', 'Accuracy'], loc='center left', fontsize=14)
    plt.ylabel("metrics", fontsize=14)
    # th holds integer percentages (5..59, i.e. cut-offs 0.05..0.59), so say so on the axis.
    plt.xlabel("threshold (%)", fontsize=14)
    plt.title("Classifier Chain Model", fontsize=18)
plt.show()

In [12]:
# Show the best hyper-parameter combination found by a grid search.
# NOTE(review): `grid_search` is not defined in any visible cell (the searches in this
# notebook are clf1..clf4), and the recorded output uses step names 'vz'/'clf' rather
# than 'vectorization'/'classifier' — this looks like a leftover from an earlier
# version; confirm which search object is intended before re-running.
grid_search.best_params_

{'clf__classifier': LinearSVC(),
 'vz__analyzer': 'word',
 'vz__max_features': 2000,
 'vz__ngram_range': (1, 1),
 'vz__strip_accents': 'ascii'}