# Class level multilabelling

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd 

In [None]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
df = pd.read_json('/content/drive/MyDrive/classification/dataset/2015_USPTO.json')
from sklearn.utils import shuffle
df = shuffle(df,random_state=101)

In [None]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
5646,"[H03M, G06F]",the present inventions are related to systems ...,error injection for ldpc retry validation,US08930794
45777,"[H01G, G01C, G01P]",the present invention relates to a variable ar...,variable area capacitor structure comb grid ca...,US08971012
20048,[A61B],a closure device and system for closing openin...,large hole closure device,US08945180
5582,[G06F],the present invention features a personal comp...,personal computing device having single cell b...,US08930727
34818,"[G01N, G01M, F02D]",a method of determining an air fuel ratio of a...,oxygen sensing method and apparatus,US08959987


In [None]:
# To apply class label from column we are converting label into string
df['Subclass_labels'] = df['Subclass_labels'].apply(str)

In [None]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
5646,"['H03M', 'G06F']",the present inventions are related to systems ...,error injection for ldpc retry validation,US08930794
45777,"['H01G', 'G01C', 'G01P']",the present invention relates to a variable ar...,variable area capacitor structure comb grid ca...,US08971012
20048,['A61B'],a closure device and system for closing openin...,large hole closure device,US08945180
5582,['G06F'],the present invention features a personal comp...,personal computing device having single cell b...,US08930727
34818,"['G01N', 'G01M', 'F02D']",a method of determining an air fuel ratio of a...,oxygen sensing method and apparatus,US08959987


In [None]:
# Applying regular expression to the subclass for extracting class labels 
import re
cleaned_labels = [tuple(re.sub("\[|\]|\'", '', x).split(', ')) for x in df.Subclass_labels.values]

In [None]:
df_classes = []
for label_tuple in cleaned_labels:
  list_lbl = list(label_tuple)
  arr_class = []
  for indv_lbl in list_lbl:
    if indv_lbl[0] not in arr_class:
      arr_class.append(indv_lbl[0])
  df_classes.append(tuple(arr_class))
df.Subclass_labels = df_classes  

In [None]:
df.head(10)

Unnamed: 0,Subclass_labels,Abstract,Title,No
5646,"(H, G)",the present inventions are related to systems ...,error injection for ldpc retry validation,US08930794
45777,"(H, G)",the present invention relates to a variable ar...,variable area capacitor structure comb grid ca...,US08971012
20048,"(A,)",a closure device and system for closing openin...,large hole closure device,US08945180
5582,"(G,)",the present invention features a personal comp...,personal computing device having single cell b...,US08930727
34818,"(G, F)",a method of determining an air fuel ratio of a...,oxygen sensing method and apparatus,US08959987
32179,"(H,)",a switching circuit a charge sense amplifier a...,switching circuit charge sense amplifier inclu...,US08957361
11541,"(B,)",a composite layup is formed on a tool and plac...,method for forming and applying composite layu...,US08936695
17811,"(G, B)",an apparatus for determining a state parameter...,apparatus and method for determining a state p...,US08942949
30895,"(E,)",in a method and a device for the treatment of ...,method for the treatment of layers as well as ...,US08956076
26000,"(A,)",a hip rotation training system for assisting i...,hip rotation training system,US08951136


In [None]:
# Combining abstract and title 
abstracts = df.Abstract.values
titles = df.Title.values

In [None]:
for i in range(len(abstracts)):
  abstracts[i] = titles[i] + ' : ' + abstracts[i]

In [None]:
df.Abstract = abstracts

In [None]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
5646,"(H, G)",error injection for ldpc retry validation : th...,error injection for ldpc retry validation,US08930794
45777,"(H, G)",variable area capacitor structure comb grid ca...,variable area capacitor structure comb grid ca...,US08971012
20048,"(A,)",large hole closure device : a closure device a...,large hole closure device,US08945180
5582,"(G,)",personal computing device having single cell b...,personal computing device having single cell b...,US08930727
34818,"(G, F)",oxygen sensing method and apparatus : a method...,oxygen sensing method and apparatus,US08959987


In [None]:
df =df[reversed(df.columns)]
df.head()

Unnamed: 0,No,Title,Abstract,Subclass_labels
5646,US08930794,error injection for ldpc retry validation,error injection for ldpc retry validation : th...,"(H, G)"
45777,US08971012,variable area capacitor structure comb grid ca...,variable area capacitor structure comb grid ca...,"(H, G)"
20048,US08945180,large hole closure device,large hole closure device : a closure device a...,"(A,)"
5582,US08930727,personal computing device having single cell b...,personal computing device having single cell b...,"(G,)"
34818,US08959987,oxygen sensing method and apparatus,oxygen sensing method and apparatus : a method...,"(G, F)"


In [None]:
# Multilabelbinarizar to convert subclass labels column to binary labels 
mlb = MultiLabelBinarizer()

In [None]:
# Applying the fit transformer to the subclass label column and genrating columns for them
df[mlb.classes_] = mlb.fit_transform(df['Subclass_labels'])

In [None]:
label_cols = list(mlb.classes_)

In [None]:
df.head()

Unnamed: 0,No,Title,Abstract,Subclass_labels,A,B,C,D,E,F,G,H
5646,US08930794,error injection for ldpc retry validation,error injection for ldpc retry validation : th...,"(H, G)",0,0,0,0,0,0,1,1
45777,US08971012,variable area capacitor structure comb grid ca...,variable area capacitor structure comb grid ca...,"(H, G)",0,0,0,0,0,0,1,1
20048,US08945180,large hole closure device,large hole closure device : a closure device a...,"(A,)",1,0,0,0,0,0,0,0
5582,US08930727,personal computing device having single cell b...,personal computing device having single cell b...,"(G,)",0,0,0,0,0,0,1,0
34818,US08959987,oxygen sensing method and apparatus,oxygen sensing method and apparatus : a method...,"(G, F)",0,0,0,0,0,1,1,0


In [None]:
# Spliting train test size 
TRAIN_VAL_RATIO = 0.9
LEN = df.shape[0]
SIZE_TRAIN = int(TRAIN_VAL_RATIO*LEN)

train = df[:SIZE_TRAIN]
test = df[SIZE_TRAIN:]

In [None]:
print(train.shape)
print(test.shape)

(44910, 12)
(4990, 12)


In [None]:
print('checking label they have only class 0 in labels in train \n')
for label in label_cols:
  ex = 1 in list(train[label])
  if ex == False:
    print(label)

print('checking label they have only class 0 in labels in test \n')
for label in label_cols:
  ex = 1 in list(test[label])
  if ex == False:
    print(label)   

checking label they have only class 0 in labels in train 

checking label they have only class 0 in labels in test 



In [None]:
list(mlb.classes_)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

In [None]:
label_cols  = list(mlb.classes_)

In [None]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

In [None]:
def tokenize(s): 
  return re_tok.sub(r' \1 ', s).split()

In [None]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

In [None]:
trn_term_doc = vec.fit_transform(train['Abstract'])
test_term_doc = vec.transform(test['Abstract'])

In [None]:
trn_term_doc, test_term_doc

(<44910x174030 sparse matrix of type '<class 'numpy.float64'>'
 	with 5327115 stored elements in Compressed Sparse Row format>,
 <4990x174030 sparse matrix of type '<class 'numpy.float64'>'
 	with 578335 stored elements in Compressed Sparse Row format>)

In [None]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [None]:
x = trn_term_doc
test_x = test_term_doc

In [None]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=False, max_iter=1000)
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [None]:
preds = np.zeros((len(test), len(label_cols)))

In [None]:
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit A
fit B
fit C
fit D
fit E
fit F
fit G
fit H


In [None]:
print(preds)

[[3.64375819e-03 1.81673894e-02 2.18872010e-03 ... 5.28716707e-03
  3.22206779e-01 8.28801911e-01]
 [9.00032618e-03 2.54708279e-02 1.99511053e-04 ... 1.12545314e-01
  7.12055812e-01 5.47458021e-01]
 [4.25045299e-02 1.44505818e-01 6.88323693e-03 ... 2.17877476e-02
  4.14003889e-01 1.86307309e-01]
 ...
 [1.34154721e-02 2.13601060e-01 1.62492664e-03 ... 8.68449432e-01
  2.61762377e-01 5.17222778e-02]
 [2.42783336e-02 9.43146482e-01 2.84773087e-03 ... 3.07418153e-01
  2.86885931e-01 4.26066560e-03]
 [4.03430221e-03 5.17214774e-02 1.18547474e-03 ... 9.15148214e-03
  9.51813645e-01 2.58682726e-01]]


In [None]:
y_te_true = test[label_cols].values
print(y_te_true)

[[0 0 0 ... 0 0 1]
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 1 0]
 ...
 [0 1 0 ... 1 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]


In [None]:
def get_pred(x):
  if x >= 0.4:
    return 1  
  return 0

In [None]:
y_te_pred = [[get_pred(pred) for pred in row] for row in preds]

In [None]:
print(y_te_pred)

[[0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1], [0, 1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 1, 1], [1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0

In [None]:
from sklearn.metrics import f1_score, precision_score, accuracy_score

In [None]:
f1_score(y_te_true, y_te_pred, average='micro')

0.7842755522276302

In [None]:
precision_score(y_te_true, y_te_pred, average='micro')

0.8100541376643465

In [None]:
accuracy_score(y_te_true, y_te_pred) 

0.5601202404809619

for 606 labels 

In [1]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd 

In [2]:
# # experiment
# import statsmodels.api as sm
# from sklearn.multioutput import MultiOutputClassifier
# from sklearn.svm import LinearSVC

In [3]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
df_2006 = pd.read_json('2006_USPTO.json')
df_2007 = pd.read_json('2007_USPTO.json')
df_2008 = pd.read_json('2008_USPTO.json')
df_2009 = pd.read_json('2009_USPTO.json')
df_2010 = pd.read_json('2010_USPTO.json')
df_2011 = pd.read_json('2011_USPTO.json')
df_2012 = pd.read_json('2012_USPTO.json')
df_2013 = pd.read_json('2013_USPTO.json')
df_2014 = pd.read_json('2014_USPTO.json')
df_2015 = pd.read_json('2015_USPTO.json')

# df = pd.read_json('/content/drive/MyDrive/classification/dataset/2015_USPTO.json')

df = pd.concat([df_2006, df_2007, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014, df_2015], axis=0, ignore_index=True)
from sklearn.utils import shuffle
df = shuffle(df,random_state=101)

In [5]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
778986,[B05B],a spray gun has a fluid reservoir which contai...,apparatus for spraying liquids and disposable ...,US07798426
1295732,[H01J],a semiconductor light emitting device includes...,semiconductor light emitting device,US08106579
1244791,[F23Q],a fire starter device for survival or emergenc...,survival tool fire starter with mischmetal fli...,US08186995
613827,[A61B],a multi layer method for detecting atrial arrh...,method and apparatus for detection of tachyarr...,US07537569
594040,[B65B],a wrapping device is mounted at an exterior of...,wrapping device,US07478517


In [6]:
# df['Subclass_labels'] = df['Subclass_labels'].apply(str)

In [7]:
# For an exprriment 
df = df[:1000000]

In [8]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
778986,[B05B],a spray gun has a fluid reservoir which contai...,apparatus for spraying liquids and disposable ...,US07798426
1295732,[H01J],a semiconductor light emitting device includes...,semiconductor light emitting device,US08106579
1244791,[F23Q],a fire starter device for survival or emergenc...,survival tool fire starter with mischmetal fli...,US08186995
613827,[A61B],a multi layer method for detecting atrial arrh...,method and apparatus for detection of tachyarr...,US07537569
594040,[B65B],a wrapping device is mounted at an exterior of...,wrapping device,US07478517


In [9]:
abstracts = df.Abstract.values
titles = df.Title.values

In [10]:
for i in range(len(abstracts)):
  abstracts[i] = titles[i] + ' : ' + abstracts[i]

In [11]:
df.Abstract = abstracts

In [12]:
df.head()

Unnamed: 0,Subclass_labels,Abstract,Title,No
778986,[B05B],apparatus for spraying liquids and disposable ...,apparatus for spraying liquids and disposable ...,US07798426
1295732,[H01J],semiconductor light emitting device : a semico...,semiconductor light emitting device,US08106579
1244791,[F23Q],survival tool fire starter with mischmetal fli...,survival tool fire starter with mischmetal fli...,US08186995
613827,[A61B],method and apparatus for detection of tachyarr...,method and apparatus for detection of tachyarr...,US07537569
594040,[B65B],wrapping device : a wrapping device is mounted...,wrapping device,US07478517


In [13]:
df =df[reversed(df.columns)]
df.head()

Unnamed: 0,No,Title,Abstract,Subclass_labels
778986,US07798426,apparatus for spraying liquids and disposable ...,apparatus for spraying liquids and disposable ...,[B05B]
1295732,US08106579,semiconductor light emitting device,semiconductor light emitting device : a semico...,[H01J]
1244791,US08186995,survival tool fire starter with mischmetal fli...,survival tool fire starter with mischmetal fli...,[F23Q]
613827,US07537569,method and apparatus for detection of tachyarr...,method and apparatus for detection of tachyarr...,[A61B]
594040,US07478517,wrapping device,wrapping device : a wrapping device is mounted...,[B65B]


In [14]:
# df = df[:100000]

In [15]:
df.shape

(1000000, 4)

In [16]:
mlb = MultiLabelBinarizer()

In [17]:
df[mlb.classes_] = mlb.fit_transform(df['Subclass_labels'])

In [18]:
df.head()

Unnamed: 0,No,Title,Abstract,Subclass_labels,A01B,A01C,A01D,A01F,A01G,A01H,...,H04Q,H04R,H04S,H04W,H05B,H05C,H05F,H05G,H05H,H05K
778986,US07798426,apparatus for spraying liquids and disposable ...,apparatus for spraying liquids and disposable ...,[B05B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295732,US08106579,semiconductor light emitting device,semiconductor light emitting device : a semico...,[H01J],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1244791,US08186995,survival tool fire starter with mischmetal fli...,survival tool fire starter with mischmetal fli...,[F23Q],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613827,US07537569,method and apparatus for detection of tachyarr...,method and apparatus for detection of tachyarr...,[A61B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594040,US07478517,wrapping device,wrapping device : a wrapping device is mounted...,[B65B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
a = list(df)
a.remove('No')
a.remove('Title')
a.remove('Abstract')
a.remove('Subclass_labels')

l = range(0, len(df.columns)-4)
df1 = pd.DataFrame({'Label': a, 'Coded':l})

In [20]:
df = df.rename(columns=dict(zip(df1["Label"], df1["Coded"])))

In [21]:
df.head()

Unnamed: 0,No,Title,Abstract,Subclass_labels,0,1,2,3,4,5,...,618,619,620,621,622,623,624,625,626,627
778986,US07798426,apparatus for spraying liquids and disposable ...,apparatus for spraying liquids and disposable ...,[B05B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1295732,US08106579,semiconductor light emitting device,semiconductor light emitting device : a semico...,[H01J],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1244791,US08186995,survival tool fire starter with mischmetal fli...,survival tool fire starter with mischmetal fli...,[F23Q],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613827,US07537569,method and apparatus for detection of tachyarr...,method and apparatus for detection of tachyarr...,[A61B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
594040,US07478517,wrapping device,wrapping device : a wrapping device is mounted...,[B65B],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
print(df1.head())

  Label  Coded
0  A01B      0
1  A01C      1
2  A01D      2
3  A01F      3
4  A01G      4


In [23]:
df = shuffle(df,random_state=100)

In [24]:
TRAIN_VAL_RATIO = 0.8
LEN = df.shape[0]
SIZE_TRAIN = int(TRAIN_VAL_RATIO*LEN)

train = df[:SIZE_TRAIN]
test = df[SIZE_TRAIN:]

In [25]:
print(train.shape)
print(test.shape)

(800000, 632)
(200000, 632)


In [26]:
label_cols  = list(df1['Coded'])
print(label_cols)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [27]:
cols_to_drop = []
print('checking label they have only class 0 in labels in train \n')
for label in label_cols:
  ex = 1 in list(train[label])
  if ex == False:
    print(label)
    cols_to_drop.append(label)

print('checking label they have only class 0 in labels in test \n')
for label in label_cols:
  ex = 1 in list(test[label])
  if ex == False:
    print(label)    
    cols_to_drop.append(label)

checking label they have only class 0 in labels in train 

294
336
checking label they have only class 0 in labels in test 

83
296
309
318
359
361
448
561


In [28]:
print(cols_to_drop)
print(len(cols_to_drop))

[294, 336, 83, 296, 309, 318, 359, 361, 448, 561]
10


In [29]:
# Dropping those specific columns having just single class
train = train.drop(cols_to_drop, axis = 1)
test = test.drop(cols_to_drop, axis = 1)

In [30]:
# Dropping those columns from list of column labels 
label_cols = set(label_cols) - set(cols_to_drop)
label_cols = list(label_cols)
print(type(label_cols))
print(label_cols)

<class 'list'>
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219

In [31]:
print(len(label_cols))

618


In [32]:
# After dropping columns having just one label check again
print('checking label they have only class 0 in labels in train \n')
for label in label_cols:
  ex = 1 in list(train[label])
  if ex == False:
    print(label)

print('checking label they have only class 0 in labels in test \n')
for label in label_cols:
  ex = 1 in list(test[label])
  if ex == False:
    print(label)

checking label they have only class 0 in labels in train 

checking label they have only class 0 in labels in test 



In [33]:
# print y_train[0:10]
# y_train.to_csv(path='ytard.csv')

# if len(np.sum(y_train)) in [len(y_train),0]:
#     print "all one class"
#     #do something else
# else:
#     #OK to proceed
#     lenreg.fit(X_train, y_train)
#     y_pred = lenreg.predict(X_test)
#     print metics.accuracy_score(y_test, y_pred)

In [34]:
# exists = 1 in y_train['H03F']
# print(exists)

In [35]:
# print(y_test)

In [36]:
# if len(np.sum(y_test)) in [len(y_test),0]:
#     print("all one class")
#     #do something else
# else:
#     #OK to proceed
#     # lenreg.fit(X_train, y_train)
#     # y_pred = lenreg.predict(X_test)
#     # print metics.accuracy_score(y_test, y_pred)
#     print('okay')

In [37]:
# counter = 0
# for c in label_cols:
#   exists = 0 in y_test[c]
#   # print(exists)
#   # break
#   if exists == False:
#     # print(c)
#     counter +=1

In [38]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

In [39]:
def tokenize(s): 
  return re_tok.sub(r' \1 ', s).split()

In [40]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1 )

In [41]:
trn_term_doc = vec.fit_transform(train['Abstract'])
test_term_doc = vec.transform(test['Abstract'])

In [42]:
trn_term_doc, test_term_doc

(<800000x1625433 sparse matrix of type '<class 'numpy.float64'>'
 	with 107242511 stored elements in Compressed Sparse Row format>,
 <200000x1625433 sparse matrix of type '<class 'numpy.float64'>'
 	with 26475217 stored elements in Compressed Sparse Row format>)

In [43]:
def pr(y_i, y):
    p = x[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

In [44]:
x = trn_term_doc
test_x = test_term_doc

In [45]:
# experiment 
# def get_mdl(y):
#     y = y.values
#     r = np.log(pr(1,y) / pr(0,y))
#     m = LinearSVC(random_state=42)
#     m = MultiOutputClassifier(m, n_jobs=-1)
#     x_nb = x.multiply(r)
#     return m.fit(x_nb, y), r

In [46]:
def get_mdl(y):
    y = y.values
    r = np.log(pr(1,y) / pr(0,y))
    m = LogisticRegression(C=4, dual=False, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [47]:
preds = np.zeros((len(test), len(label_cols)))

In [48]:
# print y_train[0:10]
# y_train.to_csv(path='ytard.csv')

# if len(np.sum(y_train)) in [len(y_train),0]:
#     print "all one class"
#     #do something else
# else:
#     #OK to proceed
#     lenreg.fit(X_train, y_train)
#     y_pred = lenreg.predict(X_test)
#     print metics.accuracy_score(y_test, y_pred)

In [51]:
import datetime
from datetime import datetime
start = datetime.now()
print(start)

2021-12-26 20:48:07.184468


In [52]:
for i, j in enumerate(label_cols):
    print('fit', j)
    m,r = get_mdl(train[j])
    preds[:,i] = m.predict_proba(test_x.multiply(r))[:,1]

fit 0
fit 1
fit 2
fit 3
fit 4
fit 5
fit 6
fit 7
fit 8
fit 9
fit 10
fit 11
fit 12
fit 13
fit 14
fit 15
fit 16
fit 17
fit 18
fit 19
fit 20
fit 21
fit 22
fit 23
fit 24
fit 25
fit 26
fit 27
fit 28
fit 29
fit 30
fit 31
fit 32
fit 33
fit 34
fit 35
fit 36
fit 37
fit 38
fit 39
fit 40
fit 41
fit 42
fit 43
fit 44
fit 45
fit 46
fit 47
fit 48
fit 49
fit 50
fit 51
fit 52
fit 53
fit 54
fit 55
fit 56
fit 57
fit 58
fit 59
fit 60
fit 61
fit 62
fit 63
fit 64
fit 65
fit 66
fit 67
fit 68
fit 69
fit 70
fit 71
fit 72
fit 73
fit 74
fit 75
fit 76
fit 77
fit 78
fit 79
fit 80
fit 81
fit 82
fit 84
fit 85
fit 86
fit 87
fit 88
fit 89
fit 90
fit 91
fit 92
fit 93
fit 94
fit 95
fit 96
fit 97
fit 98
fit 99
fit 100
fit 101
fit 102
fit 103
fit 104
fit 105
fit 106
fit 107
fit 108
fit 109
fit 110
fit 111
fit 112
fit 113
fit 114
fit 115
fit 116
fit 117
fit 118
fit 119
fit 120
fit 121
fit 122
fit 123
fit 124
fit 125
fit 126
fit 127
fit 128
fit 129
fit 130
fit 131
fit 132
fit 133
fit 134
fit 135
fit 136
fit 137
fit 138
fit 1

In [53]:
finish  = datetime.now()

print(finish-start)

13:18:08.646439


In [54]:
#np.savetxt('preds.txt',preds)

In [55]:
#print(type(preds))
#print(preds.shape)
#print(preds)

In [56]:
y_te_true = test[label_cols].values
print(y_te_true)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [51]:
#np.savetxt('y_te_true.txt',preds)

In [57]:
print(preds[0])
print(y_te_true[0])

[2.15948171e-05 1.69717513e-05 2.28427278e-05 7.60024332e-06
 3.38807349e-05 1.97763518e-05 2.00846954e-05 8.59565853e-05
 4.11127099e-06 1.55344175e-05 4.28619688e-05 8.01057808e-06
 8.80282224e-06 8.92431153e-06 7.88576422e-06 5.69014917e-06
 1.71816221e-05 9.02395457e-06 1.05223599e-05 7.35210800e-06
 6.34462670e-06 2.04856052e-05 1.03918557e-05 1.05316813e-05
 3.49741854e-05 6.61497971e-06 9.67792071e-06 5.68270208e-06
 4.95309113e-06 5.57780095e-06 1.00900620e-05 8.35337305e-06
 7.16059860e-06 2.97144299e-05 6.82321514e-06 7.13278355e-06
 7.26202422e-06 1.12364025e-05 4.07769599e-06 4.34494325e-05
 8.34072557e-06 6.31385080e-06 1.99550176e-05 1.27728842e-05
 8.35589671e-06 1.79825495e-05 2.09139043e-05 1.51986955e-05
 2.31206732e-05 5.47239423e-06 4.56167338e-05 3.19782209e-05
 9.80447731e-06 2.98832220e-05 5.87863363e-05 1.27341976e-05
 2.88737242e-05 2.49832015e-05 2.83695652e-05 4.05403351e-03
 6.98382370e-05 1.01891932e-05 1.21993696e-04 2.57167833e-05
 4.89392299e-05 1.442416

In [61]:
# def get_pred(x):
#   if x >= 0.8:
#     return 1
#   return 0

### Threshold pred

In [110]:
y_te_pred = [[1 if pred>=0.08 else 0 for pred in row] for row in preds]

In [111]:
# y_te_pred_np = np.asarray(y_te_pred)

In [112]:
# print(y_te_pred_np[0:2])

In [119]:
preds.shape

(200000, 618)

### Top@ pred

In [155]:
top_k = 10

In [156]:
y_te_pre = np.zeros_like(preds)
print(y_te_pre.shape)

(200000, 618)


In [157]:
top_k_ind = [np.argpartition(x_in, -top_k)[-top_k:] for x_in in preds]
top_k_ind_arr = np.array(top_k_ind)

In [158]:
for i in range(y_te_pre.shape[0]):
  a = y_te_pre[i]
  b = top_k_ind_arr[i]
  a[b] = 1
  y_te_pre[i] = a 

In [113]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, roc_curve, confusion_matrix, roc_auc_score, recall_score, classification_report

### Threshold

In [114]:
roc_auc_score(y_te_true, preds, average='micro')

0.991059688892097

In [115]:
# f1_score(y_te_true, preds, average='micro')

In [116]:
f1_score(y_te_true, y_te_pred, average='micro')

0.5878786596096959

In [117]:
precision_score(y_te_true, y_te_pred, average='micro')

0.5216023435400976

In [118]:
recall_score(y_te_true, y_te_pred, average='micro')

0.673449063022201

### Top@

In [159]:
precision_score(y_te_true, y_te_pre, average='micro')

0.120666

In [160]:
recall_score(y_te_true, y_te_pre, average='micro')

0.9001700883265696

In [161]:
f1_score(y_te_true, y_te_pre, average='micro')

0.2128058071616016

In [66]:
classification_report(y_te_true, y_te_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           0       0.64      0.33      0.43        86\n           1       0.68      0.51      0.58        51\n           2       0.71      0.48      0.57       105\n           3       0.57      0.14      0.22        29\n           4       0.58      0.30      0.39        87\n           5       0.94      0.96      0.95       900\n           6       0.77      0.68      0.72        25\n           7       0.67      0.58      0.62       280\n           8       0.50      0.33      0.40         3\n           9       0.84      0.45      0.58        47\n          10       0.30      0.39      0.34       795\n          11       0.33      0.18      0.23        17\n          12       0.48      0.34      0.40        29\n          13       1.00      0.12      0.22         8\n          14       0.80      0.22      0.35        18\n          15       1.00      0.18      0.31        11\n          16       0.50      0.31      0.38        42\n       

In [58]:
# precision_score(y_te_true, preds, average='micro')

In [59]:
accuracy_score(y_te_true, y_te_pred) 

0.29377

In [79]:
# print(type(test[label_cols].values))
# print(type(y_te_pred))

In [80]:
# fpr, tpr, thresholds = roc_curve(y_te_true.argmax(axis=1), y_te_pred_np.argmax(axis=1))

In [81]:
type(label_cols[0])

int

In [None]:
print(classification_report(y_te_true, y_te_pred))