In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; FILE_PATH used by later cells lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.model_selection import iterative_train_test_split

In [4]:
# Directory prefix (with trailing slash) that every CSV read/write in this notebook is concatenated onto.
FILE_PATH = "/content/drive/MyDrive/Colab Notebooks/Arxiv Topic Classification/first 1 million/"

## Label Encoding of Y Data

In [11]:
# Load the raw dataset: one row per paper with a space-separated `categories` string and a `text` column.
df = pd.read_csv(FILE_PATH + "first 1 million.csv")

In [12]:
# Check row count, column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   categories  999999 non-null  object
 1   text        999999 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB


In [13]:
# Peek at the first rows before the category strings are split.
df.head()

Unnamed: 0,categories,text
0,hep-ph,calcul prompt diphoton product cross section t...
1,math.CO cs.CG,sparsitycertifi graph decomposit describ new a...
2,physics.gen-ph,evolut earthmoon system base dark matter field...
3,math.CO,determin stirl cycl number count unlabel acycl...
4,math.CA math.FA,dyadic lambdaalpha lambdaalpha paper show comp...


In [14]:
# Turn the space-separated category string into a list of labels.
# Only raw strings are split, so re-running this cell leaves already-split
# lists untouched (a second `.str.split()` would turn them into NaN).
df['categories'] = df['categories'].map(
    lambda cats: cats.split() if isinstance(cats, str) else cats
)

In [15]:
# Confirm that `categories` now holds lists of labels.
df.head()

Unnamed: 0,categories,text
0,[hep-ph],calcul prompt diphoton product cross section t...
1,"[math.CO, cs.CG]",sparsitycertifi graph decomposit describ new a...
2,[physics.gen-ph],evolut earthmoon system base dark matter field...
3,[math.CO],determin stirl cycl number count unlabel acycl...
4,"[math.CA, math.FA]",dyadic lambdaalpha lambdaalpha paper show comp...


In [17]:
# Re-check non-null counts after the split (the column stays `object` dtype, now holding lists).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999999 entries, 0 to 999998
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   categories  999999 non-null  object
 1   text        999999 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB


In [19]:
# One-hot encode the label lists: one indicator column per distinct category.
# fit_transform learns the label vocabulary (mlb.classes_) and encodes the rows
# in a single call, so label_matrix is never bound to the estimator itself.
mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(df["categories"])

In [20]:
# Wrap the indicator matrix in a frame whose column names are the learned category labels.
labels_df = pd.DataFrame(data=label_matrix, columns=mlb.classes_)

In [26]:
# Sanity check on row 1 (categories "math.CO cs.CG"): both indicators should be 1 and the row should sum to 2.
labels_df.iloc[1]["math.CO"], labels_df.iloc[1]["cs.CG"], sum(labels_df.iloc[1])

(1, 1, 2)

In [27]:
# Place the indicator columns next to the original columns; both frames share the same row index.
df_with_labels = pd.concat((df, labels_df), axis="columns")
df_with_labels.head(5)

Unnamed: 0,categories,text,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,[hep-ph],calcul prompt diphoton product cross section t...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[math.CO, cs.CG]",sparsitycertifi graph decomposit describ new a...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,[physics.gen-ph],evolut earthmoon system base dark matter field...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,[math.CO],determin stirl cycl number count unlabel acycl...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[math.CA, math.FA]",dyadic lambdaalpha lambdaalpha paper show comp...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# The list-valued `categories` column is redundant once the indicator columns exist.
# Rebinding (instead of inplace=True) together with errors="ignore" makes this cell
# safe to re-run: a second execution no longer raises KeyError.
df_with_labels = df_with_labels.drop(columns=["categories"], errors="ignore")

In [29]:
# Verify that only `text` and the label indicator columns remain.
df_with_labels.head()

Unnamed: 0,text,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,calcul prompt diphoton product cross section t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sparsitycertifi graph decomposit describ new a...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,evolut earthmoon system base dark matter field...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,determin stirl cycl number count unlabel acycl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dyadic lambdaalpha lambdaalpha paper show comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Persist the encoded frame; index=False keeps the row index out of the CSV.
df_with_labels.to_csv(FILE_PATH + "df_with_labels.csv", index = False)

## Train Test Split

In [5]:
# Read back the encoded frame written by the label-encoding section (text + one indicator column per category).
df_with_labels = pd.read_csv(FILE_PATH + "df_with_labels.csv")
df_with_labels.head()

Unnamed: 0,text,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,calcul prompt diphoton product cross section t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sparsitycertifi graph decomposit describ new a...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,evolut earthmoon system base dark matter field...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,determin stirl cycl number count unlabel acycl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dyadic lambdaalpha lambdaalpha paper show comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Every column except "text" is a label indicator. Filtering by name (rather than
# popping position 0) keeps this correct even if the column order changes.
label_name = [col for col in df_with_labels.columns if col != "text"]
print(label_name)

['astro-ph', 'astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE', 'astro-ph.IM', 'astro-ph.SR', 'cond-mat.dis-nn', 'cond-mat.mes-hall', 'cond-mat.mtrl-sci', 'cond-mat.other', 'cond-mat.quant-gas', 'cond-mat.soft', 'cond-mat.stat-mech', 'cond-mat.str-el', 'cond-mat.supr-con', 'cs.AI', 'cs.AR', 'cs.CC', 'cs.CE', 'cs.CG', 'cs.CL', 'cs.CR', 'cs.CV', 'cs.CY', 'cs.DB', 'cs.DC', 'cs.DL', 'cs.DM', 'cs.DS', 'cs.ET', 'cs.FL', 'cs.GL', 'cs.GR', 'cs.GT', 'cs.HC', 'cs.IR', 'cs.IT', 'cs.LG', 'cs.LO', 'cs.MA', 'cs.MM', 'cs.MS', 'cs.NA', 'cs.NE', 'cs.NI', 'cs.OH', 'cs.OS', 'cs.PF', 'cs.PL', 'cs.RO', 'cs.SC', 'cs.SD', 'cs.SE', 'cs.SI', 'cs.SY', 'econ.EM', 'econ.GN', 'econ.TH', 'eess.AS', 'eess.IV', 'eess.SP', 'eess.SY', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', 'hep-th', 'math-ph', 'math.AC', 'math.AG', 'math.AP', 'math.AT', 'math.CA', 'math.CO', 'math.CT', 'math.CV', 'math.DG', 'math.DS', 'math.FA', 'math.GM', 'math.GN', 'math.GR', 'math.GT', 'math.HO', 'math.IT', 'math.KT', 'math.LO', 'math.MG',

In [7]:
# X: the text feature as a Series; Y: the frame of label indicator columns.
X, Y = df_with_labels["text"], df_with_labels[label_name]

In [21]:
pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [22]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
from itertools import chain

def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train test split for multilabel classification. Uses the algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    Parameters
    ----------
    *arrays : indexables of equal length
        The datasets to split (e.g. features and label matrix).
    test_size, train_size : float, int or None
        Forwarded to scikit-learn's split-size validation; when both are None
        the test share defaults to 0.25.
    random_state : int, RandomState instance or None
        Seed for the shuffle. In the stratified branch, None falls back to the
        seed 123 that this function previously hard-coded, so existing calls
        that omit it keep producing the same split.
    shuffle : bool
        Must be True when `stratify` is given.
    stratify : array-like of shape (n_samples, n_labels) or None
        Multilabel indicator matrix to stratify on. When None, the call is
        delegated to scikit-learn's plain `train_test_split`.

    Returns
    -------
    list
        ``[a0_train, a0_test, a1_train, a1_test, ...]`` -- a train/test pair
        for each input array, in input order. For ``(X, Y)`` this is
        ``[X_train, X_test, Y_train, Y_test]``.

    Raises
    ------
    AssertionError
        If `stratify` is given together with ``shuffle=False``.
    ValueError
        If `stratify` is given but no arrays were passed.
    """
    if stratify is None:
        # Imported locally: the notebook's import cells never bring this name
        # into scope, so this branch previously failed with NameError.
        from sklearn.model_selection import train_test_split
        return train_test_split(*arrays, test_size=test_size, train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    if not arrays:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    # Resolve fractional / default sizes into absolute sample counts.
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )

    # Honour the caller's seed; keep the historical default of 123 otherwise.
    seed = 123 if random_state is None else random_state
    cv = MultilabelStratifiedShuffleSplit(test_size=n_test, train_size=n_train, random_state=seed)
    # Only the first generated split is used.
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

In [24]:
%%time
# NOTE: multilabel_train_test_split returns a train/test pair per input array,
# i.e. [X train, X test, Y train, Y test], so the names below are misleading:
#   X_train -> training text      y_train -> test text
#   X_test  -> training labels    y_test  -> test labels
# Later cells rely on exactly this binding, so the names are left unchanged here.
X_train, y_train, X_test, y_test = multilabel_train_test_split(X,Y,stratify=Y, test_size=0.05)

CPU times: user 58.5 s, sys: 5.62 s, total: 1min 4s
Wall time: 1min 5s


In [25]:
# Training-split text (first element returned by the split).
X_train

0         calcul prompt diphoton product cross section t...
1         sparsitycertifi graph decomposit describ new a...
2         evolut earthmoon system base dark matter field...
3         determin stirl cycl number count unlabel acycl...
4         dyadic lambdaalpha lambdaalpha paper show comp...
                                ...                        
999994    dynam coupl dilut magnet impur quantum spin li...
999995    recognis cardiac abnorm wearabl devic photople...
999996    evolut skyrmion crystal fe co si like quasi tw...
999997    benedick amrein berthier type theorem relat tw...
999998    constraint scalar tensor model gauss bonnet co...
Name: text, Length: 949875, dtype: object

In [26]:
# Despite the name, this is the test-split text (second element returned by the split).
y_train

31        probe nonstandard neutrino interact supernova ...
38        scalar radiu pion zero form factor quadrat pio...
41        gener system theori likequantum semant fuzzi s...
44        evolut solitari wave undular bore shallowwat f...
124       anisotrop thermoelast part unifi approach note...
                                ...                        
999935    degre bowen factor inject code diffeomorph sho...
999940    weak converg sequenc homogen young measur asso...
999943    probe dzyaloshinskii moriya interact via propa...
999964    challeng reconcil observ theori brightest high...
999984    simplifi beth salpet descript basic pseudoscal...
Name: text, Length: 50124, dtype: object

In [27]:
# Despite the name, this is the training-split label matrix (third element returned by the split).
X_test

Unnamed: 0,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,cond-mat.mtrl-sci,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999994,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999996,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
999997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Test-split label matrix (fourth element returned by the split).
y_test

Unnamed: 0,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,cond-mat.mtrl-sci,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
44,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999943,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
999964,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Row counts should pair up: (X_train, X_test) describe the training rows, (y_train, y_test) the test rows.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((949875,), (949875, 156), (50124,), (50124, 156))

In [31]:
# Because of how the split result was unpacked, X_train is the training text and
# X_test the training labels, so joining them column-wise yields the full training set.
train_data = pd.concat((X_train, X_test), axis="columns")
train_data.head(5)

Unnamed: 0,text,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,calcul prompt diphoton product cross section t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,sparsitycertifi graph decomposit describ new a...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,evolut earthmoon system base dark matter field...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,determin stirl cycl number count unlabel acycl...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,dyadic lambdaalpha lambdaalpha paper show comp...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Because of how the split result was unpacked, y_train is the test text and
# y_test the test labels, so joining them column-wise yields the full test set.
test_data = pd.concat((y_train, y_test), axis="columns")
test_data.head(5)

Unnamed: 0,text,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,...,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
31,probe nonstandard neutrino interact supernova ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,scalar radiu pion zero form factor quadrat pio...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,gener system theori likequantum semant fuzzi s...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
44,evolut solitari wave undular bore shallowwat f...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124,anisotrop thermoelast part unifi approach note...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Write the training set (text + label indicators) next to the source data; the row index is not saved.
train_data.to_csv(FILE_PATH + "train_data.csv", index = False)

In [34]:
# Write the test set (text + label indicators) next to the source data; the row index is not saved.
test_data.to_csv(FILE_PATH + "test_data.csv", index = False)