# Setting up

In [None]:
# The PyPI distribution is named "scikit-learn"; the "sklearn" name is a
# deprecated alias that pip can no longer install reliably.
! pip install scikit-learn


In [None]:
# Root of the project's data: the `data` directory under the current working directory
from pathlib import Path
root = Path.cwd() / 'data'


In [None]:
# Input/output locations, all relative to the data root
txt_path = root / '1.0txt'  # directory holding one '<discipline>.pkl' file per entry of d_list
met_path = root / '3.0vec' / 'fast_met.tsv'  # metadata file, read as a single 'word' column
save_path = root / '3.0vec' / 'fast_met_lab.tsv'  # destination of the labelled metadata


# Discipline identifiers; their order fixes the column order of the TF-IDF matrix
d_list = ['cs.AI', 'econ.EM', 'eess.AS', 'math.AC', 'astro-ph', 'q-bio']


# Load meta

In [None]:
import pandas as pd

# Read the vocabulary as literal strings. With pandas' default NA inference,
# tokens such as "null", "nan" or "NA" would be converted to float NaN and
# those rows would be corrupted when labels are looked up and exported.
df_met = pd.read_table(met_path, names=['word'], dtype=str,
                       keep_default_na=False)
df_met

# Plain Python list of the words, in file order
met_ls = list(df_met['word'])


# Load label

In [None]:
import pickle
import itertools


def getstr(path):
    """Return the words stored in a pickle file as one space-separated string.

    The file at *path* must contain a pickled iterable of iterables of
    strings; the inner sequences are flattened in order and joined with
    single spaces.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # files produced by this project.
    with open(path, 'rb') as handle:
        nested = pickle.load(handle)

    flattened = itertools.chain.from_iterable(nested)
    return ' '.join(flattened)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def tfidf():
    """Compute TF-IDF over the discipline documents and return it as a DataFrame.

    One document is built per entry of ``d_list`` by loading
    ``<txt_path>/<discipline>.pkl`` with ``getstr``.  The resulting frame has
    one row per vocabulary term and one integer-labelled column per
    discipline, in ``d_list`` order.

    Side effect: the frame is also stored in the module-level ``df_all``.
    """
    global df_all

    # one document per discipline
    corpus = [getstr(txt_path / str(name + '.pkl')) for name in d_list]

    # fit the model on the whole corpus
    model = TfidfVectorizer(smooth_idf=False)
    weights = model.fit_transform(corpus)

    # transpose so that terms become rows and disciplines become columns
    df_all = pd.DataFrame(weights.toarray().T,
                          index=model.get_feature_names_out())
    return df_all


# Build the TF-IDF matrix now: this call sets the module-level df_all that lab() reads
tfidf()


In [None]:
def lab(threshold=0.03):
    """Label every metadata word with a discipline and return the labelled DataFrame.

    Reads the module-level ``met_ls`` (words), ``df_all`` (TF-IDF matrix with
    one row per term and one integer-labelled column per discipline) and
    ``d_list`` (discipline names, in column order).

    A word is labelled ``'general'`` when it does not occur in ``df_all`` or
    when all of its TF-IDF scores are below ``threshold``.  Otherwise it gets
    the discipline with the highest score; on ties the first discipline in
    ``d_list`` wins.  The number of words missing from ``df_all`` is printed.

    Args:
        threshold: Minimum TF-IDF score for a word to count as
            discipline-specific.

    Returns:
        DataFrame indexed by ``word`` with a single ``discipline`` column, in
        the same order as ``met_ls``.
    """
    columns = list(range(len(d_list)))
    vocabulary = df_all.index
    labels = []
    missing = 0

    for word in met_ls:
        if word not in vocabulary:
            # unknown to the TF-IDF model: counted and treated as general
            missing += 1
            labels.append('general')
            continue

        # one row lookup per word instead of one scalar lookup per cell
        scores = df_all.loc[word, columns].tolist()
        if all(score < threshold for score in scores):
            labels.append('general')
        else:
            # list.index returns the first position of the maximum
            labels.append(d_list[scores.index(max(scores))])
    print(missing)

    return pd.DataFrame(
        {'word': met_ls, 'discipline': pd.Series(labels, dtype=object)}
    ).set_index('word')


# Combine Meta and Label

In [None]:
# lab() yields one label per row of df_met, in the same order (it is built from
# the df_met['word'] column), so the labels are attached positionally.
# Joining on 'word' instead would multiply rows whenever a word occurs more
# than once in the metadata.
df_metlab = df_met.assign(discipline=lab()['discipline'].to_numpy())
df_metlab


# Export as TSV

In [None]:
# Write UTF-8 explicitly: open() otherwise uses the platform's locale encoding,
# which can fail on non-ASCII words and does not match the UTF-8 default that
# pandas uses when reading TSV files.
# newline='\n' turns off newline translation in the text layer.
with open(save_path, 'w', newline='\n', encoding='utf-8') as s:
    df_metlab.to_csv(s,
                     sep='\t',
                     index=False)
