# arXiv data subset - tfidf

In [1]:
import os
import sys
# Put the parent of the current working directory on sys.path (once) so the
# `core` package that lives there can be imported by the cells below.
parent_dr = os.path.dirname(os.getcwd())
already_importable = parent_dr in sys.path
if not already_importable:
    sys.path.append(parent_dr)

In [2]:
import scipy.sparse as sp
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from core.data.arxiv_data_io import *
from core.data.generate_tf_idf import *

In [3]:
# Lift pandas' display limits (None = unlimited) so frames are rendered with
# every column, every row, and untruncated cell text.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

The pickle file loaded in the next cell can be downloaded from:
https://drive.google.com/drive/folders/1CQx3y_9vWHt9Zi9aJWp9lK0W__9T7zTY?usp=sharing

In [4]:
# Load the pre-tokenized arXiv subset from <parent_dr>/core/resources.
file_name = "tokenized_arxiv_subset_15540.pkl"
full_path = os.path.join(parent_dr, "core", "resources", file_name)

# The pickle has to be downloaded separately (see the link above), so fail
# early with instructions rather than a bare path error from pandas.
if not os.path.exists(full_path):
    raise FileNotFoundError(
        f"{full_path} does not exist. Download {file_name} from the Google Drive "
        "folder linked above and place it in core/resources."
    )

# NOTE(review): unpickling can execute arbitrary code -- only load a copy of
# this file that was obtained from a source you trust.
data_df = pd.read_pickle(full_path)

In [5]:
data_df.head(n=3)

Unnamed: 0,id,authors,title,categories,abstract,update_dt,clean,tokens
0,704.0648,Kaushik Majumdar,Behavioral response to strong aversive stimuli: A neurodynamical model,q-bio.NC,"In this paper a theoretical model of functioning of a neural circuit during a\nbehavioral response has been proposed. A neural circuit can be thought of as a\ndirected multigraph whose each vertex is a neuron and each edge is a synapse.\nIt has been assumed in this paper that the behavior of such circuits is\nmanifested through the collective behavior of neurons belonging to that\ncircuit. Behavioral information of each neuron is contained in the coefficients\nof the fast Fourier transform (FFT) over the output spike train. Those\ncoefficients form a vector in a multidimensional vector space. Behavioral\ndynamics of a neuronal network in response to strong aversive stimuli has been\nstudied in a vector space in which a suitable pseudometric has been defined.\nThe neurodynamical model of network behavior has been formulated in terms of\nexisting memory, synaptic plasticity and feelings. The model has an analogy in\nclassical electrostatics, by which the notion of force and potential energy has\nbeen introduced. Since the model takes input from each neuron in a network and\nproduces a behavior as the output, it would be extremely difficult or may even\nbe impossible to implement. But with the help of the model a possible\nexplanation for an hitherto unexplained neurological observation in human brain\nhas been offered. The model is compatible with a recent model of sequential\nbehavioral dynamics. 
The model is based on electrophysiology, but its relevance\nto hemodynamics has been outlined.\n",2007-05-23,in this paper a theoretical model of functioning of a neural circuit during a behavioral response has been proposed a neural circuit can be thought of as a directed multigraph whose each vertex is a neuron and each edge is a synapse it has been assumed in this paper that the behavior of such circuits is manifested through the collective behavior of neurons belonging to that circuit behavioral information of each neuron is contained in the coefficients of the fast fourier transform fft over the output spike train those coefficients form a vector in a multidimensional vector space behavioral dynamics of a neuronal network in response to strong aversive stimuli has been studied in a vector space in which a suitable pseudometric has been defined the neurodynamical model of network behavior has been formulated in terms of existing memory synaptic plasticity and feelings the model has an analogy in classical electrostatics by which the notion of force and potential energy has been introduced since the model takes input from each neuron in a network and produces a behavior as the output it would be extremely difficult or may even be impossible to implement but with the help of the model a possible explanation for an hitherto unexplained neurological observation in human brain has been offered the model is compatible with a recent model of sequential behavioral dynamics the model is based on electrophysiology but its relevance to hemodynamics has been outlined,"[paper, theoretical, model, functioning, neural, circuit, behavioral, response, propose, neural, circuit, think, direct, multigraph, vertex, neuron, edge, synapse, assume, paper, behavior, circuit, manifest, collective, behavior, neuron, belong, circuit, behavioral, information, neuron, contain, coefficient, fast, fourier, transform, fft, output, spike, train, coefficient, form, vector, multidimensional, 
vector, space, behavioral, dynamic, neuronal, network, response, strong, aversive, stimulus, study, vector, space, suitable, pseudometric, define, neurodynamical, model, network, behavior, formulate, term, exist, memory, synaptic, plasticity, feeling, model, analogy, classical, electrostatic, notion, force, potential, energy, introduce, model, take, input, neuron, network, produce, behavior, output, extremely, difficult, impossible, implement, help, model, possible, explanation, hitherto, unexplained, neurological, observation, ...]"
1,704.1394,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif Andersen",Calculating Valid Domains for BDD-Based Interactive Configuration,cs.AI,In these notes we formally describe the functionality of Calculating Valid\nDomains from the BDD representing the solution space of valid configurations.\nThe formalization is largely based on the CLab configuration framework.\n,2007-05-23,in these notes we formally describe the functionality of calculating valid domains from the bdd representing the solution space of valid configurations the formalization is largely based on the clab configuration framework,"[note, formally, describe, functionality, calculate, valid, domain, bdd, represent, solution, space, valid, configuration, formalization, largely, base, clab, configuration, framework]"
2,704.1829,"Stefan Felsner, Kamil Kloch, Grzegorz Matecki, and Piotr Micek",On-line Chain Partitions of Up-growing Semi-orders,cs.DM,"On-line chain partition is a two-player game between Spoiler and Algorithm.\nSpoiler presents a partially ordered set, point by point. Algorithm assigns\nincoming points (immediately and irrevocably) to the chains which constitute a\nchain partition of the order. The value of the game for orders of width $w$ is\na minimum number $\fVal(w)$ such that Algorithm has a strategy using at most\n$\fVal(w)$ chains on orders of width at most $w$. We analyze the chain\npartition game for up-growing semi-orders. Surprisingly, the golden ratio comes\ninto play and the value of the game is $\lfloor\frac{1+\sqrt{5}}{2}\; w\n\rfloor$.\n",2011-02-22,on line chain partition is a two player game between spoiler and algorithm spoiler presents a partially ordered set point by point algorithm assigns incoming points immediately and irrevocably to the chains which constitute a chain partition of the order the value of the game for orders of width w is a minimum number fval w such that algorithm has a strategy using at most fval w chains on orders of width at most w we analyze the chain partition game for up growing semi orders surprisingly the golden ratio comes into play and the value of the game is lfloor frac number sqrt number number w rfloor,"[line, chain, partition, player, game, spoiler, algorithm, spoiler, present, partially, order, set, point, point, algorithm, assign, incoming, point, immediately, irrevocably, chain, constitute, chain, partition, order, value, game, order, width, minimum, number, fval, algorithm, strategy, fval, chain, order, width, analyze, chain, partition, game, grow, semi, order, surprisingly, golden, ratio, come, play, value, game, lfloor, frac, number, sqrt, number, number, rfloor]"


In [6]:
# Fraction of rows expected to land in the training split.
EXPECTED_TRAIN_FRACTION = 0.8

# Only the first returned frame is used in this notebook; the second is discarded.
train_df, _ = sample_arxiv_data_by_category(data_df)

# Compare with a tolerance instead of `== 0.8`: exact equality can only hold
# when len(data_df) is a multiple of 5, so the strict check would reject a
# correctly split frame of any other size.
# NOTE(review): the 1e-3 tolerance is a reviewer's choice -- tighten it if the
# sampler guarantees an exact split.
train_fraction = len(train_df) / len(data_df)
assert abs(train_fraction - EXPECTED_TRAIN_FRACTION) < 1e-3, (
    f"expected about {EXPECTED_TRAIN_FRACTION:.0%} of rows in train_df, "
    f"got {train_fraction:.2%} ({len(train_df)} of {len(data_df)})"
)


In [7]:
train_df.head()


Unnamed: 0,id,authors,title,categories,abstract,update_dt,clean,tokens,full_df_index
0,803.0159,"V.R. Vemula, David Ball, Simon Thorne",Towards a Spreadsheet Engineering,cs.CY,"In this paper, we report some on-going focused research, but are further keen\nto set it in the context of a proposed bigger picture, as follows. There is a\ncertain depressing pattern about the attitude of industry to spreadsheet error\nresearch and a certain pattern about conferences highlighting these issues. Is\nit not high time to move on from measuring spreadsheet errors to developing an\narmoury of disciplines and controls? In short, we propose the need to\nrigorously lay the foundations of a spreadsheet engineering discipline.\nClearly, multiple research teams would be required to tackle such a big task.\nThis suggests the need for both national and international collaborative\nresearch, since any given group can only address a small segment of the whole.\nThere are already a small number of examples of such on-going international\ncollaborative research. Having established the need for a directed research\neffort, the rest of the paper then attempts to act as an exemplar in\ndemonstrating and applying this focus. With regard to one such of research, in\na recent paper, Panko (2005) stated that: ""...group development and testing\nappear to be promising areas to pursue"". Of particular interest to us are some\ngaps in the published research record on techniques to reduce errors. 
We\nfurther report on the topics: techniques for cross-checking, time constraints\neffects, and some aspects of developer perception.\n",2008-03-10,in this paper we report some on going focused research but are further keen to set it in the context of a proposed bigger picture as follows there is a certain depressing pattern about the attitude of industry to spreadsheet error research and a certain pattern about conferences highlighting these issues is it not high time to move on from measuring spreadsheet errors to developing an armoury of disciplines and controls in short we propose the need to rigorously lay the foundations of a spreadsheet engineering discipline clearly multiple research teams would be required to tackle such a big task this suggests the need for both national and international collaborative research since any given group can only address a small segment of the whole there are already a small number of examples of such on going international collaborative research having established the need for a directed research effort the rest of the paper then attempts to act as an exemplar in demonstrating and applying this focus with regard to one such of research in a recent paper panko number stated that group development and testing appear to be promising areas to pursue of particular interest to us are some gaps in the published research record on techniques to reduce errors we further report on the topics techniques for cross checking time constraints effects and some aspects of developer perception,"[paper, report, go, focus, research, keen, set, context, propose, big, picture, follow, certain, depressing, pattern, attitude, industry, spreadsheet, error, research, certain, pattern, conference, highlight, issue, high, time, measure, spreadsheet, error, develop, armoury, discipline, control, short, propose, need, rigorously, lay, foundation, spreadsheet, engineering, discipline, clearly, multiple, research, team, require, tackle, big, task, suggest, 
need, national, international, collaborative, research, give, group, address, small, segment, small, number, example, go, international, collaborative, research, have, establish, need, direct, research, effort, rest, paper, attempt, act, exemplar, demonstrate, apply, focus, regard, research, recent, paper, panko, number, state, group, development, testing, appear, promise, area, pursue, particular, interest, gap, ...]",184
1,1109.4554,Paul Bonsma,Surface Split Decompositions and Subgraph Isomorphism in Graphs on\n Surfaces,cs.DM,"The Subgraph Isomorphism problem asks, given a host graph G on n vertices and\na pattern graph P on k vertices, whether G contains a subgraph isomorphic to P.\nThe restriction of this problem to planar graphs has often been considered.\nAfter a sequence of improvements, the current best algorithm for planar graphs\nis a linear time algorithm by Dorn (STACS '10), with complexity $2^{O(k)}\nO(n)$.\n We generalize this result, by giving an algorithm of the same complexity for\ngraphs that can be embedded in surfaces of bounded genus. At the same time, we\nsimplify the algorithm and analysis. The key to these improvements is the\nintroduction of surface split decompositions for bounded genus graphs, which\ngeneralize sphere cut decompositions for planar graphs. We extend the algorithm\nfor the problem of counting and generating all subgraphs isomorphic to P, even\nfor the case where P is disconnected. 
This answers an open question by Eppstein\n(SODA '95 / JGAA '99).\n",2015-03-19,the subgraph isomorphism problem asks given a host graph g on n vertices and a pattern graph p on k vertices whether g contains a subgraph isomorphic to p the restriction of this problem to planar graphs has often been considered after a sequence of improvements the current best algorithm for planar graphs is a linear time algorithm by dorn stacs number with complexity number o k o n we generalize this result by giving an algorithm of the same complexity for graphs that can be embedded in surfaces of bounded genus at the same time we simplify the algorithm and analysis the key to these improvements is the introduction of surface split decompositions for bounded genus graphs which generalize sphere cut decompositions for planar graphs we extend the algorithm for the problem of counting and generating all subgraphs isomorphic to p even for the case where p is disconnected this answers an open question by eppstein soda number jgaa number,"[subgraph, isomorphism, problem, ask, give, host, graph, vertex, pattern, graph, vertex, contain, subgraph, isomorphic, restriction, problem, planar, graph, consider, sequence, improvement, current, good, algorithm, planar, graph, linear, time, algorithm, dorn, stac, number, complexity, number, generalize, result, give, algorithm, complexity, graph, embed, surface, bounded, genus, time, simplify, algorithm, analysis, key, improvement, introduction, surface, split, decomposition, bounded, genus, graph, generalize, sphere, cut, decomposition, planar, graph, extend, algorithm, problem, count, generate, subgraphs, isomorphic, case, disconnect, answer, open, question, eppstein, soda, number, jgaa, number]",1417
2,1104.325,"Salah Rifai, Xavier Glorot, Yoshua Bengio, Pascal Vincent",Adding noise to the input of a model trained with a regularized\n objective,cs.AI,"Regularization is a well studied problem in the context of neural networks.\nIt is usually used to improve the generalization performance when the number of\ninput samples is relatively small or heavily contaminated with noise. The\nregularization of a parametric model can be achieved in different manners some\nof which are early stopping (Morgan and Bourlard, 1990), weight decay, output\nsmoothing that are used to avoid overfitting during the training of the\nconsidered model. From a Bayesian point of view, many regularization techniques\ncorrespond to imposing certain prior distributions on model parameters (Krogh\nand Hertz, 1991). Using Bishop's approximation (Bishop, 1995) of the objective\nfunction when a restricted type of noise is added to the input of a parametric\nfunction, we derive the higher order terms of the Taylor expansion and analyze\nthe coefficients of the regularization terms induced by the noisy input. In\nparticular we study the effect of penalizing the Hessian of the mapping\nfunction with respect to the input in terms of generalization performance. 
We\nalso show how we can control independently this coefficient by explicitly\npenalizing the Jacobian of the mapping function on corrupted inputs.\n",2011-04-19,regularization is a well studied problem in the context of neural networks it is usually used to improve the generalization performance when the number of input samples is relatively small or heavily contaminated with noise the regularization of a parametric model can be achieved in different manners some of which are early stopping morgan and bourlard number weight decay output smoothing that are used to avoid overfitting during the training of the considered model from a bayesian point of view many regularization techniques correspond to imposing certain prior distributions on model parameters krogh and hertz number using bishop s approximation bishop number of the objective function when a restricted type of noise is added to the input of a parametric function we derive the higher order terms of the taylor expansion and analyze the coefficients of the regularization terms induced by the noisy input in particular we study the effect of penalizing the hessian of the mapping function with respect to the input in terms of generalization performance we also show how we can control independently this coefficient by explicitly penalizing the jacobian of the mapping function on corrupted inputs,"[regularization, study, problem, context, neural, network, usually, improve, generalization, performance, number, input, sample, relatively, small, heavily, contaminate, noise, regularization, parametric, model, achieve, different, manner, early, stop, morgan, bourlard, number, weight, decay, output, smoothing, avoid, overfitting, training, considered, model, bayesian, point, view, regularization, technique, correspond, impose, certain, prior, distribution, model, parameter, krogh, hertz, number, bishop, approximation, bishop, number, objective, function, restricted, type, noise, add, input, parametric, function, 
derive, high, order, term, taylor, expansion, analyze, coefficient, regularization, term, induce, noisy, input, particular, study, effect, penalize, hessian, mapping, function, respect, input, term, generalization, performance, control, independently, coefficient, explicitly, penalize, jacobian, mapping, function, corrupted, ...]",1083
3,1502.0424,"H. Furma\'nczyk, M. Kubale",Scheduling of unit-length jobs with cubic incompatibility graphs on\n three uniform machines,cs.DM,"In the paper we consider the problem of scheduling $n$ identical jobs on 3\nuniform machines with speeds $s_1, s_2,$ and $s_3$ to minimize the schedule\nlength. We assume that jobs are subjected to some kind of mutual exclusion\nconstraints, modeled by a cubic incompatibility graph. We show that if the\ngraph is 2-chromatic then the problem can be solved in $O(n^2)$ time. If the\ngraph is 3-chromatic, the problem becomes NP-hard even if $s_1>s_2=s_3$.\nHowever, in this case there exists a $4/3$-approximation algorithm running in\n$O(n^3)$ time. Moreover, this algorithm solves the problem almost surely to\noptimality if $3s_1/4 \leq s_2 = s_3$.\n",2015-06-17,in the paper we consider the problem of scheduling n identical jobs on number uniform machines with speeds s_number s_number and s_number to minimize the schedule length we assume that jobs are subjected to some kind of mutual exclusion constraints modeled by a cubic incompatibility graph we show that if the graph is number chromatic then the problem can be solved in o n number time if the graph is number chromatic the problem becomes np hard even if s_number s_number s_number however in this case there exists a number number approximation algorithm running in o n number time moreover this algorithm solves the problem almost surely to optimality if numbers_number number leq s_number s_number,"[paper, consider, problem, scheduling, identical, job, number, uniform, machine, speed, minimize, schedule, length, assume, job, subject, kind, mutual, exclusion, constraint, model, cubic, incompatibility, graph, graph, number, chromatic, problem, solve, number, time, graph, number, chromatic, problem, np, hard, case, exist, number, number, approximation, algorithm, run, number, time, algorithm, solve, problem, surely, optimality, number, leq]",4652
4,2003.00749,"David Tuckey, Alessandra Russo, Krysia Broda",A general framework for scientifically inspired explanations in AI,cs.AI,"Explainability in AI is gaining attention in the computer science community\nin response to the increasing success of deep learning and the important need\nof justifying how such systems make predictions in life-critical applications.\nThe focus of explainability in AI has predominantly been on trying to gain\ninsights into how machine learning systems function by exploring relationships\nbetween input data and predicted outcomes or by extracting simpler\ninterpretable models. Through literature surveys of philosophy and social\nscience, authors have highlighted the sharp difference between these generated\nexplanations and human-made explanations and claimed that current explanations\nin AI do not take into account the complexity of human interaction to allow for\neffective information passing to not-expert users. In this paper we instantiate\nthe concept of structure of scientific explanation as the theoretical\nunderpinning for a general framework in which explanations for AI systems can\nbe implemented. This framework aims to provide the tools to build a\n""mental-model"" of any AI system so that the interaction with the user can\nprovide information on demand and be closer to the nature of human-made\nexplanations. 
We illustrate how we can utilize this framework through two very\ndifferent examples: an artificial neural network and a Prolog solver and we\nprovide a possible implementation for both examples.\n",2020-03-03,explainability in ai is gaining attention in the computer science community in response to the increasing success of deep learning and the important need of justifying how such systems make predictions in life critical applications the focus of explainability in ai has predominantly been on trying to gain insights into how machine learning systems function by exploring relationships between input data and predicted outcomes or by extracting simpler interpretable models through literature surveys of philosophy and social science authors have highlighted the sharp difference between these generated explanations and human made explanations and claimed that current explanations in ai do not take into account the complexity of human interaction to allow for effective information passing to not expert users in this paper we instantiate the concept of structure of scientific explanation as the theoretical underpinning for a general framework in which explanations for ai systems can be implemented this framework aims to provide the tools to build a mental model of any ai system so that the interaction with the user can provide information on demand and be closer to the nature of human made explanations we illustrate how we can utilize this framework through two very different examples an artificial neural network and a prolog solver and we provide a possible implementation for both examples,"[explainability, ai, gain, attention, computer, science, community, response, increase, success, deep, learning, important, need, justify, system, prediction, life, critical, application, focus, explainability, ai, predominantly, try, gain, insight, machine, learn, system, function, explore, relationship, input, datum, predict, outcome, extract, simple, interpretable, model, 
literature, survey, philosophy, social, science, author, highlight, sharp, difference, generate, explanation, human, explanation, claim, current, explanation, ai, account, complexity, human, interaction, allow, effective, information, pass, expert, user, paper, instantiate, concept, structure, scientific, explanation, theoretical, underpinning, general, framework, explanation, ai, system, implement, framework, aim, provide, tool, build, mental, model, ai, system, interaction, user, provide, information, demand, close, nature, human, explanation, ...]",11107


In [8]:
print(data_df.shape)
train_df.columns

(15540, 8)


Index(['id', 'authors', 'title', 'categories', 'abstract', 'update_dt',
       'clean', 'tokens', 'full_df_index'],
      dtype='object')

## Create a TF-IDF vectorizer with a dummy tokenizer
### We've already tokenized and saved the results, so we don't tokenize again

In [9]:
def dummy_tokenizer(doc):
    """Return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that documents which are already lists of tokens are used as-is.
    """
    return doc

# The documents are already token lists, so both text-processing hooks are the
# identity function. token_pattern is set to None because the regex pattern is
# not used once a custom tokenizer is supplied.
tfidf = TfidfVectorizer(
    token_pattern=None,
    preprocessor=dummy_tokenizer,
    tokenizer=dummy_tokenizer,
    analyzer='word',
)

In [10]:
tfidf.fit(train_df["tokens"])

TfidfVectorizer(preprocessor=<function dummy_tokenizer at 0x168572040>,
                token_pattern=None,
                tokenizer=<function dummy_tokenizer at 0x168572040>)

In [11]:
print("vocab size for corpus: ", len(tfidf.vocabulary_))

vocab size for corpus:  30460


In [12]:
# Peek at the first 100 (token, column index) pairs of the fitted vocabulary.
vocab_items = list(tfidf.vocabulary_.items())
print(vocab_items[:100])

[('paper', 19716), ('report', 23033), ('go', 11034), ('focus', 10078), ('research', 23106), ('keen', 14375), ('set', 24549), ('context', 5341), ('propose', 21656), ('big', 2642), ('picture', 20526), ('follow', 10108), ('certain', 3965), ('depressing', 6800), ('pattern', 19940), ('attitude', 1868), ('industry', 13090), ('spreadsheet', 25634), ('error', 8950), ('conference', 5108), ('highlight', 11884), ('issue', 13965), ('high', 11879), ('time', 27456), ('measure', 16221), ('develop', 6947), ('armoury', 1572), ('discipline', 7269), ('control', 5400), ('short', 24729), ('need', 17869), ('rigorously', 23464), ('lay', 14849), ('foundation', 10248), ('engineering', 8649), ('clearly', 4393), ('multiple', 17498), ('team', 27001), ('require', 23083), ('tackle', 26807), ('task', 26922), ('suggest', 26371), ('national', 17775), ('international', 13577), ('collaborative', 4675), ('give', 10922), ('group', 11289), ('address', 336), ('small', 25108), ('segment', 24348), ('number', 18589), ('example

In [13]:
# Invert the fitted vocabulary (token -> column index) into column index -> token.
index_to_word = dict(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))

# If two tokens shared an index the inverted dict would be smaller; guard against that.
assert len(tfidf.vocabulary_) == len(index_to_word)

In [14]:
index_to_word[6947]

'develop'

### Example of applying to an article

In [15]:
example_index = 140

In [16]:
# Fetch the example row's category label and raw abstract, then print them
# separated by a blank gap.
category_label = train_df["categories"][example_index]
abstract_text = train_df["abstract"][example_index]

print(category_label)
print("\n")
print(abstract_text)

cs.CY


  An undergraduate compilers course poses significant challenges to students,
in both the conceptual richness of the major components and in the programming
effort necessary to implement them. In this paper, I argue that a related
architecture, the interpreter, serves as an effective conceptual framework in
which to teach some of the later stages of the compiler pipeline. This
framework can serve both to unify some of the major concepts that are taught in
a typical undergraduate course and to structure the implementation of a
semester-long compiler project.



In [17]:
# Token list of the example row selected above.
article = train_df["tokens"][example_index]
# The single document is wrapped in a list before being passed to transform;
# the result is a 1 x vocabulary-size sparse matrix (see the shape printed below).
article_vector = tfidf.transform([article])

In [18]:
print("type: ", type(article_vector))
print("shape: ", article_vector.shape)

type:  <class 'scipy.sparse.csr.csr_matrix'>
shape:  (1, 30460)


In [19]:
article_vector

<1x30460 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

### This is a sparse data representation.
#### You can see the stored (non-zero) values with:

In [20]:
article_vector.data

array([0.13126994, 0.29570113, 0.12414162, 0.25573269, 0.09767782,
       0.07352975, 0.10870997, 0.08726139, 0.2191345 , 0.1669463 ,
       0.1711446 , 0.12042176, 0.09975909, 0.09404713, 0.1238445 ,
       0.14296485, 0.04324594, 0.10256871, 0.19951817, 0.08924309,
       0.13408423, 0.19531875, 0.09101789, 0.08631657, 0.13817846,
       0.10256871, 0.08840448, 0.21621191, 0.23270206, 0.08195492,
       0.09487108, 0.54319258, 0.07692218, 0.09996748, 0.09404713])

In [21]:
article_vector

<1x30460 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [22]:
article_vector.indices

array([28628, 28484, 28216, 26998, 26089, 26074, 25742, 24832, 24540,
       24405, 23430, 22848, 21573, 21562, 20999, 20584, 19716, 17859,
       15765, 15445, 14808, 13607, 12802, 12796, 10312,  8216,  8196,
        5634,  5035,  5030,  4944,  4910,  4024,  1528,  1497],
      dtype=int32)

In [23]:
# Pair every stored tf-idf weight with the token its column index stands for.
article_words = [
    [index_to_word[column], weight]
    for column, weight in zip(article_vector.indices, article_vector.data)
]
article_words

[['unify', 0.13126994119063212],
 ['undergraduate', 0.29570113267907233],
 ['typical', 0.12414162425578093],
 ['teach', 0.2557326933891453],
 ['student', 0.09767782219634011],
 ['structure', 0.07352974842061219],
 ['stage', 0.10870997160999991],
 ['significant', 0.08726139349328559],
 ['serve', 0.21913449578781913],
 ['semester', 0.16694629809063208],
 ['richness', 0.1711445981212223],
 ['related', 0.12042175704688958],
 ['project', 0.09975908682955043],
 ['programming', 0.09404712938450525],
 ['pose', 0.12384449885892768],
 ['pipeline', 0.14296484514872537],
 ['paper', 0.04324594305320362],
 ['necessary', 0.10256871274372505],
 ['major', 0.19951817365910085],
 ['long', 0.08924308614722334],
 ['later', 0.13408423424641575],
 ['interpreter', 0.1953187459011147],
 ['implementation', 0.09101788829155769],
 ['implement', 0.0863165676758837],
 ['framework', 0.13817846317370835],
 ['effort', 0.10256871274372505],
 ['effective', 0.0884044789507167],
 ['course', 0.21621190549759856],
 ['concep

### Replicating everything above with core functions on the entire train_df corpus

In [24]:
# Fit the TF-IDF vectorizer on the training tokens with fit_tfid.
# Note: this rebinds `index_to_word`, which an earlier cell also defined.
train_tokens = train_df['tokens']
core_tfid, index_to_word = fit_tfid(train_tokens)
print(f'core_tfid is a {type(core_tfid)} \n with a vocabulary of {len(index_to_word)} words')

core_tfid is a <class 'sklearn.feature_extraction.text.TfidfVectorizer'> 
 with a vocabulary of 30460 words


`index_to_word` is a dict that maps each feature (column) index to the unique token it represents in the corpus we fit on.

In [25]:
#random index
index_to_word[6947]

'develop'

In [26]:
core_matrix, core_index_to_doc = transform_tfid(train_df,tfidf_obj=core_tfid)

In [27]:
core_matrix

<12432x30460 sparse matrix of type '<class 'numpy.float64'>'
	with 773819 stored elements in Compressed Sparse Row format>

In [28]:
print(f'The TF-IDF matrix has dimensions: {core_matrix.shape}')

The TF-IDF matrix has dimensions: (12432, 30460)


`core_index_to_doc` is a dict that maps TF-IDF matrix row indices to document ids from `train_df`.

In [29]:
# look up row 140 of the sparse matrix (the same example index used earlier)
core_index_to_doc[140]

'1412.0426'

In [30]:
# Resolve matrix row 140 to its document id, then show the matching train_df row(s).
single_doc_id = core_index_to_doc[140]
id_matches = train_df['id'] == single_doc_id
train_df.loc[id_matches]

Unnamed: 0,id,authors,title,categories,abstract,update_dt,clean,tokens,full_df_index
140,1412.0426,John H. E. Lasseter,The Interpreter In An Undergraduate Compilers Course,cs.CY,"An undergraduate compilers course poses significant challenges to students,\nin both the conceptual richness of the major components and in the programming\neffort necessary to implement them. In this paper, I argue that a related\narchitecture, the interpreter, serves as an effective conceptual framework in\nwhich to teach some of the later stages of the compiler pipeline. This\nframework can serve both to unify some of the major concepts that are taught in\na typical undergraduate course and to structure the implementation of a\nsemester-long compiler project.\n",2014-12-02,an undergraduate compilers course poses significant challenges to students in both the conceptual richness of the major components and in the programming effort necessary to implement them in this paper i argue that a related architecture the interpreter serves as an effective conceptual framework in which to teach some of the later stages of the compiler pipeline this framework can serve both to unify some of the major concepts that are taught in a typical undergraduate course and to structure the implementation of a semester long compiler project,"[undergraduate, compiler, course, pose, significant, challenge, student, conceptual, richness, major, component, programming, effort, necessary, implement, paper, argue, related, architecture, interpreter, serve, effective, conceptual, framework, teach, later, stage, compiler, pipeline, framework, serve, unify, major, concept, teach, typical, undergraduate, course, structure, implementation, semester, long, compiler, project]",4483


We can get the stored (non-zero) values for this row with:

In [31]:
single_doc = core_matrix.getrow(140)
single_doc.data

array([0.13126994, 0.29570113, 0.12414162, 0.25573269, 0.09767782,
       0.07352975, 0.10870997, 0.08726139, 0.2191345 , 0.1669463 ,
       0.1711446 , 0.12042176, 0.09975909, 0.09404713, 0.1238445 ,
       0.14296485, 0.04324594, 0.10256871, 0.19951817, 0.08924309,
       0.13408423, 0.19531875, 0.09101789, 0.08631657, 0.13817846,
       0.10256871, 0.08840448, 0.21621191, 0.23270206, 0.08195492,
       0.09487108, 0.54319258, 0.07692218, 0.09996748, 0.09404713])

And the column indices of these non-zero values with:

In [32]:
single_doc.indices

array([28628, 28484, 28216, 26998, 26089, 26074, 25742, 24832, 24540,
       24405, 23430, 22848, 21573, 21562, 20999, 20584, 19716, 17859,
       15765, 15445, 14808, 13607, 12802, 12796, 10312,  8216,  8196,
        5634,  5035,  5030,  4944,  4910,  4024,  1528,  1497],
      dtype=int32)

In [33]:
# Pair every stored tf-idf weight of this row with the token its column index stands for.
single_doc_words = [
    [index_to_word[column], weight]
    for column, weight in zip(single_doc.indices, single_doc.data)
]
single_doc_words

[['unify', 0.13126994119063212],
 ['undergraduate', 0.29570113267907233],
 ['typical', 0.12414162425578093],
 ['teach', 0.2557326933891453],
 ['student', 0.09767782219634011],
 ['structure', 0.07352974842061219],
 ['stage', 0.10870997160999991],
 ['significant', 0.08726139349328559],
 ['serve', 0.21913449578781913],
 ['semester', 0.16694629809063208],
 ['richness', 0.1711445981212223],
 ['related', 0.12042175704688958],
 ['project', 0.09975908682955043],
 ['programming', 0.09404712938450525],
 ['pose', 0.12384449885892768],
 ['pipeline', 0.14296484514872537],
 ['paper', 0.04324594305320362],
 ['necessary', 0.10256871274372505],
 ['major', 0.19951817365910085],
 ['long', 0.08924308614722334],
 ['later', 0.13408423424641575],
 ['interpreter', 0.1953187459011147],
 ['implementation', 0.09101788829155769],
 ['implement', 0.0863165676758837],
 ['framework', 0.13817846317370835],
 ['effort', 0.10256871274372505],
 ['effective', 0.0884044789507167],
 ['course', 0.21621190549759856],
 ['concep

In [34]:
# TODO: use the .toarray() method to convert to a dense matrix