In [9]:
import nltk, collections
import numpy as np
from nltk.collocations import *
from nltk.corpus import stopwords
import subprocess
import pandas as pd
import re
from sklearn.externals import joblib
import random

from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

import candidate_list_gen
import dataframe_generation
import Classifier
import math
import index_generator

In [10]:
import preprocessing

# Read the *.tex sources and dump them to a file (the call reports how many
# source files it processed).
preprocessing.dump_detex_data()

# Names of the properties taken from the TeX sources; this list is passed to
# dataframe_generation.generate_dataframe when the candidate table is built.
feature_list_from_tex = [
    "indices",
    "sections",
    "subsections",
    "large",
    "underline",
    "italicized",
    "bold",
]

Processed 90 source files and dumped data


In [11]:
    # Load the per-file properties produced by the preprocessing module.
    # Later cells iterate `feature_dict` by filename and read each entry's
    # "plaintext" value, so it is used as a mapping of filename -> property dict.
    # NOTE(review): presumably this reads the data dumped by dump_detex_data() — confirm.
    feature_dict = preprocessing.populate_properties()

In [12]:
# Build a single table of candidate terms covering every source file.
#
# For each file we:
#   1. extract the candidate terms from its plain text and store them back on
#      the file's property dict under "candidate_list";
#   2. ask dataframe_generation for that file's rows.
#
# The per-file frames are collected in a list and concatenated once at the end.
# Growing a DataFrame with `df = df.append(...)` inside the loop re-copies the
# whole frame on every iteration (quadratic), and DataFrame.append no longer
# exists in pandas >= 2.0.
# NOTE(review): assumes generate_dataframe returns a DataFrame — confirm.
per_file_frames = []
for filename, file_properties in feature_dict.items():
    file_properties["candidate_list"] = candidate_list_gen.process_text(file_properties["plaintext"])
    per_file_frames.append(
        dataframe_generation.generate_dataframe(filename, file_properties, feature_list_from_tex)
    )

# pd.concat raises on an empty list, so fall back to the empty frame the
# original loop would have produced when there are no source files.
df = pd.concat(per_file_frames) if per_file_frames else pd.DataFrame([])


In [13]:
# Run the project's TF-IDF step over the candidate table.
# NOTE(review): presumably this is where the 'tf-idf' and 'idf' columns read by
# later cells come from — confirm in dataframe_generation.add_tf_idf.
# `df` is rebound to the helper's return value, so re-running this cell on its
# own passes the already-processed frame back into the helper.
df = dataframe_generation.add_tf_idf(df)


In [14]:
# Split the data set into test, train and evaluation partitions.
# The split is done per source file (matched on the `filename` column), so all
# rows from one document end up in the same partition.

test_files = [
    "dataset/discover_physics/ch02/ch02.tex",
    "dataset/general_relativity/ch03/ch03.rbtex",
    "dataset/general_relativity/ch05/ch05.rbtex",
    "dataset/calculus/ch07/ch07.tex",
    "dataset/fundamentals-of-calculus/ch05/ch05.rbtex",
    "dataset/discover_physics/ch05/ch05.tex",
]
evaluation_files = ["dataset/arxiv_0304140.tex"]
non_train_files = test_files + evaluation_files

# .copy() makes these partitions independent frames: later cells add prediction
# columns to them, which on a plain slice of `df` raises SettingWithCopyWarning.
df_test = df.loc[df.filename.isin(test_files)].copy()
df_evaluation = df.loc[df.filename.isin(evaluation_files)].copy()

# Everything that is neither a test nor an evaluation file is training data.
df_train = df.loc[~df.filename.isin(non_train_files)]

# Oversample the positive class: add 100 extra copies of every training row
# with indices == 1. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; the row order and index labels are the same as before.
positive_rows = df_train[df_train.indices == 1]
df_train = pd.concat([df_train] + [positive_rows] * 100)


In [15]:
# Train the classifier on the training partition and predict on the test partition.

# Columns fed to the model: part-of-speech tag columns, the TeX markup columns
# and the tf-idf column.
feature_list = [
    'NN', 'NNP', 'NNS', 'VBG', 'VBD', 'VBN', 'VBZ', 'VBP', 'VB', 'CD',
    'JJ', 'JJS', 'JJR', 'FW', 'NNPS',
    'sections', 'subsections', 'large', 'underline',
    'tf-idf', 'italicized', 'bold',
]

x_train = df_train[feature_list]
x_test = df_test[feature_list]

# Target: the `indices` column (the training cell oversamples rows where it is 1).
y_train = df_train["indices"]
y_test = df_test["indices"]

# NOTE: the variable is called `rfc` because a RandomForestClassifier was used
# here previously; it now holds an MLPClassifier. The name is kept because the
# evaluation cell refers to it.
# NOTE(review): the features are passed to the MLP unscaled although
# StandardScaler is imported — consider whether scaling was intended.
rfc = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
rfc.fit(x_train, y_train)

pred_test = rfc.predict(x_test)

# assign() returns a new frame instead of writing a column onto what may be a
# slice of `df` (avoids SettingWithCopyWarning) and keeps the cell re-runnable.
df_test = df_test.assign(pred=pred_test)


In [16]:
# Evaluate: run the trained classifier on the held-out evaluation document and
# attach its predictions, probabilities and a combined ranking score.

x_eval = df_evaluation[feature_list]
y_eval = df_evaluation["indices"]
pred_evaluation = rfc.predict(x_eval)
pred_prob = rfc.predict_proba(x_eval)

# predict_proba returns one column per class; column 1 is taken as the
# probability of the positive class.
# NOTE(review): this assumes the labels are 0/1 so that column 1 is class 1 —
# confirm against rfc.classes_.
index_prob = pred_prob[:, 1].tolist()

# assign() builds a new frame rather than writing columns onto what may be a
# slice of `df`, which avoids SettingWithCopyWarning and makes the cell
# idempotent when re-run.
df_evaluation = df_evaluation.assign(pred=pred_evaluation, pred_prob=index_prob)

# Ranking score: model probability plus small bonuses for the NN / NNP columns
# and a quarter of the idf column. The weights (0.1, 0.1, 1/4) are hand-picked
# constants.
df_evaluation = df_evaluation.assign(
    score=(
        df_evaluation["pred_prob"]
        + 0.1 * df_evaluation['NN']
        + 0.1 * df_evaluation['NNP']
        + df_evaluation["idf"] / 4
    )
)

In [17]:

# Keep only the evaluation rows the classifier labelled as index terms.
predicted_as_index = df_evaluation["pred"] == 1
df_pred_indices = df_evaluation[predicted_as_index]

# Show the predicted terms, most confident first.
ranked_terms = df_pred_indices[["word", "pred_prob"]].sort_values("pred_prob", ascending=False)
print(ranked_terms)

                 word  pred_prob
89748        monopole   0.994121
90087     topological   0.949830
89883           boson   0.910788
89902           d2dr2   0.900366
90240           Dirac   0.899275
89825        singular   0.883756
89977    perturbation   0.863646
89834          vector   0.861534
89727      eigenvalue   0.861235
90263        magnetic   0.850489
90257       anomalous   0.839227
90189   configuration   0.839041
90216    polarization   0.835293
90356          action   0.828216
90122            spin   0.823423
89715       potential   0.821985
90313          radial   0.821951
89762          dilute   0.814227
90065     interaction   0.813824
90279     integration   0.811695
90201        massless   0.810150
90004    contribution   0.808075
90315          moment   0.807337
90359     confinement   0.805660
89935  orthonormality   0.805660
89769           field   0.803188
90027       YangMills   0.802921
90247        equation   0.801200
89838        solution   0.801178
89714     

In [18]:

# Generate the index for the evaluation document from the predicted terms.
# The output name is "index_" followed by the second '/'-separated component of
# the evaluation file's path.
evaluation_source = evaluation_files[0]
index_output_name = "index_" + evaluation_source.split('/')[1]
index_generator.index_gen(df_pred_indices["word"], evaluation_source, index_output_name)

