In [None]:
import nltk, collections
import numpy as np
from nltk.collocations import *
from nltk.corpus import stopwords
import subprocess
import pandas as pd
import re
from sklearn.externals import joblib
import random

from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier

import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
# Helper modules used by the cells below (presumably project-local; they are
# not part of the standard library or of the packages imported above).
import preprocessing
import text_processing
# This reads the *.tex sources and dumps them to a file
# (the cell output reports how many source files were processed).
preprocessing.dump_detex_data()

Processed 88 source files and dumped data


In [3]:
# Build the per-file feature dictionary. Judging by the output shown for this
# cell, each entry is keyed by a .tex path and holds lists such as 'bold',
# 'indices', 'italicized' and 'large' plus the detexed 'plaintext'.
feature_dict = preprocessing.populate_properties()
# Display the entry of a single chapter as a sanity check.
feature_dict['dataset/calculus/ch03/ch03.tex']

{'bold': [],
 'indices': ["l'H\\^{o}pital's rule",
  'continuous function',
  'indeterminate form',
  'derivative',
  'intermediate value theorem',
  'limit',
  "l'H\\^{o}pital's rule",
  'limit',
  'extreme value theorem',
  'composition'],
 'italicized': ['composition',
  'intermediate value theorem',
  'infinitely',
  'extreme value theorem',
  'con los',
  'equals',
  "didn't",
  'zero',
  'any',
  'not',
  'does',
  'ad hoc'],
 'large': [],
 'plaintext': 'b\' Limits and continuity ch:limits   Continuity   Intuitively, a continuous function is one whose graph has no sudden jumps in it; the graph is all a single connected piece.  Such a function can be drawn without picking the pen up off of the paper. Formally, a function  is defined to be continuous if for any real  and any infinitesimal ,  is infinitesimal.def-continuity continuous function   eg eg:discontinuous  Let the function  be defined by  for , and  for . Then  is discontinuous, since for , , which isn\\\'t infinitesimal. 

In [None]:
# Convert one LaTeX chapter to plain text with the external `detex` tool and
# run the project's text preprocessing on the result.
# (An earlier variant of this cell read the text from 'samplefile.txt' instead.)
try:
    process = subprocess.Popen(['detex', 'ch01.tex'], stdout=subprocess.PIPE)
except FileNotFoundError as exc:
    # Popen raises FileNotFoundError when the `detex` executable is missing.
    raise FileNotFoundError("Please verify that detex is installed on your system. https://github.com/pkubowicz/opendetex") from exc

output, err = process.communicate()
# `output` is bytes. str(output) would yield the "b'...'" repr (with escaped
# "\\n" and "\\'" sequences leaking into the text), so decode it instead.
# NOTE(review): assumes the .tex sources are UTF-8; undecodable bytes are
# replaced rather than aborting the cell.
input_text = output.decode('utf-8', errors='replace')
# Flatten the text onto a single line.
input_text = ' '.join(input_text.split('\n'))
print(input_text)
#doing text preprocessing
word_list = text_processing.process_text(input_text)

In [None]:
# Count how often each processed word occurs in the chapter.
frequency = collections.Counter(word_list)

# POS-tag every distinct word (materialised as a list for nltk).
tagged = nltk.pos_tag(list(frequency.keys()))

# Words carrying one of these POS tags are dropped.
notpos = ['VB','VBP','PRP','IN','RB','DT','WDT','WP','WRB','UH','TO','RBR','RBS','POS','MD','EX']
w1 = [word_tag for word_tag in tagged if word_tag[1] not in notpos]

# One row per remaining word: the word itself, its POS tag and its frequency.
df = pd.DataFrame({
    'word': [word for word, _ in w1],
    'pos': [tag for _, tag in w1],
})
df['wordcount'] = [frequency[word] for word in df['word']]

# One indicator column per POS tag of interest, initialised to 0.
cols = ['NN','NNP','NNS','VBG','VBD','VBN','VBZ','VBP','VB','CD','CC','LS','JJ','JJS','JJR','PDT','PRP','RP']
df1 = pd.DataFrame(0, index=np.arange(len(df['word'])), columns=cols)
df2 = pd.concat([df, df1], axis=1)

# Set the indicator column that matches each word's POS tag.
# DataFrame.set_value was removed in pandas 1.0, so use a boolean mask per tag.
for tag in df2['pos'].unique():
    if tag not in df2.columns:
        # A tag outside `cols` gets its own column, zero-filled so the other
        # rows do not end up as NaN.
        df2[tag] = 0
    df2.loc[df2['pos'] == tag, tag] = 1
print(df2.head(10))

In [None]:
#test only this. The other files are on my local machine and they will fail
# (the file handle is not called `input`, which would shadow the builtin)
with open('ch01.tex') as tex_file:
    data = tex_file.read()


def _strip_trailing_markup(entries):
    """Cut every captured entry at the first '}...index', '}...label' or '!'."""
    return [re.sub(r'(}.*?index|}.*?label|!).*', '', entry) for entry in entries]


def _braced_args(command, text):
    """Return the argument of every LaTeX `command{...}` call found in `text`.

    The argument must sit on one line and may contain one level of nested
    braces. Unlike a greedy `(.*)`, several calls on the same line are
    returned as separate entries.
    """
    pattern = r'\\' + re.escape(command) + r'\{((?:[^{}\n]|\{[^{}\n]*\})*)\}'
    return re.findall(pattern, text)


# NOTE(review): the three patterns below capture greedily, so when several
# \index{...} (or \section / \subsection) commands share a line, only the first
# entry survives the clean-up. Kept as-is; confirm whether that is acceptable.
indices = set(re.findall(r'\\index{(.*)}',data))
print(indices)
#we will include sub indexing, formatting inside indexing, labels etc for final model
indices = _strip_trailing_markup(indices)
sections = _strip_trailing_markup(set(re.findall(r'\\section{(.*)}',data)))
subsections = _strip_trailing_markup(set(re.findall(r'\\subsection{(.*)}',data)))

#\emph, \textit, {\em ...}
italicized = _braced_args('emph', data)
italicized.extend(_braced_args('textit', data))
#italicized.extend(re.findall(r'\\em([^}].*)',data))

#\textbf
bold = _braced_args('textbf', data)

#\uline{..}. , \uwave{...}
underline = _braced_args('uline', data)
underline.extend(_braced_args('uwave', data))

#\large,\Large,\LARGE,\huge,\Huge
# NOTE(review): this captures the size command names themselves, not the text
# they apply to, so the 'large' flag below is only set for words that equal
# such a command name.
large = re.findall(r'(\\large|\\Large|\\LARGE|\\huge|\\Huge)',data)

print(indices)
print(sections)
print(subsections)
print("bold")
print(bold)
print("italicized")
print(italicized)
print("underline")
print(underline)
print("large")
print(large)

# Flag columns paired with the terms that switch them on.
marked_terms = [
    ('index', indices),
    ('section', sections),
    ('subsection', subsections),
    ('italicized', italicized),
    ('bold', bold),
    ('underline', underline),
    ('large', large),
]

#initialise the columns
for column, _ in marked_terms:
    df2[column] = 0

# Mark every word that exactly equals one of the extracted terms and show the
# rows that were flagged.
for column, terms in marked_terms:
    df2.loc[df2['word'].isin(terms), column] = 1
    print(df2[df2[column] == 1])

In [None]:
def classifier():
    """Train a random forest on the saved feature table and report its fit.

    Reads 'datasets/dataframe.csv', uses the 'index' column as target and every
    column except 'index', 'word' and 'pos' as features. The fitted model is
    saved with saveModel() and then evaluated with Evaluate_accuracy().

    Note: the evaluation uses the same rows the model was trained on, so the
    reported scores describe the fit, not generalisation.
    """
    # Forward slash: the former "datasets\dataframe.csv" contained the invalid
    # escape "\d" and only resolved to a sub-directory on Windows.
    data = pd.read_csv("datasets/dataframe.csv")
    # Fill gaps with numeric 0 (not the string '0') so features and labels
    # keep a single numeric type.
    data = data.fillna(0)
    #Splitting the features and target variable
    y = data['index']
    X = data.drop(['index', 'word', 'pos'], axis=1)
    #Initialising the model
    rfr = RandomForestClassifier()
    #Fitting the data on the model
    rfr.fit(X, y)
    #Saving the model
    saveModel(rfr)
    pred = rfr.predict(X)
    Evaluate_accuracy(pred, y)
    return

In [None]:
def predict(model):
    """Run a fitted model on the saved feature table and report its scores.

    Parameters
    ----------
    model : object with a predict(X) method, e.g. the classifier trained by
        classifier().

    Reads 'datasets/dataframe.csv', uses the 'index' column as ground truth and
    every column except 'index', 'word' and 'pos' as features, then passes the
    predictions to Evaluate_accuracy().
    """
    # Forward slash: the former "datasets\dataframe.csv" contained the invalid
    # escape "\d" and only resolved to a sub-directory on Windows.
    data = pd.read_csv("datasets/dataframe.csv")
    # Fill gaps with numeric 0 (not the string '0') so features and labels
    # keep a single numeric type.
    data = data.fillna(0)

    #Splitting the features and target variable
    y = data['index']
    X = data.drop(['index', 'word', 'pos'], axis=1)

    pred = model.predict(X)
    Evaluate_accuracy(pred, y)
    return

In [None]:
def saveModel(model):
    """Serialise `model` with joblib to the file 'newlinear.model'."""
    target_path = 'newlinear.model'
    joblib.dump(model, target_path)

In [None]:
def loadModel():
    """Load and return the model stored by joblib in 'newlinear.model'.

    The previous version loaded the model but returned None, so callers such
    as main() never received it.
    """
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load('newlinear.model')
    return model

In [None]:
def Evaluate_accuracy(pred, true_value):
    """Print and plot evaluation metrics for binary (0/1) predictions.

    Parameters
    ----------
    pred : array-like of predicted labels.
    true_value : array-like of ground-truth labels, same length as `pred`.

    Prints the confusion counts, accuracy (in percent), RMSE, MAE and the
    micro/macro precision-recall-fscore summaries, then shows three heatmaps:
    the row-normalised classification report and the raw and row-normalised
    confusion matrices. Returns None.
    """
    # Plain arrays make every comparison positional; indexing a pandas Series
    # with [i] is label-based and breaks on a non-default index.
    pred = np.asarray(pred)
    true_value = np.asarray(true_value)

    TP = int(np.sum((pred == 1) & (true_value == 1)))
    # False positive: predicted 1 while the truth is 0 (this used to recount TP).
    FP = int(np.sum((pred == 1) & (true_value == 0)))
    TN = int(np.sum((pred == 0) & (true_value == 0)))
    # False negative: predicted 0 while the truth is 1 (this used to recount TN).
    FN = int(np.sum((pred == 0) & (true_value == 1)))

    print("TP, FP, TN, FN   :", TP, FP, TN, FN)
    print("Accuracy score is ", accuracy_score(true_value, pred)*100)
    rmse = np.sqrt(mean_squared_error(true_value, pred))
    print("Root Mean Squared Error: {}".format(rmse))
    print("Mean absolute error:", mean_absolute_error(true_value,pred))
    print("Micro stats:")
    print(precision_recall_fscore_support(true_value, pred, average='micro'))
    print("Macro stats:")
    print(precision_recall_fscore_support(true_value, pred, average='macro'))

    # Classification report heatmap: one row per label plus an 'avg' row.
    xticks = ['precision', 'recall', 'f1-score', 'support']
    # The per-label report covers every label seen in either array, so the
    # tick labels are built from both.
    yticks = list(np.unique(np.concatenate([true_value, pred])))
    yticks += ['avg']
    rep = np.array(precision_recall_fscore_support(true_value, pred)).T
    avg = np.mean(rep, axis=0)
    # For the support column the 'avg' row shows the total, not the mean.
    avg[-1] = np.sum(rep[:, -1])
    rep = np.insert(rep, rep.shape[0], avg, axis=0)
    plt.title('Classification Report (Normalized)')
    # Each row is divided by its own sum (support included).
    rep = rep.astype('float') / rep.sum(axis=1)[:, np.newaxis]
    sns.heatmap(rep, annot=True, xticklabels=xticks, yticklabels=yticks)
    plt.show()

    cm = confusion_matrix(true_value, pred)

    plt.title('Confusion matrix: Not Normalized')
    sns.heatmap(cm, annot=True, linewidths=.5)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
    plt.title('Confusion matrix: Normalized')
    # Normalise every row (true label) so its cells sum to 1.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.heatmap(cm, annot=True, linewidths=.5)
    plt.xlabel('Predicted value')
    plt.ylabel('True value')
    plt.show()
    return

In [None]:
def main():
    """Train and save the classifier, then load the saved model back."""
    classifier()
    # The loaded model is not used while the predict(model) step stays disabled.
    loadModel()
    #predict(model)
    

In [None]:
# Run the training pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

In [None]:
#Plotting graph to show pos of words occurring in index
# Forward slash: the former "datasets\dataframe.csv" contained the invalid
# escape "\d" and only resolved to a sub-directory on Windows.
data = pd.read_csv("datasets/dataframe.csv")

# POS tags of the rows flagged as index terms, selected with a boolean mask.
list_pos = data.loc[data['index'] == 1, 'pos'].tolist()

# POS tags are strings, so count them and draw a bar chart; DataFrame.hist()
# only handles numeric columns and had nothing to plot here.
pos_counts = pd.Series(list_pos, dtype=object).value_counts()

if pos_counts.empty:
    print("No rows are flagged as index terms; nothing to plot.")
else:
    ax = pos_counts.plot(kind='bar')
    ax.grid(True)
    plt.title("count of Parts of speech of words appearing in Index")
    plt.ylabel('count')
    plt.xlabel('POS')
    plt.show()


### Done so far. The Rake example below has been kept untouched.

## Rake Example:
Below is sample code that gets ranked phrases from RAKE. It is mainly used to generate keyphrases. We can fine-tune this.

In [None]:
# rake_nltk is imported here, next to its only use in this notebook.
from rake_nltk import Rake
# Rake instance with its default settings.
r = Rake()
# `input_text` is the detexed chapter text built in an earlier cell, so that
# cell has to run before this one.
r.extract_keywords_from_text(input_text)
# Print the ranked phrases extracted from the text.
print(r.get_ranked_phrases())