In [57]:
import subprocess
import pandas as pd
import re
import glob
import sys

In [17]:
features = ["indices","sections","subsections","italicized","bold","underline","large"]

In [18]:
def feature_extraction(data):
    dict = {}
    indices = set(re.findall(r'\\index{(.*)}',data))
    #we will include sub indexing, formatting inside indexing, labels etc for final model
    indices = [re.sub(r'(}.*?index|}.*?label|!).*','',string) for string in indices]
    sections = set(re.findall(r'\\section{(.*)}',data))
    sections = [re.sub(r'(}.*?index|}.*?label|!).*','',string) for string in sections]
    subsections = set(re.findall(r'\\subsection{(.*)}',data))
    subsections = [re.sub(r'(}.*?index|}.*?label|!).*','',string) for string in subsections]
    
    italicized = re.findall(r'\\emph{(.*?)}',data)
    italicized.extend(re.findall(r'\\textit{(.*)}',data))
    #italicized.extend(re.findall(r'\\em([^}].*)',data))
    
    
    bold = re.findall(r'\\textbf{(.*)}',data)
    
    underline = re.findall(r'\\uline{(.*)}',data)
    underline.extend(re.findall(r'\\uwave{(.*)}',data))
    
    large = re.findall(r'(\\large|\\Large|\\LARGE|\\huge|\\Huge)',data)
    dict["indices"] = indices
    dict["sections"] = sections
    dict["subsections"] = subsections
    dict["italicized"] = italicized
    dict["bold"] = bold
    dict["underline"] = underline
    dict["large"] = large
    return dict

In [46]:
input_text={}

for filename in glob.glob("calculus/*/*.tex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

for key in input_text:
    print(key)
print(input_text["calculus/ch03/ch03.tex"])

calculus/ch03/ch03.tex
calculus/ch02/ch02.tex
calculus/ch09/ch09.tex
calculus/ch05/ch05.tex
calculus/ch07/ch07.tex
calculus/ch08/ch08.tex
calculus/ch01/ch01.tex
calculus/ch04/ch04.tex
calculus/ch06/ch06.tex
{'sections': ['Limits at infinity', 'Another perspective on indeterminate forms', 'Limits', "L'H\\^{o", 'Continuity', "Generalizations of l'H\\^{o"], 'bold': [], 'large': [], 'underline': [], 'indices': ['limit', 'derivative', 'intermediate value theorem', "l'H\\^{o}pital's rule", 'extreme value theorem', 'composition', "l'H\\^{o}pital's rule", 'indeterminate form', 'continuous function', 'limit'], 'subsections': ['The extreme value theorem', 'Multiple applications of the rule', 'The intermediate value theorem', 'Limits at infinity', 'The indeterminate form $\\infty/\\infty$'], 'italicized': ['composition', 'intermediate value theorem', 'infinitely', 'extreme value theorem', 'con los', 'equals', "didn't", 'zero', 'any', 'not', 'does', 'ad hoc']}


In [20]:

for filename in glob.glob("discover_physics/*/*.tex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)
    


In [21]:
for filename in glob.glob("fundamentals-of-calculus/*/*.rbtex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [22]:
for filename in glob.glob("general_relativity/*/*.rbtex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [23]:
for filename in glob.glob("special_relativity/*/*.rbtex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [24]:

for filename in glob.glob("ThinkCPP/book/ch*.tex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [25]:
for filename in glob.glob("ThinkJava/thinkjava.tex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [52]:

for filename in glob.glob("javajavajava/texfiles/*.tex"):
    with open(filename,"r") as f:
        print(filename)
        data = f.read()
        input_text[filename] = feature_extraction(data)

javajavajava/texfiles/0.intro.tex
javajavajava/texfiles/1.programs.tex
javajavajava/texfiles/10.exceptions.tex
javajavajava/texfiles/11.files.tex
javajavajava/texfiles/11.intro.tex
javajavajava/texfiles/12.recursion.tex
javajavajava/texfiles/13.guis.tex
javajavajava/texfiles/14.threads.tex
javajavajava/texfiles/15.sockets.tex
javajavajava/texfiles/16.datastructs.tex
javajavajava/texfiles/2.objects.tex
javajavajava/texfiles/3.methods.tex
javajavajava/texfiles/4.uis.tex
javajavajava/texfiles/5.data.tex
javajavajava/texfiles/6.loops.tex
javajavajava/texfiles/7.strings.tex
javajavajava/texfiles/8.oop.tex
javajavajava/texfiles/9.arrays.tex


In [53]:

for filename in glob.glob("lm/lm/qm/*.rbtex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [54]:
for filename in glob.glob("lm/lm/vw/*.rbtex"):
    with open(filename) as f:
        data = f.read()
        input_text[filename] = feature_extraction(data)

In [55]:
len_dict = {}
import csv
with open('feature.csv', 'w') as f:  # Just use 'w' mode in 3.x
    w = csv.writer(f,delimiter=',')
    #w.writeheader()
    for key in input_text:
        len_dict[key]= len(input_text[key])
        w.writerow([key,input_text[key]])

In [58]:


csv.field_size_limit(sys.maxsize)
with open('feature.csv', 'r') as f:
    w = csv.reader(f,delimiter=',')
    for row in w:
        print(row[0])
        #row[1] contains the corresponding content.

lm/lm/vw/b.rbtex
javajavajava/texfiles/7.strings.tex
javajavajava/texfiles/1.programs.tex
javajavajava/texfiles/14.threads.tex
lm/lm/vw/a.rbtex
calculus/ch04/ch04.tex
javajavajava/texfiles/8.oop.tex
javajavajava/texfiles/10.exceptions.tex
javajavajava/texfiles/2.objects.tex
javajavajava/texfiles/0.intro.tex
lm/lm/qm/b.rbtex
javajavajava/texfiles/5.data.tex
lm/lm/vw/c.rbtex
calculus/ch07/ch07.tex
lm/lm/qm/c.rbtex
calculus/ch06/ch06.tex
lm/lm/vw/d.rbtex
calculus/ch03/ch03.tex
javajavajava/texfiles/13.guis.tex
lm/lm/qm/a.rbtex
javajavajava/texfiles/4.uis.tex
javajavajava/texfiles/16.datastructs.tex
javajavajava/texfiles/12.recursion.tex
calculus/ch09/ch09.tex
calculus/ch01/ch01.tex
javajavajava/texfiles/11.files.tex
javajavajava/texfiles/9.arrays.tex
calculus/ch08/ch08.tex
javajavajava/texfiles/11.intro.tex
calculus/ch02/ch02.tex
lm/lm/qm/d.rbtex
javajavajava/texfiles/15.sockets.tex
calculus/ch05/ch05.tex
javajavajava/texfiles/3.methods.tex
javajavajava/texfiles/6.loops.tex
