In [9]:
# Imports
import glob
import os
import string
import unicodedata

# Sklearn imports
import numpy
import pandas
import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.svm

In [82]:
# Character offsets that are treated as paragraph starts (the positive training
# labels), keyed by the path of the sample document they index into.
# The offsets index into the UTF-8-decoded contents of each file, which is how the
# training cell walks the documents.
paragraph_break_positions = {'../../data/samples/lexpredict-contraxsuite-samples-master/agreements/construction/1000694_2002-03-15_AGREEMENT OF LEASE-W.M.RICKMAN CONSTRUCTION CO..txt':
    [7, 50, 71, 106, 130, 152, 172, 202, 228, 250, 708, 745, 764, 795, 889, 909, 953, 986, 1012, 1042, 1061, 1299, 1404, 1496, 1653, 1690, 1709, 1734, 1763, 1788, 1806, 1880, 2039, 2053, 2075, 
    2463, 2918, 4037, 4326, 4591, 4610, 4999, 5721, 6334, 6622, 7285, 7899, 7920, 8006, 8180, 
    8349, 8355, 8871, 10413, 10908, 10931, 11038, 11368, 11639, 11917, 12921, 13930, 13954, 
    14611, 15232, 15931, 16424, 17286, 17304, 17407, 17879, 18295, 19791, 20472, 21455, 22707,
    23126, 23615, 23910, 23931, 24538, 24569, 24650, 25105, 25539, 25848, 25873, 26087, 27182, 27203, 
    27365, 27576, 27672, 27778, 27999, 28059, 28181, 28316, 28458, 28827, 29247, 29546, 29565, 
    29706, 29840,  30130, 30747, 30767, 30772, 31113, 31421, 32240, 32259, 32385, 33874, 33994],
                             '../../data/samples/lexpredict-contraxsuite-samples-master/agreements/employment/1000736_2005-05-10_AMENDMENT TO EMPLOYMENT AGREEMENT.txt':
                             [7, 44, 58, 93, 317, 493, 606, 816, 1007, 1159, 1330, 1557,  2022, 2205, 2385, 3024, 3231, 3371, 3435],
}

In [83]:
# Characters that each get their own one-hot feature: ASCII whitespace + punctuation
paragraph_characters = []
paragraph_characters.extend(string.whitespace)
paragraph_characters.extend(string.punctuation)

def build_paragraph_start_features(text, position, position_window_pre, position_window_post, characters=paragraph_characters):
    """
    Build a feature vector describing the characters around a character position.

    For every offset i in [-position_window_pre, position_window_post] the vector holds:
      * char_is_alpha_i / char_is_number_i / char_is_punct_i / char_is_whitespace_i:
        1/0 flags derived from the Unicode general category (L*, N*, P*, Z*) of
        text[position + i];
      * char_<c>_i for every c in `characters`: 1 if text[position + i] == c else 0.
    It also holds char_<c> for every c in `characters`, describing text[position] itself.

    Offsets that fall outside the text (before the first or after the last character)
    get None for all of their features, so every vector built with the same window
    and character list has exactly the same keys in the same order. The same applies
    to the char_<c> features when `position` itself is outside the text.

    Note that char_is_whitespace only fires for the Unicode separator categories;
    control characters such as "\\n" and "\\t" are category Cc and are only captured
    by the per-character char_<c>_i features.

    :param text: string to extract features from
    :param position: character offset in text that the features describe
    :param position_window_pre: number of characters before position to include
    :param position_window_post: number of characters after position to include
    :param characters: iterable of single characters to one-hot encode
    :return: dict mapping feature name to 1, 0 or None
    """
    # Feature vector
    v = {}
    text_length = len(text)

    # Iterate through window
    for i in range(-position_window_pre, position_window_post + 1):
        index = position + i

        if 0 <= index < text_length:
            pos_char = text[index]
            category = unicodedata.category(pos_char)

            # Character class flags
            v["char_is_alpha_{0}".format(i)] = 1 if category.startswith("L") else 0
            v["char_is_number_{0}".format(i)] = 1 if category.startswith("N") else 0
            v["char_is_punct_{0}".format(i)] = 1 if category.startswith("P") else 0
            v["char_is_whitespace_{0}".format(i)] = 1 if category.startswith("Z") else 0

            # Build character vector
            for c in characters:
                v["char_{0}_{1}".format(c, i)] = 1 if pos_char == c else 0
        else:
            # Outside the text: a negative index must not wrap around to the end of
            # the string, and an index past the end has no character at all.
            v["char_is_alpha_{0}".format(i)] = None
            v["char_is_number_{0}".format(i)] = None
            v["char_is_punct_{0}".format(i)] = None
            v["char_is_whitespace_{0}".format(i)] = None

            # Build character vector
            for c in characters:
                v["char_{0}_{1}".format(c, i)] = None

    # Build character vector for the position itself
    center_char = text[position] if 0 <= position < text_length else None
    for c in characters:
        if center_char is None:
            v["char_{0}".format(c)] = None
        else:
            v["char_{0}".format(c)] = 1 if center_char == c else 0

    return v


In [84]:
# Smoke test: index 15 is the "\n" between the two sentences; use a window of
# three characters on each side and display the resulting feature dict.
build_paragraph_start_features("This is a test.\nThis is another test.", 15, 3, 3)

{'char_\t': 0,
 'char_\t_-1': 0,
 'char_\t_-2': 0,
 'char_\t_-3': 0,
 'char_\t_0': 0,
 'char_\t_1': 0,
 'char_\t_2': 0,
 'char_\t_3': 0,
 'char_\n': 1,
 'char_\n_-1': 0,
 'char_\n_-2': 0,
 'char_\n_-3': 0,
 'char_\n_0': 1,
 'char_\n_1': 0,
 'char_\n_2': 0,
 'char_\n_3': 0,
 'char_\x0b': 0,
 'char_\x0b_-1': 0,
 'char_\x0b_-2': 0,
 'char_\x0b_-3': 0,
 'char_\x0b_0': 0,
 'char_\x0b_1': 0,
 'char_\x0b_2': 0,
 'char_\x0b_3': 0,
 'char_\x0c': 0,
 'char_\x0c_-1': 0,
 'char_\x0c_-2': 0,
 'char_\x0c_-3': 0,
 'char_\x0c_0': 0,
 'char_\x0c_1': 0,
 'char_\x0c_2': 0,
 'char_\x0c_3': 0,
 'char_\r': 0,
 'char_\r_-1': 0,
 'char_\r_-2': 0,
 'char_\r_-3': 0,
 'char_\r_0': 0,
 'char_\r_1': 0,
 'char_\r_2': 0,
 'char_\r_3': 0,
 'char_ ': 0,
 'char_ _-1': 0,
 'char_ _-2': 0,
 'char_ _-3': 0,
 'char_ _0': 0,
 'char_ _1': 0,
 'char_ _2': 0,
 'char_ _3': 0,
 'char_!': 0,
 'char_!_-1': 0,
 'char_!_-2': 0,
 'char_!_-3': 0,
 'char_!_0': 0,
 'char_!_1': 0,
 'char_!_2': 0,
 'char_!_3': 0,
 'char_"': 0,
 'char_"_-1

In [86]:
#### Model parameters
position_window_pre = 3
position_window_post = 3

# Setup feature and target data
feature_data = []
target_data = []

# Test file
file_name = '../../data/samples/lexpredict-contraxsuite-samples-master/agreements/employment/1000736_2005-05-10_AMENDMENT TO EMPLOYMENT AGREEMENT.txt'

# Read as bytes and decode explicitly: text mode would translate "\r\n" to "\n"
# and shift every character offset. The context manager closes the handle.
with open(file_name, "rb") as input_file:
    file_buffer = input_file.read().decode("utf-8")

# Scan for candidate paragraph starts: a letter, digit or bracket directly after a
# line break. Start at 1 because position 0 has no preceding character and
# file_buffer[-1] would silently wrap around to the last character of the file.
for pos_id in range(1, len(file_buffer)):
    if file_buffer[pos_id-1] in ["\n", "\r"]:
        char_cat = unicodedata.category(file_buffer[pos_id])
        if char_cat.startswith("N") or char_cat.startswith("L") or file_buffer[pos_id] in ["(", "[", "]", ")"]:
            # Uncomment to inspect each candidate with ten characters of context:
            #print((file_name, "paragraph", pos_id, file_buffer[(pos_id-10):pos_id] + "|" + file_buffer[pos_id:(pos_id+10)]))
            pass


In [87]:
#### Model parameters
position_window_pre = 5
position_window_post = 5

# Fraction of non-break positions kept as negative examples
negative_sample_rate = 0.1

# Dedicated seeded generator so the sampled negatives (and therefore the trained
# models and their scores) are the same on every run.
negative_sampler = numpy.random.RandomState(0)

# Setup feature and target data
feature_data = []
target_data = []

# Iterate through the labelled files
for file_name in sorted(paragraph_break_positions):
    # Read as bytes and decode explicitly so no newline translation shifts the
    # labelled character offsets; the context manager closes the handle.
    with open(file_name, "rb") as input_file:
        file_buffer = input_file.read().decode("utf-8")

    # Set membership is O(1); testing against the list for every character of the
    # document would be quadratic.
    break_positions = set(paragraph_break_positions[file_name])

    for pos_id in range(len(file_buffer)):
        if pos_id in break_positions:
            # Labelled paragraph start: always kept as a positive example
            target = 1
        elif negative_sampler.random_sample() <= negative_sample_rate:
            # Any other position: kept as a negative example for a random subset
            target = 0
        else:
            continue

        feature_data.append(build_paragraph_start_features(file_buffer, pos_id, position_window_pre, position_window_post))
        target_data.append(target)

# Convert to DF; features outside the text are None/NaN and are encoded as -1
feature_df = pandas.DataFrame(feature_data).fillna(-1)

In [88]:
# Sanity check: print the feature matrix shape and the number of labels so the
# row counts can be compared.
print(feature_df.shape)
print(len(target_data))

(3895, 500)
3895


In [89]:
# Fit a linear-kernel support vector classifier with probability estimates enabled
model_svc = sklearn.svm.SVC(probability=True, kernel='linear').fit(feature_df, target_data)

# Score on the same rows the model was fitted on (training-set performance)
predicted_svc = model_svc.predict(feature_df)
svc_report = sklearn.metrics.classification_report(target_data, predicted_svc)
svc_f1 = sklearn.metrics.f1_score(target_data, predicted_svc)
print(svc_report)
print(svc_f1)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.97      1.00      0.99       135

avg / total       1.00      1.00      1.00      3895

0.985401459854


In [90]:
# Build model
# random_state is fixed because the classifier shuffles the training data while
# fitting; without a seed the fitted model and its scores change from run to run.
model_pac = sklearn.linear_model.PassiveAggressiveClassifier(random_state=0)
model_pac.fit(feature_df, target_data)

# Assess model on the same rows it was fitted on (training-set performance)
predicted_pac = model_pac.predict(feature_df)
print(sklearn.metrics.classification_report(target_data, predicted_pac))
print(sklearn.metrics.f1_score(target_data, predicted_pac))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.97      1.00      0.99       135

avg / total       1.00      1.00      1.00      3895

0.985401459854


In [91]:
# Fit a cross-validated logistic regression with the library defaults
model_log = sklearn.linear_model.LogisticRegressionCV().fit(feature_df, target_data)

# Score on the same rows the model was fitted on (training-set performance)
predicted_log = model_log.predict(feature_df)
log_report = sklearn.metrics.classification_report(target_data, predicted_log)
log_f1 = sklearn.metrics.f1_score(target_data, predicted_log)
print(log_report)
print(log_f1)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.99      1.00      0.99       135

avg / total       1.00      1.00      1.00      3895

0.992647058824


In [92]:
# Build model
# random_state is fixed because stochastic gradient descent shuffles the training
# data while fitting; without a seed the fitted model and its scores change from
# run to run.
model_sgd = sklearn.linear_model.SGDClassifier(loss="perceptron", random_state=0)
model_sgd.fit(feature_df, target_data)

# Assess model on the same rows it was fitted on (training-set performance)
predicted_sgd = model_sgd.predict(feature_df)
print(sklearn.metrics.classification_report(target_data, predicted_sgd))
print(sklearn.metrics.f1_score(target_data, predicted_sgd))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.99      1.00      0.99       135

avg / total       1.00      1.00      1.00      3895

0.992647058824


In [93]:
# Build model
# random_state is fixed because extra-trees draws random split candidates; without
# a seed the fitted forest and its scores change from run to run.
model_et = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50, random_state=0)
model_et.fit(feature_df, target_data)

# Assess model on the same rows it was fitted on (training-set performance)
predicted_et = model_et.predict(feature_df)
print(sklearn.metrics.classification_report(target_data, predicted_et))
print(sklearn.metrics.f1_score(target_data, predicted_et))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.99      1.00      1.00       135

avg / total       1.00      1.00      1.00      3895

0.9963099631


In [94]:
# Soft-voting ensemble over the logistic-regression and extra-trees estimators
vote_members = [('log', model_log), ('et', model_et)]
model_vote = sklearn.ensemble.VotingClassifier(estimators=vote_members, voting='soft')
model_vote.fit(feature_df, target_data)

# Score on the same rows the ensemble was fitted on (training-set performance)
predicted_vote = model_vote.predict(feature_df)
vote_report = sklearn.metrics.classification_report(target_data, predicted_vote)
vote_f1 = sklearn.metrics.f1_score(target_data, predicted_vote)
print(vote_report)
print(vote_f1)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      3760
          1       0.99      1.00      1.00       135

avg / total       1.00      1.00      1.00      3895

0.9963099631


In [95]:
# Set production model
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# use the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The logistic regression model is the one persisted for later use
model = model_log
joblib.dump(model, "paragraph_segmenter.pickle")

['paragraph_segmenter.pickle']

In [96]:
# Test OOS
file_list = sorted(glob.glob("../../data/samples/lexpredict-contraxsuite-samples-master/agreements/employment/*.txt"))
test_lines = []
test_feature_data = []

# Only the first matching file is scored: the paragraph-printing cell reads
# file_buffer and test_predicted_breaks for that single document.
for file_name in file_list[:1]:
    # Read as bytes and decode explicitly so character offsets match the training
    # data; the context manager closes the handle.
    with open(file_name, "rb") as input_file:
        file_buffer = input_file.read().decode("utf-8")

    # Score at most the first 10000 character positions
    test_feature_data = [
        build_paragraph_start_features(file_buffer, pos_id, position_window_pre, position_window_post)
        for pos_id in range(min(10000, len(file_buffer)))
    ]

    # Align the columns with the training matrix before predicting: the column
    # order of a DataFrame built from dicts follows the order in which keys are
    # first seen, which is not guaranteed to match the order the model was fitted on.
    test_feature_df = pandas.DataFrame(test_feature_data).reindex(columns=feature_df.columns).fillna(-1)
    test_predicted_breaks = model.predict_proba(test_feature_df)
    

# Predict page breaks
#test_feature_df = pandas.DataFrame(test_feature_data).fillna(-1)
#test_predicted_lines = model.predict_proba(test_feature_df)

In [1]:
# One row per scored character position; the columns are the class probabilities
predicted_df = pandas.DataFrame(test_predicted_breaks, columns=["prob_false", "prob_true"])

# Positions whose paragraph-start probability reaches the 0.5 decision threshold
paragraph_breaks = predicted_df.loc[predicted_df["prob_true"] >= 0.5, :].index.tolist()

# Close the final paragraph at the end of the scored text. Pairing only
# consecutive breaks would silently drop everything after the last predicted break.
paragraph_bounds = paragraph_breaks + [len(predicted_df)]

for p0, p1 in zip(paragraph_bounds, paragraph_bounds[1:]):
    # Flatten the paragraph onto one line for display
    paragraph = file_buffer[p0:p1].strip().replace("\n", " ").replace("  ", " ")
    print(paragraph)
    print("="*32)

NameError: name 'pandas' is not defined