In [1]:
import numpy as np
import pandas as pd

import re
import fileinput
from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import PassiveAggressiveClassifier

In [2]:
corpus = open('corpus.txt', 'r', encoding='utf-8-sig').read()

In [3]:
begin_title = '\n{3,}\s+THE SECRET CACHE\n{3,}.*'
corpus = re.search(begin_title, corpus, flags=re.M+re.S).group()
corpus = corpus.replace('\n', ' ') 
corpus = re.sub(r' {2,}', ' ', corpus)
corpus = corpus.replace('----', '')

In [4]:
valid_forms = ['am','are','were','was','is','been','being','be']
blank = '----'

In [5]:
tokens = wordpunct_tokenize(corpus)

In [6]:
def detect(tokens):
    return [t for t in tokens if t in valid_forms]
    
def replace_blank(tokens):
    return [blank if t in valid_forms else t for t in tokens]

def create_windows(tokens, window_size=3):
    X = []
    for i, word in enumerate(tokens):
        if word == blank:
            window = tokens[i-window_size:i] + tokens[i+1:i+window_size+1]
            window = ' '.join(window)
            X.append(window)    
    return X

In [7]:
y = detect(tokens)
tokens = replace_blank(tokens)
X = create_windows(tokens)

In [8]:
X

['water . It the first day',
 '. Three canoes engaged in the',
 'the nearest canoe a fine figure',
 'the bank had holding his breath',
 ', when he close enough to',
 'close enough to heard above the',
 'think it may so , since',
 ', the boy running along the',
 '. Monsieur Cadotte alone , going',
 'Hugh Beaupré ” written in a',
 'that the writing not his father',
 '. The letter in French ,',
 '. The handwriting good , better',
 'not where he and the waves',
 '. But he sore hurt in',
 'that the furs for you and',
 '. “ It bad news ?”',
 '“ My father dead .” “',
 '’ s voice vibrant with sympathy',
 '. “ It not , I',
 'No , he wrecked .” Hugh',
 ', and he so sorely hurt',
 '. If there anything I can',
 'Montreal .” Hugh at a loss',
 'though Cadotte had , Hugh dared',
 'Jean Beaupré had a free trader',
 'his father had in the habit',
 'likely he had under some contract',
 'the fur companies hot and bitter',
 'bitter . Hugh very sure that',
 'that the pelts destined for the',
 '. The boy in

In [9]:
y

['was',
 'were',
 'was',
 'been',
 'was',
 'be',
 'be',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'are',
 'is',
 'is',
 'was',
 'was',
 'was',
 'was',
 'is',
 'was',
 'been',
 'been',
 'been',
 'been',
 'was',
 'was',
 'were',
 'was',
 'was',
 'is',
 'was',
 'was',
 'was',
 'be',
 'were',
 'are',
 'am',
 'be',
 'be',
 'be',
 'was',
 'been',
 'been',
 'be',
 'be',
 'been',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'be',
 'being',
 'was',
 'been',
 'be',
 'been',
 'was',
 'be',
 'was',
 'be',
 'was',
 'was',
 'was',
 'am',
 'was',
 'are',
 'is',
 'is',
 'is',
 'am',
 'is',
 'were',
 'are',
 'is',
 'be',
 'is',
 'was',
 'be',
 'was',
 'was',
 'were',
 'was',
 'were',
 'were',
 'was',
 'were',
 'was',
 'been',
 'was',
 'being',
 'be',
 'was',
 'been',
 'was',
 'are',
 'was',
 'been',
 'was',
 'been',
 'be',
 'were',
 'be',
 'was',
 'be',
 'be',
 'was',
 'was',
 'be',
 'were',
 'be',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'was',
 'were

In [10]:
l = LabelEncoder()
y = l.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
vectorizer = CountVectorizer()
classifier = PassiveAggressiveClassifier()
pipe = make_pipeline(vectorizer, classifier)
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)



In [11]:
sample = str(input("Enter the text:\n"))
sample = wordpunct_tokenize(sample)
sample = create_windows(sample)
prediction = l.inverse_transform(pipe.predict(sample))
print("\n Predictions:")
with open('output.txt', 'w') as f:
    f.write('\n'.join(prediction))
    print('\n'.join(prediction))

Enter the text:
When the modern Olympics began in 1896, the initiators and organizers ---- looking for a great popularizing event, recalling the ancient glory of Greece. The idea of a marathon race came from Michel Breal, who wanted the event to feature in the first modern Olympic Games in 1896 in Athens. This idea was heavily supported by Pierre de Coubertin, the founder of the modern Olympics, as well as by the Greeks. The Greeks staged a selection race for the Olympic marathon on 10 March 1896 that ---- won by Charilaos Vasilakos in 3 hours and 18 minutes (with the future winner of the introductory Olympic Games marathon coming in fifth). The winner of the first Olympic Marathon, on 10 April 1896 (a male-only race), was Spyridon "Spyros" Louis, a Greek water-carrier, in 2 hours 58 minutes and 50 seconds. The women's marathon ---- introduced at the 1984 Summer Olympics (Los Angeles, USA) and ---- won by Joan Benoit of the United States with a time of 2 hours 24 minutes and 52 seconds

  if diff:
