In [1]:
import numpy as np
import pandas as pd
import scipy as sp

In [2]:
import nltk
from nltk.tokenize import word_tokenize, TreebankWordDetokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [3]:
import torch
import tensorflow as tf

# Importing Data

In [4]:
# Fetch the NLTK resources this notebook asks for. Of the NLTK names imported
# above, the visible cells only call word_tokenize and TreebankWordDetokenizer;
# the tagger and WordNet data are presumably for cells not shown -- TODO confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
# Load the corpus; the path is relative to the notebook's working directory.
dataset = pd.read_csv('bbc_text_cls.csv')

In [7]:
# Column 0 holds the article text. to_numpy() is the pandas-recommended
# replacement for .values and always returns a plain NumPy array.
data = dataset.iloc[:, 0].to_numpy()

# Sparse Count Table (AltSparse)

In [25]:
class AltSparse():
  """Sparse table of token counts keyed by a (prev, suff) context pair.

  Only contexts that have actually been recorded take up space: the table is
  a dict of dicts rather than a dense array.
  """

  def __init__(self):
    # Maps (prev, suff) -> {token: times that token was recorded for the pair}.
    self.x = {}

  def update(self, prev, suff, token):
    """Add one occurrence of `token` under the context (prev, suff)."""
    counts = self.x.setdefault((prev, suff), {})
    counts[token] = counts.get(token, 0) + 1

  def prob(self, prev, suff):
    """Return the recorded tokens for (prev, suff) and their relative frequencies.

    Returns:
      (label, probs): `label` is a list of tokens in insertion order and
      `probs` is a NumPy array of the matching counts divided by their sum.

    Raises:
      KeyError: if the context (prev, suff) was never passed to update().
    """
    counts = self.x[(prev, suff)]
    label = list(counts)
    weights = np.array([counts[tok] for tok in label])
    return label, weights / weights.sum()

# Model

In [61]:
class ArticleSpinner():
  """Rewrite ("spin") text by resampling each interior word from its neighbours.

  fit() walks every run of three consecutive tokens (prev, middle, next) and
  records one count of `middle` under the context (prev, next). generate()
  then replaces each interior token of a document with a token sampled from
  the counts recorded for that token's own (prev, next) context.

  Attributes:
    x: AltSparse table holding the context -> token counts.
    detokenize: TreebankWordDetokenizer used to join tokens back into text.
    n: number of documents processed by fit() so far.
  """

  def __init__(self):
    self.x = AltSparse()
    self.detokenize = TreebankWordDetokenizer()
    self.n = 0

  def fit(self, docs):
    """Accumulate context counts from `docs`.

    Args:
      docs: an iterable of strings. A single string is treated as one
        document instead of being iterated character by character.
    """
    if isinstance(docs, str):
      docs = [docs]
    for doc in docs:
      tokens = word_tokenize(doc.lower())
      # Slide a three-token window; the middle token is what gets counted.
      for prev, middle, suff in zip(tokens, tokens[1:], tokens[2:]):
        self.x.update(prev, suff, middle)
      self.n += 1

  def generate_token(self, prev='', suff=''):
    """Sample one token for the context (prev, suff).

    Sampling uses NumPy's global RNG (np.random.choice), so seed it beforehand
    if reproducible output is needed.

    Raises:
      KeyError: if the context was never seen during fit().
    """
    labels, probs = self.x.prob(prev, suff)
    idx = np.random.choice(len(labels), p=probs)
    return labels[idx]

  def generate(self, doc):
    """Return a spun, lower-cased version of `doc`.

    The first and last tokens are kept; every token in between is replaced by
    a sample drawn for its (previous, next) context. If that context was never
    seen during fit(), the original token is kept instead of raising.

    Args:
      doc: a string, or an indexable whose first element is a string (only
        element 0 is used).

    Returns:
      The detokenized text, or '' when `doc` contains no tokens.
    """
    # isinstance (not type ==) so str subclasses such as np.str_ are spun whole.
    text = doc if isinstance(doc, str) else doc[0]
    tokens = [tok for tok in word_tokenize(text.lower()) if tok != '']
    if not tokens:
      return ''
    if len(tokens) < 3:
      # No interior token to replace; return the tokens as they are (a single
      # token used to be emitted twice).
      return self.detokenize.detokenize(tokens)

    new_doc = [tokens[0]]
    for prev, original, suff in zip(tokens, tokens[1:], tokens[2:]):
      try:
        new_doc.append(self.generate_token(prev, suff))
      except KeyError:
        # Context absent from the training data: leave the word untouched.
        new_doc.append(original)
    new_doc.append(tokens[-1])
    return self.detokenize.detokenize(new_doc)

In [62]:
# Start from an empty spinner; it holds no counts until fit() is called.
model = ArticleSpinner()

In [63]:
# Count, over every article, which token appears between each (previous, next) pair.
model.fit(data)

In [68]:
# Show the raw article that is spun in the next cell, for comparison.
data[303]

'Bombardier chief to leave company\n\nShares in train and plane-making giant Bombardier have fallen to a 10-year low following the departure of its chief executive and two members of the board.\n\nPaul Tellier, who was also Bombardier\'s president, left the company amid an ongoing restructuring. Laurent Beaudoin, part of the family that controls the Montreal-based firm, will take on the role of CEO under a newly created management structure. Analysts said the resignations seem to have stemmed from a boardroom dispute. Under Mr Tellier\'s tenure at the company, which began in January 2003, plans to cut the worldwide workforce of 75,000 by almost a third by 2006 were announced. The firm\'s snowmobile division and defence services unit were also sold and Bombardier started the development of a new aircraft seating 110 to 135 passengers.\n\nMr Tellier had indicated he wanted to stay at the world\'s top train maker and third largest manufacturer of civil aircraft until the restructuring was

In [69]:
# generate() samples replacements with np.random.choice; seed NumPy's global
# RNG first so this cell yields the same text on every run (42 is arbitrary).
np.random.seed(42)
new_doc = model.generate(data[303])
new_doc

'bombardier chief to leave company collapsed in rome and plane-making giant bombardier have access to a five-week low at the creation of its chief executive and several thirds of key case . paul richards, it is also it\'s business," the company amid an ongoing evaluation . laurent beaudoin, east of a numbers that avoided the uk firm, will appear in the option of 1985 faces a newly created management systems . it said the resignations seem to have come from a recent dispute . "mr makarov\'s world at the vice-chairman, which confirmed in october 2004, designed to occupy the worldwide head of ambition by using a study by 2006 were wrong by monitoring women\'s auto division and video-on-demand services unit were later fell and bombardier started the emperor of other new aircraft seating 110 to whisk passengers . paul oaten had announced he aspired to be within the year\'s top car maker and third largest maker of the aircraft and the euro was announced . but he has not concerned with a bigg

In [70]:
import textwrap
# Wrap the spun text at textwrap's default width so it is readable.
# replace_whitespace=False leaves existing whitespace characters as they are;
# fix_sentence_endings=True asks textwrap to normalise the spacing after the
# sentence endings it detects.
print(textwrap.fill(new_doc, replace_whitespace=False, fix_sentence_endings=True))

bombardier chief to leave company collapsed in rome and plane-making
giant bombardier have access to a five-week low at the creation of its
chief executive and several thirds of key case . paul richards, it is
also it's business," the company amid an ongoing evaluation . laurent
beaudoin, east of a numbers that avoided the uk firm, will appear in
the option of 1985 faces a newly created management systems . it said
the resignations seem to have come from a recent dispute . "mr
makarov's world at the vice-chairman, which confirmed in october 2004,
designed to occupy the worldwide head of ambition by using a study by
2006 were wrong by monitoring women's auto division and video-on-
demand services unit were later fell and bombardier started the
emperor of other new aircraft seating 110 to whisk passengers . paul
oaten had announced he aspired to be within the year's top car maker
and third largest maker of the aircraft and the euro was announced .
but he has not concerned with a bigger s