In [211]:
import numpy as np
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
from collections import Counter
import pickle

In [212]:
# Load the headlines CSV from the working directory.
d = pd.read_csv('./abcnews-date-text.csv')
data = d  # full frame; use d[-10000:] here instead for a quick run on the last 10,000 rows

In [213]:
# Row count of the loaded frame.
len(data)

1103665

In [221]:
# Peek at the last rows to check the columns (publish_date, headline_text).
data.tail()

Unnamed: 0,publish_date,headline_text
1103660,20171231,the ashes smiths warners near miss liven up bo...
1103661,20171231,timelapse: brisbanes new year fireworks
1103662,20171231,what 2017 meant to the kids of australia
1103663,20171231,what the papodopoulos meeting may mean for ausus
1103664,20171231,who is george papadopoulos the former trump ca...


In [215]:
class HeadlineGenerator(object):
    """Second-order Markov chain over headline tokens.

    After ``fit`` (or ``load``) the model holds three plain-dict tables:

    * ``_pi``     -- ``{first_word: probability}`` for the first token.
    * ``_second`` -- ``{first_word: {second_word: probability}}``.
    * ``_a``      -- ``{(word_i, word_j): {next_word: probability}}``.

    The sentinel token ``"<END>"`` marks the end of a headline in
    ``_second`` (single-token headlines) and ``_a``.
    """

    END_TOKEN = "<END>"

    def save(self, file_name):
        """Pickle the three probability tables to ``file_name``."""
        with open(file_name, "wb") as f:
            pickle.dump({"pi": self._pi, "second": self._second, "a": self._a}, f)

    def load(self, file_name):
        """Restore the probability tables written by ``save``.

        WARNING: ``pickle.load`` can execute arbitrary code; only load files
        that come from a trusted source.
        """
        with open(file_name, "rb") as f:
            params = pickle.load(f)
        self._pi = params["pi"]
        self._second = params["second"]
        self._a = params["a"]

    @staticmethod
    def _normalize(counts):
        """Turn a ``{token: count}`` mapping into ``{token: probability}``.

        Key order is preserved. An empty mapping yields an empty dict.
        """
        total = sum(counts.values())
        return {token: count / total for token, count in counts.items()}

    def _probabilities(self, tokens):
        """Return the relative frequency of each token in ``tokens``.

        Keys appear in order of first occurrence; an empty sequence gives ``{}``.
        """
        return self._normalize(Counter(tokens))

    def fit(self, headlines, tokenizer=None):
        """Estimate the transition tables from an iterable of headline strings.

        Parameters
        ----------
        headlines : iterable of str
            Raw headlines. Headlines that tokenize to nothing are skipped and
            do not count towards the first-word distribution.
        tokenizer : callable, optional
            Maps one headline to a sequence of tokens. Defaults to the
            ``wordpunct_tokenize`` imported at the top of the notebook.
        """
        if tokenizer is None:
            tokenizer = wordpunct_tokenize
        end = self.END_TOKEN

        # Counters instead of ever-growing lists: same key order and same
        # final probabilities, without re-copying a list on every append.
        initial = Counter()
        second = {}
        transitions = {}
        n_used = 0

        for headline in headlines:
            tokens = list(tokenizer(headline))
            if not tokens:
                # Nothing to learn from; indexing tokens[0] would fail.
                continue
            n_used += 1

            word0 = tokens[0]
            initial[word0] += 1

            if len(tokens) == 1:
                second.setdefault(word0, Counter())[end] += 1
                continue

            word1 = tokens[1]
            second.setdefault(word0, Counter())[word1] += 1

            # Slide a two-token window along the rest of the headline.
            for word in tokens[2:]:
                transitions.setdefault((word0, word1), Counter())[word] += 1
                word0, word1 = word1, word
            transitions.setdefault((word0, word1), Counter())[end] += 1

        self._pi = {word: count / n_used for word, count in initial.items()}
        self._second = {state: self._normalize(c) for state, c in second.items()}
        self._a = {state: self._normalize(c) for state, c in transitions.items()}

    def _get_random_token(self, d, rng=None):
        """Sample one key of ``d`` with probability given by its value.

        ``rng`` is anything exposing ``choice`` like ``numpy.random.RandomState``;
        when omitted the global ``numpy.random`` module is used.
        """
        if rng is None:
            rng = np.random
        keys = list(d)
        # Draw an index rather than a key so the key list is never converted
        # to a NumPy string array and the returned token keeps its own type.
        return keys[rng.choice(len(keys), p=list(d.values()))]

    def generate(self, random_state):
        """Generate one headline; identical ``random_state`` gives identical text.

        A private ``RandomState`` seeded with ``random_state`` is used, so the
        global NumPy random state is left untouched.
        """
        rng = np.random.RandomState(random_state)

        words = [self._get_random_token(self._pi, rng)]
        words.append(self._get_random_token(self._second[words[0]], rng))

        while words[-1] != self.END_TOKEN:
            prev_words = (words[-2], words[-1])
            words.append(self._get_random_token(self._a[prev_words], rng))

        # Drop the trailing end-of-headline sentinel.
        return ' '.join(words[:-1])
            

In [216]:
# Fit the model on the headline_text column and pickle its tables to disk.
generator = HeadlineGenerator()
generator.fit(data.headline_text)
generator.save("markov_model.p")

In [219]:
# Rebind `generator` to a fresh instance populated from the pickle instead of refitting.
# NOTE: load() uses pickle.load, so only point it at files you trust.
generator = HeadlineGenerator()
generator.load("markov_model.p")

In [220]:
# random_state is used as the RNG seed inside generate().
generator.generate(random_state = 4200000)

'freo fans prepare for clarence council to reduce wrongful convictions under abolished laws'