In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk as nl
from sklearn import naive_bayes
from nltk.classify.scikitlearn import SklearnClassifier


nl.download('punkt')
nl.download('averaged_perceptron_tagger')
nl.download('cmudict')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sava\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sava\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Sava\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [2]:
df = pd.read_csv('data\poems\haiku.csv')
df = df.tail(10000)

df = df[['0', '1', '2']]

# get the CMU Pronouncing Dictionary
transcr = nl.corpus.cmudict.dict()

# concatenate data of all columns into one column and word tokenize
df['0'] = df['0'].apply(lambda x: [w.lower() for w in nl.tokenize.word_tokenize(x) if w.isalpha()])
df['1'] = df['1'].apply(lambda x: [w.lower() for w in nl.tokenize.word_tokenize(x) if w.isalpha()])
df['2'] = df['2'].apply(lambda x: [w.lower() for w in nl.tokenize.word_tokenize(x) if w.isalpha()])

def get_syllables(word):
    if word in transcr:
        pron = transcr[word][0]

        # get the number of syllables in the word
        syllable_count = sum(y.isdigit() for x in pron for y in x)
        return syllable_count
    return 0

# check the amount of syllables in the sentence
for ind, row in df.iterrows():
    sentence_syl_count = sum(get_syllables(w) for w in row['0'])
    sentence_syl_count += sum(get_syllables(w) for w in row['1'])
    sentence_syl_count += sum(get_syllables(w) for w in row['2'])

    if(sentence_syl_count != 17):
        df = df.drop(ind)  # haiku must have 17 syllables

print(df.shape)

print(df.head())
# tag the tokenized sentences

df['joined_sents'] = df.apply(lambda x: [*x['0'], *x['1'], *x['2']], axis=1)
df['sent_len'] = df.apply(lambda x: [len(x['0']), len(x['1']), len(x['2'])], axis=1)


df['tagged_sents'] = df['joined_sents'].apply(lambda x: nl.pos_tag(x))

df.head(5)

(8472, 3)
                            0                                        1  \
134123   [i, have, a, really]        [cute, outfit, for, tonight, but]   
134124   [we, can, hold, our]      [hands, together, through, the, oh]   
134125   [you, my, hero, man]  [get, well, soon, i, loved, you, since]   
134126  [my, biggest, guilty]           [pleasure, is, probably, punk]   
134127  [yes, daddy, tie, me]    [up, choke, me, to, dead, and, leave]   

                                   2  
134123       [i, can, not, find, it]  
134124    [oh, through, the, oh, oh]  
134125  [here, in, the, real, world]  
134126      [covers, of, pop, songs]  
134127           [my, body, to, rot]  


Unnamed: 0,0,1,2,joined_sents,sent_len,tagged_sents
134123,"[i, have, a, really]","[cute, outfit, for, tonight, but]","[i, can, not, find, it]","[i, have, a, really, cute, outfit, for, tonigh...","[4, 5, 5]","[(i, NNS), (have, VBP), (a, DT), (really, RB),..."
134124,"[we, can, hold, our]","[hands, together, through, the, oh]","[oh, through, the, oh, oh]","[we, can, hold, our, hands, together, through,...","[4, 5, 5]","[(we, PRP), (can, MD), (hold, VB), (our, PRP$)..."
134125,"[you, my, hero, man]","[get, well, soon, i, loved, you, since]","[here, in, the, real, world]","[you, my, hero, man, get, well, soon, i, loved...","[4, 7, 5]","[(you, PRP), (my, PRP$), (hero, NN), (man, NN)..."
134126,"[my, biggest, guilty]","[pleasure, is, probably, punk]","[covers, of, pop, songs]","[my, biggest, guilty, pleasure, is, probably, ...","[3, 4, 4]","[(my, PRP$), (biggest, JJS), (guilty, JJ), (pl..."
134127,"[yes, daddy, tie, me]","[up, choke, me, to, dead, and, leave]","[my, body, to, rot]","[yes, daddy, tie, me, up, choke, me, to, dead,...","[4, 7, 4]","[(yes, RB), (daddy, JJ), (tie, NNS), (me, PRP)..."


In [6]:
def pos_features(sentence, i):
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "pos": tag
    }

    features["syllable_sum"] = sum(get_syllables(word) for word in sentence[:i])

    features.update(get_n_word(sentence, i, -1))
    # features.update(get_n_word(sentence, i, -2))

    return features

def get_n_word(sentence, i, n):
    if i + n < 0:
        return {
            "SOS": True
        }
    return {
        f'{n}:word':sentence[i + n][0],
        f'{n}:word:pos':sentence[i + n][1],
    }
    

def get_next_word(sentence, i):
    if i + 1 >= len(sentence):
        return "<END>"
    return sentence[i+1]

featuresets = []
for ind, row in df.iterrows():
    featuresets.append([])
    for i, (word, tag) in enumerate(row['tagged_sents']):
        featuresets[-1].append((pos_features(row['tagged_sents'], i), get_next_word(row['joined_sents'], i)))

size = int(len(featuresets) * 0.1)

def flatten(t):
    return [item for sublist in t for item in sublist]

f_f = flatten(featuresets)

train_set, test_set = f_f[size:], f_f[:size]
classifier = nl.NaiveBayesClassifier.train(train_set)


In [8]:
classifier2 = SklearnClassifier(naive_bayes.MultinomialNB()).train(train_set)

In [10]:

prev_word = 'you'
postag = nl.tag.pos_tag([prev_word])[0][1]
poem = [(prev_word, postag)]

i = 0
while prev_word != '<END>' and i < 20:
    vector = pos_features(poem, i)
    prev_word =  classifier2.prob_classify(vector).generate()
    postag = nl.tag.pos_tag([prev_word])[0][1]
    poem.append((prev_word, postag))
    i += 1

poem

[('you', 'PRP'), ('i', 'NN'), ('to', 'TO'), ('<END>', 'NN')]

In [11]:
nl.classify.accuracy(classifier2, test_set)


0.1192443919716647

In [60]:
classifier.prob_classify(f_f[0][0]).generate()

'want'