# Syllable Tokenization

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

tqdm.pandas()
set_config(display="diagram")

In [2]:
df = pd.read_csv('../input/modern-renaissance-poetry/all.csv').fillna('')
print(df.shape)
df.head()

(573, 5)


Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [3]:
tokenizer = SyllableTokenizer()
def syllable_tokenizer(text):
    return tokenizer.tokenize(text.lower())

def identity_tokenizer(text):
    return text

# df['content'] = df['content'].progress_apply(syllable_tokenizer)

In [4]:
get_model = lambda: Pipeline([
    ('vectorizer', ColumnTransformer([
        ('author', TfidfVectorizer(use_idf=False, sublinear_tf=True), 'author'),
        ('content', TfidfVectorizer(tokenizer=identity_tokenizer,  lowercase=False), 'content'),
        ('poem name', TfidfVectorizer(sublinear_tf=True), 'poem name')
    ])),
    ('estimator', LogisticRegression(max_iter=100_000, random_state=19))
])
get_model()

In [5]:
x = df[['author', 'content', 'poem name']]
y = df['age']

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=19)
scores = []
for train_index, valid_index in tqdm(skf.split(x, y), total=10):
    x_train, x_valid = x.iloc[train_index], x.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    model = get_model().fit(x_train, y_train)
    scores.append(model.score(x_valid, y_valid))
print(np.mean(scores))

  0%|          | 0/10 [00:00<?, ?it/s]

0.9912885662431943
