# Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Cap the number of DataFrame columns pandas renders in cell output
pd.set_option('display.max_columns', 10)

# Data and EDA

In [3]:
# Load the poem dataset; the path is relative, so the CSV must be in the working directory
data = pd.read_csv('PoemsData.csv')

In [4]:
# Preview the first rows to check the column layout (label columns plus the raw `poem` text)
data.head()

Unnamed: 0,activities,art_and_sciences,living,love,mythology_and_folklore,nature,poem,relationships,religion,social_commentaries
0,1.0,0.0,0.0,1.0,0.0,0.0,15 Years of Paris\r\nBy Katy Bohinc\r\nBy Katy...,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,"[lady in red] ""at 4:30 AM""\r\nBy Ntozake Shang...",0.0,0.0,1.0
2,0.0,1.0,1.0,1.0,0.0,1.0,geography test\r\nBy Saaro Umar\r\nBy Saaro Um...,1.0,0.0,1.0
3,1.0,1.0,0.0,1.0,0.0,1.0,From “Celestial House”\r\nBy Victoria Martinez...,1.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,Ode to Black Skin\r\nBy Ashanti Anderson\r\nBy...,0.0,1.0,1.0


In [13]:
# Show (rows, columns) of the loaded frame
print('Dataset Shape {}'.format(data.shape))

Dataset Shape (455, 10)


In [15]:
# Every column except the raw text is treated as a target label
y_cols = list(data.columns)
y_cols.remove('poem')
print('Target Classes:\n {}'.format(y_cols))

Target Classes:
 ['activities', 'art_and_sciences', 'living', 'love', 'mythology_and_folklore', 'nature', 'relationships', 'religion', 'social_commentaries']


In [18]:
def insight(y, df=None):
    """Print how many poems carry label ``y`` and what share of the dataset that is.

    Parameters
    ----------
    y : str
        Name of a label column; its values are summed to count positives.
    df : pandas.DataFrame, optional
        Frame to summarise. Defaults to the notebook-level ``data``.
    """
    frame = data if df is None else df
    count = frame[y].sum()
    total = len(frame)
    # Derive the percentage from the actual row count instead of the former
    # hard-coded divisor 4.55 (= 455 rows / 100), which was silently wrong for
    # any dataset of a different size. An empty frame reports 0 %.
    share = 100 * count / total if total else 0.0
    print('There are {} ({} %) poems classified as {}'.format(count, share, y))

In [21]:
# Report the positive count and share for each target label
print('Insights:')
for label in y_cols:
    insight(label)

Insights:
There are 100.0 (21.978021978021978 %) poems classified as activities
There are 100.0 (21.978021978021978 %) poems classified as art_and_sciences
There are 100.0 (21.978021978021978 %) poems classified as living
There are 100.0 (21.978021978021978 %) poems classified as love
There are 100.0 (21.978021978021978 %) poems classified as mythology_and_folklore
There are 100.0 (21.978021978021978 %) poems classified as nature
There are 102.0 (22.41758241758242 %) poems classified as relationships
There are 100.0 (21.978021978021978 %) poems classified as religion
There are 100.0 (21.978021978021978 %) poems classified as social_commentaries


# Data Preprocessing

## Cleaning

In [28]:
import nltk, re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Patterns are compiled once and reused by the helpers below
_NON_WORD = re.compile(r"\W")
_DIGIT = re.compile(r"\d")
_WHITESPACE_RUN = re.compile(r"\s+")


def remove_punc(x):
    """Replace every non-word character (regex ``\\W``) with a space."""
    return _NON_WORD.sub(' ', x)


def remove_num(x):
    """Replace every digit with a space."""
    return _DIGIT.sub(' ', x)


def remove_extra_spaces(x):
    """Collapse each run of whitespace into a single space."""
    return _WHITESPACE_RUN.sub(' ', x)


def lower_case(x):
    """Return ``x`` converted to lower case."""
    return x.lower()

# English stop words from the NLTK corpus, held in a set for fast membership tests
stop_words = set(nltk.corpus.stopwords.words('english'))


def remove_stopwords(x):
    """Drop every whitespace-separated token found in ``stop_words``."""
    kept = [word for word in x.split() if word not in stop_words]
    return ' '.join(kept)


ps = PorterStemmer()


def ps_stem(x):
    """Stem each whitespace-separated token with the Porter stemmer."""
    return ' '.join(map(ps.stem, x.split()))


wnl = WordNetLemmatizer()


def wnl_lemmatize(x):
    """Lemmatize each whitespace-separated token with the WordNet lemmatizer."""
    return ' '.join(map(wnl.lemmatize, x.split()))

def tag_pos(x):
    """Annotate each token of ``x`` with its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is rendered as ``token(TAG)`` followed by a
    space, so a non-empty result ends with a trailing space.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(x))
    return ''.join('{}({}) '.format(token, tag) for token, tag in tagged)

def cleanText(x, rsw, stm, lem, tgps):
    """Normalise a text and optionally apply further NLP steps.

    Always, in this order: non-word characters and digits are replaced by
    spaces, whitespace runs are collapsed, and the text is lower-cased.

    Parameters
    ----------
    x : str
        Raw text.
    rsw : bool
        Remove stop words when truthy.
    stm : bool
        Apply Porter stemming when truthy.
    lem : bool
        Apply WordNet lemmatization when truthy.
    tgps : bool
        Append part-of-speech tags when truthy.

    Returns
    -------
    str
        The processed text.
    """
    # Mandatory normalisation steps
    for step in (remove_punc, remove_num, remove_extra_spaces, lower_case):
        x = step(x)

    # Optional steps, applied in a fixed order when their flag is set
    optional_steps = (
        (rsw, remove_stopwords),
        (stm, ps_stem),
        (lem, wnl_lemmatize),
        (tgps, tag_pos),
    )
    for enabled, step in optional_steps:
        if enabled:
            x = step(x)
    return x

In [29]:
# Clean every poem: stop-word removal and lemmatization on, stemming and POS tagging off.
# The raw text in `data['poem']` is overwritten by the cleaned version.
data['poem'] = data['poem'].apply(
    lambda poem: cleanText(poem, rsw=True, stm=False, lem=True, tgps=False)
)

## TF-IDF Vectorization

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# Fit the TF-IDF vocabulary/weights and build the document-term matrix in one call.
# NOTE(review): the vectorizer is fitted on all poems before cross-validation, so its
# statistics also cover the held-out folds — consider fitting inside each CV split.
vectorizer = TfidfVectorizer()
poem_Vect = vectorizer.fit_transform(data['poem'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


# Machine Learning

In [36]:
from sklearn.model_selection import cross_val_score

## Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Score a fresh LogisticRegression per label column with 10-fold cross-validation
# on the TF-IDF matrix, printing mean accuracy and its standard deviation.
# NOTE(review): the recorded accuracy (~0.78) matches the share of negative rows
# (~22 % positives per label in the EDA output), so the model may only be predicting
# the majority class — consider an imbalance-aware metric.
for label in y_cols:
    print(label)
    fold_scores = cross_val_score(LogisticRegression(), poem_Vect, data[label], cv=10)
    summary = 'Accuracy: %0.2f (+/- %0.2f)' % (fold_scores.mean(), fold_scores.std())
    print(summary)
    print()

activities
Accuracy: 0.78 (+/- 0.00)

art_and_sciences
Accuracy: 0.78 (+/- 0.00)

living
Accuracy: 0.78 (+/- 0.00)

love
Accuracy: 0.78 (+/- 0.00)

mythology_and_folklore
Accuracy: 0.78 (+/- 0.00)

nature
Accuracy: 0.78 (+/- 0.00)

relationships
Accuracy: 0.78 (+/- 0.01)

religion
Accuracy: 0.78 (+/- 0.00)

social_commentaries
Accuracy: 0.78 (+/- 0.00)



## Multinomial Naive Bayes

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [45]:
# Score a fresh MultinomialNB per label column with 10-fold cross-validation
# on the TF-IDF matrix, printing mean accuracy and its standard deviation.
# NOTE(review): the recorded accuracy (~0.78) matches the share of negative rows
# (~22 % positives per label in the EDA output), so the model may only be predicting
# the majority class — consider an imbalance-aware metric.
for label in y_cols:
    print(label)
    fold_scores = cross_val_score(MultinomialNB(), poem_Vect, data[label], cv=10)
    summary = 'Accuracy: %0.2f (+/- %0.2f)' % (fold_scores.mean(), fold_scores.std())
    print(summary)
    print()

activities
Accuracy: 0.78 (+/- 0.00)

art_and_sciences
Accuracy: 0.78 (+/- 0.00)

living
Accuracy: 0.78 (+/- 0.00)

love
Accuracy: 0.78 (+/- 0.00)

mythology_and_folklore
Accuracy: 0.78 (+/- 0.00)

nature
Accuracy: 0.78 (+/- 0.00)

relationships
Accuracy: 0.78 (+/- 0.01)

religion
Accuracy: 0.78 (+/- 0.00)

social_commentaries
Accuracy: 0.78 (+/- 0.00)



## Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Score a RandomForestClassifier per label column with 10-fold cross-validation
# on the TF-IDF matrix, printing mean accuracy and its standard deviation.
for y in y_cols:
    print(y)
    # A fixed seed makes the forest's randomness repeatable, so the reported
    # scores can be reproduced after a kernel restart.
    model = RandomForestClassifier(random_state=42)
    scores = cross_val_score(model, poem_Vect, data[y], cv=10)
    print('Accuracy: %0.2f (+/- %0.2f)' %(scores.mean(), scores.std()))
    print()

activities
Accuracy: 0.77 (+/- 0.02)

art_and_sciences
Accuracy: 0.77 (+/- 0.01)

living
Accuracy: 0.79 (+/- 0.01)

love
Accuracy: 0.79 (+/- 0.03)

mythology_and_folklore
Accuracy: 0.78 (+/- 0.01)

nature
Accuracy: 0.78 (+/- 0.02)

relationships
Accuracy: 0.78 (+/- 0.01)

religion
Accuracy: 0.78 (+/- 0.01)

social_commentaries
Accuracy: 0.78 (+/- 0.01)



## SGD Classifier

In [48]:
from sklearn.linear_model import SGDClassifier

In [49]:
# Score an SGDClassifier per label column with 10-fold cross-validation
# on the TF-IDF matrix, printing mean accuracy and its standard deviation.
for y in y_cols:
    print(y)
    # SGD shuffles the training data between epochs; a fixed seed makes that
    # shuffling, and therefore the reported scores, reproducible across runs.
    model = SGDClassifier(random_state=42)
    scores = cross_val_score(model, poem_Vect, data[y], cv=10)
    print('Accuracy: %0.2f (+/- %0.2f)' %(scores.mean(), scores.std()))
    print()

activities




Accuracy: 0.77 (+/- 0.03)

art_and_sciences
Accuracy: 0.77 (+/- 0.01)

living
Accuracy: 0.80 (+/- 0.02)

love




Accuracy: 0.79 (+/- 0.04)

mythology_and_folklore
Accuracy: 0.78 (+/- 0.03)

nature
Accuracy: 0.78 (+/- 0.04)

relationships




Accuracy: 0.78 (+/- 0.03)

religion
Accuracy: 0.79 (+/- 0.03)

social_commentaries
Accuracy: 0.78 (+/- 0.03)



