# Poem Classification Problem

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
import re
from bs4 import BeautifulSoup

warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('Poem_classification - train_data.csv')
test = pd.read_csv('Poem_classification - test_data.csv')

# Check the duplicates and null values

In [6]:
print(f"Train duplicates: {train.duplicated().sum()}")
print(f"Test duplicates: {test.duplicated().sum()}")

Train duplicates: 4
Test duplicates: 0


In [8]:
print(f"Train nulls:\n {train.isnull().sum()}")
print(f"Test nulls:\n {test.isnull().sum()}")

Train nulls:
 Genre    0
Poem     4
dtype: int64
Test nulls:
 Genre    0
Poem     0
dtype: int64


In [10]:
# Drop the duplicates and null values
train.drop_duplicates(inplace=True)
train.dropna(inplace=True)

In [13]:
train.head(2)

Unnamed: 0,Genre,Poem
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...


# Preprocessing

In [16]:
from nltk.corpus import stopwords

In [18]:
# Lower the text
train['Poem'] = train['Poem'].apply(lambda x: x.lower())
test['Poem'] = test['Poem'].apply(lambda x: x.lower())

# Remove special characters
train['Poem'] = train['Poem'].apply(lambda x: re.sub(r'[^a-z A-z 0-9]+', '', x))
test['Poem'] = test['Poem'].apply(lambda x: re.sub(r'[^a-z A-z 0-9]+', '', x))

# Remove stopwords 
train['Poem'] = train['Poem'].apply(lambda x: " ".join([i for i in x.split() if i not in stopwords.words('english')]))
test['Poem'] = test['Poem'].apply(lambda x: " ".join([i for i in x.split() if i not in stopwords.words('english')]))

# Remove URLs 
train['Poem'] = train['Poem'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))
test['Poem'] = test['Poem'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))

# Remove HTML
train['Poem'] = train['Poem'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
test['Poem'] = test['Poem'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

# Remove Extra spaces
train['Poem'] = train['Poem'].apply(lambda x: " ".join(x.split()))
test['Poem'] = test['Poem'].apply(lambda x: " ".join(x.split()))



# Lemmatizer


In [22]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def text_lemma(text):
    return " ".join([lemmatizer.lemmatize(i) for i in text.split()])

In [24]:
train['Poem'] = train['Poem'].apply(lambda x: text_lemma(x))
test['Poem'] = test['Poem'].apply(lambda x: text_lemma(x))

# Encoding

In [27]:
train['Genre'].unique()

array(['Music', 'Death', 'Affection', 'Environment'], dtype=object)

In [29]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories=[['Music', 'Death', 'Affection', 'Environment']])
train['Genre'] = oe.fit_transform(train[['Genre']])
test['Genre'] = oe.transform(test[['Genre']])

In [31]:
train['Genre'] = train['Genre'].astype(int)
test['Genre'] = test['Genre'].astype(int)

# Train test split

In [36]:
X_train = train.drop(columns='Genre')
y_train = train['Genre']
X_test = test.drop(columns='Genre')
y_test = test['Genre']

In [38]:
X_train = X_train['Poem'].to_list()
X_test = X_test['Poem'].to_list()

# Word2Vec

In [41]:
from gensim.models import Word2Vec

model = Word2Vec(X_train)

In [44]:
import pickle

with open('word2vec.model', 'rb') as f:
    wv2 = pickle.load(f)

In [47]:
from tqdm import tqdm

def avg_w2v(sent, model):
    vectors = [model.wv[word] for word in sent if word in model.wv.index_to_key]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [49]:
X_train_vec = np.array([avg_w2v(sent, model) for sent in tqdm(X_train)])
X_test_vec = np.array([avg_w2v(sent, model) for sent in tqdm(X_test)])

100%|███████████████████████████████████████| 835/835 [00:00<00:00, 8481.53it/s]
100%|██████████████████████████████████████| 150/150 [00:00<00:00, 10012.02it/s]


In [63]:
# GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

gnb = GaussianNB()
gnb.fit(X_train_vec, y_train)
y_pred = gnb.predict(X_test_vec)
accuracy_score(y_test, y_pred)

0.44666666666666666