# Classification (BESS Tag - Stage of Life)

In [2]:
#Importing required packages
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from gensim.models import word2vec

In [3]:
#Setting the working directory
os.chdir('E:/Capstone IATH/Code')

In [4]:
#Loading the data
bess_data = pd.read_csv('CBW_Bess_tags_final2.csv', encoding='ISO-8859-1', low_memory=False)
text_data = pd.read_csv('textdatanew.csv', encoding='ISO-8859-1', low_memory=False)
text_features = pd.read_csv('text_features.csv', encoding='ISO-8859-1', low_memory=False)

In [5]:
#Count of each class in Stage of Life bess tag
bess_data[bess_data.Type == 'stageOfLife'].Content.value_counts()

middle         8591
culmination    2707
beginning      2374
end            1021
after           780
before          219
Name: Content, dtype: int64

In [6]:
#Preparing the data for stage of life classification
#Converting bess_data so that each row is a para in a bio
#Taking into stage of life for now
stage_of_life = bess_data[bess_data.Type == 'stageOfLife']

In [7]:
#Removing other columns
stage_of_life.drop(columns=['Event', 'Type', 'URI', 'author', 'personaName', 'title'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [8]:
stage_of_life.rename(index=str, columns={"collectionID": "CollectionID", "biographyID": "BiographyID", "para no": "ParagraphNo", "Content": "StageOfLife"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [9]:
stage_of_life.columns

Index(['StageOfLife', 'ParagraphNo', 'BiographyID', 'CollectionID'], dtype='object')

In [10]:
stage_of_life = stage_of_life[['CollectionID', 'BiographyID', 'ParagraphNo', 'StageOfLife']]

In [11]:
stage_of_life.loc[:,'values'] = 1

In [12]:
stage_of_life = stage_of_life.pivot_table(index=['CollectionID', 'BiographyID', 'ParagraphNo'], columns = ['StageOfLife'], values = 'values').reset_index()

In [13]:
text_bess = pd.merge(text_data, stage_of_life, how='left', on=['CollectionID', 'BiographyID', 'ParagraphNo'])

In [14]:
text_bess.fillna(0, inplace=True)

In [15]:
#Text Processing

#Converting to lower case
text_bess['ParagraphText'] = text_bess['ParagraphText'].str.lower()

#Tokenizing the text in the data
wpt = nltk.WordPunctTokenizer()
text_bess['tokenized'] = text_bess['ParagraphText'].apply(lambda x: wpt.tokenize(x))

In [16]:
#Keeping only words
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [i for i in x if i.isalpha()])

#Removing stop words
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [i for i in x if not i in stop_words])

In [17]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [wordnet_lemmatizer.lemmatize(i) for i in x])

In [18]:
#Creating word embeddings for train and test data
we = word2vec.Word2Vec(text_bess['tokenized'], size=300, min_count=10)

In [19]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

In [20]:
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(we.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [21]:
#Word embedded features for train and test
feature_array = averaged_word_vectorizer(corpus=text_bess['tokenized'], model=we,
                                             num_features=300)
wefeatures = pd.DataFrame(feature_array)

  if __name__ == '__main__':


In [22]:
#Final data
final_data = pd.concat([wefeatures, text_bess], axis=1)

In [23]:
final_data.drop(columns = ['CollectionID', 'BiographyID', 'ParagraphNo', 'ParagraphText', 'tokenized'], inplace=True)

# Classification Models

In [24]:
# Features and response
X = wefeatures
Y = final_data.iloc[:,300:306]

In [25]:
Y.columns

Index(['after', 'before', 'beginning', 'culmination', 'end', 'middle'], dtype='object')

In [26]:
# Seperate models for each stage of life (binary classification)
Y_1 = Y['after']
Y_2 = Y['before']
Y_3 = Y['beginning']
Y_4 = Y['culmination']
Y_5 = Y['end']
Y_6 = Y['middle']

In [30]:
# Random Forest Model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [31]:
rf_model = RandomForestClassifier(n_estimators=1000)

In [33]:
# classification model for after
x_train, x_test, y_train, y_test = train_test_split(X, Y_1, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_after = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_after))

0.9528245192307693


In [34]:
# classification model for before
x_train, x_test, y_train, y_test = train_test_split(X, Y_2, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_before = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_before))

0.98828125


In [35]:
# classification model for beginning
x_train, x_test, y_train, y_test = train_test_split(X, Y_3, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_beginning = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_beginning))

0.8680889423076923


In [36]:
# classification model for culmination
x_train, x_test, y_train, y_test = train_test_split(X, Y_4, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_culmination = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_culmination))

0.8578725961538461


In [37]:
# classification model for end
x_train, x_test, y_train, y_test = train_test_split(X, Y_5, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_end = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_end))

0.9477163461538461


In [38]:
# classification model for middle
x_train, x_test, y_train, y_test = train_test_split(X, Y_6, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_middle = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_middle))

0.6048677884615384
