# Word Embeddings for Text

In [2]:
# Importing required packages
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from gensim.models import word2vec

In [3]:
# Setting the working directory
os.chdir('E:/Capstone IATH/Code')

## Reading and Preparing Data

In [22]:
# Loading the data
bess_data = pd.read_csv('CBW_Bess_tags_final2.csv', encoding='ISO-8859-1', low_memory=False)
text_features = pd.read_csv('text_features.csv', encoding='ISO-8859-1', low_memory=False)
text_topics = pd.read_csv('text_data_sentence_wtopics.csv', encoding='ISO-8859-1', low_memory=False)

In [23]:
#Creating unique identifier for each row: CollectionID + BiographyID + ParagraphNo
text_features['key'] = text_features['CollectionID'].astype(str) + '_' + text_features['BiographyID'].astype(str) +'_' + text_features['ParagraphNo'].astype(str)

# Using sentiment, emotion, topics as features along with text
text_features.drop(columns=['CollectionID', 'BiographyID', 'ParagraphNo', 'Anatomy', 'Award', 'Company', 'Date', 'Drug', 'Measure', 'Movie', 'Number', 'Quantity', 'Sport', 'SportingEvent', 'TelevisionShow', 'Time', 'Vehicle', 'Crime', 'Facility', 'GeographicFeature', 'HealthCondition', 'JobTitle', 'Location', 'Organization', 'Person', 'PrintMedia'], inplace=True)

# Rearranging the columns
text_features = text_features[['key', 'ParagraphText', 'score', 'sentiment', 'sadness', 'joy', 'fear', 'disgust', 'anger']]

sentiment = pd.get_dummies(text_features['sentiment'])
text_features = pd.concat([text_features, sentiment], axis=1)
text_features.drop(columns=['sentiment'], inplace=True)

# Adding topics
text_features['Topic 0'] = text_topics['Topic 0']
text_features['Topic 1'] = text_topics['Topic 1']
text_features['Topic 2'] = text_topics['Topic 2']
text_features['Topic 3'] = text_topics['Topic 3']
text_features['Topic 4'] = text_topics['Topic 4']
text_features['Topic 5'] = text_topics['Topic 5']

In [24]:
#Preparing the data for stage of life classification
#Converting bess_data so that each row is a para in a bio
#Taking into stage of life for now
stage_of_life = bess_data[bess_data.Type == 'stageOfLife']
stage_of_life['para no'] = stage_of_life['para no'].astype(int)
stage_of_life.rename(index=str, columns={"collectionID": "CollectionID", "biographyID": "BiographyID", "para no": "ParagraphNo", "Content": "StageOfLife"}, inplace=True)

#Creating unique identifier for each row: CollectionID + BiographyID + ParagraphNo
stage_of_life['key'] = stage_of_life['CollectionID'].astype(str) + '_' + stage_of_life['BiographyID'].astype(str) +'_' + stage_of_life['ParagraphNo'].astype(str)

#Removing other columns
stage_of_life.drop(columns=['Event', 'Type', 'URI', 'author', 'personaName', 'title', 'CollectionID', 'BiographyID', 'ParagraphNo'], inplace=True)

stage_of_life = stage_of_life[['key', 'StageOfLife']]
stage_of_life.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [25]:
stage_of_life.StageOfLife.value_counts()

middle         8459
culmination    2695
beginning      2365
end            1019
after           777
before          213
Name: StageOfLife, dtype: int64

In [26]:
# Preparing the response variable
stage_of_life['StageOfLife'] = stage_of_life['StageOfLife'].astype('category')
stage = pd.get_dummies(stage_of_life['StageOfLife'])
stage_of_life = pd.concat([stage_of_life, stage], axis=1)
stage_of_life.drop(columns=['StageOfLife'], inplace=True)
stage_of_life.drop_duplicates(inplace=True)
stage_of_life = stage_of_life.groupby(['key'])['after', 'before', 'beginning', 'culmination', 'end', 'middle'].sum()
stage_of_life['key'] = stage_of_life.index
stage_of_life = stage_of_life[['key', 'after', 'before', 'beginning', 'culmination', 'end', 'middle']]
stage_of_life.reset_index(drop=True, inplace=True)

In [27]:
text_bess = pd.merge(text_features, stage_of_life, how='right', on=['key'])
text_bess.fillna(0, inplace=True)

## Text Preprocessing

In [28]:
#Text Processing
text_bess['ParagraphText'] = text_bess['ParagraphText'].astype(str)

#Converting to lower case
text_bess['ParagraphText'] = text_bess['ParagraphText'].str.lower()

#Tokenizing the text in the data
wpt = nltk.WordPunctTokenizer()
text_bess['tokenized'] = text_bess['ParagraphText'].apply(lambda x: wpt.tokenize(x))

In [29]:
#Keeping only words
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [i for i in x if i.isalpha()])

#Removing stop words
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [i for i in x if not i in stop_words])

In [30]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text_bess['tokenized'] = text_bess['tokenized'].apply(lambda x: [wordnet_lemmatizer.lemmatize(i) for i in x])

## Creating word embeddings

In [31]:
#Creating word embeddings for train and test data
we = word2vec.Word2Vec(text_bess['tokenized'], size=300, min_count=20)

In [32]:
def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

In [33]:
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(we.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [34]:
#Word embedded features for train and test
feature_array = averaged_word_vectorizer(corpus=text_bess['tokenized'], model=we,
                                             num_features=300)
wefeatures = pd.DataFrame(feature_array)

  if __name__ == '__main__':


In [35]:
#Final data
final_data = pd.concat([wefeatures, text_bess], axis=1)

In [38]:
final_data.drop(columns = ['key', 'ParagraphText', 'tokenized'], inplace=True)

In [40]:
final_data.shape

(14923, 321)

## Classification Models

In [41]:
# Features and response
X = final_data.iloc[:,0:315]
Y = final_data.iloc[:,315:321]

In [42]:
Y.columns

Index(['after', 'before', 'beginning', 'culmination', 'end', 'middle'], dtype='object')

In [43]:
# Seperate models for each stage of life (binary classification)
Y_1 = Y['after']
Y_2 = Y['before']
Y_3 = Y['beginning']
Y_4 = Y['culmination']
Y_5 = Y['end']
Y_6 = Y['middle']

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Random Forest Model

In [45]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

In [46]:
rf_model = RandomForestClassifier(n_estimators=1000)

In [47]:
# classification model for after
x_train, x_test, y_train, y_test = train_test_split(X, Y_1, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_after = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_after))

0.9504187604690117


In [48]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_after).ravel()
(tn, fp, fn, tp)

(2835, 2, 146, 2)

In [49]:
# classification model for before
x_train, x_test, y_train, y_test = train_test_split(X, Y_2, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_before = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_before))

0.9899497487437185


In [50]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_before).ravel()
(tn, fp, fn, tp)

(2955, 0, 30, 0)

In [51]:
# classification model for beginning
x_train, x_test, y_train, y_test = train_test_split(X, Y_3, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_beginning = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_beginning))

0.8559463986599665


In [52]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_beginning).ravel()
(tn, fp, fn, tp)

(2502, 28, 402, 53)

In [53]:
# classification model for culmination
x_train, x_test, y_train, y_test = train_test_split(X, Y_4, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_culmination = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_culmination))

0.847571189279732


In [54]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_culmination).ravel()
(tn, fp, fn, tp)

(2455, 15, 440, 75)

In [55]:
# classification model for end
x_train, x_test, y_train, y_test = train_test_split(X, Y_5, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_end = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_end))

0.9463986599664992


In [56]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_end).ravel()
(tn, fp, fn, tp)

(2805, 3, 157, 20)

In [57]:
# classification model for middle
x_train, x_test, y_train, y_test = train_test_split(X, Y_6, test_size = 0.2, random_state = 0)
rf_model.fit(x_train, y_train)
predictions_middle = rf_model.predict(x_test)
print(accuracy_score(y_test, predictions_middle))

0.6422110552763819


In [58]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_middle).ravel()
(tn, fp, fn, tp)

(526, 675, 393, 1391)

## Naive Bayes Model

In [59]:
from sklearn.naive_bayes import GaussianNB

In [60]:
nb_model = GaussianNB()

In [61]:
# classification model for after
x_train, x_test, y_train, y_test = train_test_split(X, Y_1, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_after = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_after))

0.7581239530988275


In [62]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_after).ravel()
(tn, fp, fn, tp)

(2201, 636, 86, 62)

In [63]:
# classification model for before
x_train, x_test, y_train, y_test = train_test_split(X, Y_2, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_before = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_before))

0.6026800670016751


In [64]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_before).ravel()
(tn, fp, fn, tp)

(1774, 1181, 5, 25)

In [65]:
# classification model for beginning
x_train, x_test, y_train, y_test = train_test_split(X, Y_3, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_beginning = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_beginning))

0.6636515912897822


In [66]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_beginning).ravel()
(tn, fp, fn, tp)

(1733, 797, 207, 248)

In [67]:
# classification model for culmination
x_train, x_test, y_train, y_test = train_test_split(X, Y_4, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_culmination = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_culmination))

0.5748743718592965


In [68]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_culmination).ravel()
(tn, fp, fn, tp)

(1437, 1033, 236, 279)

In [69]:
# classification model for end
x_train, x_test, y_train, y_test = train_test_split(X, Y_5, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_end = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_end))

0.7675041876046901


In [70]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_end).ravel()
(tn, fp, fn, tp)

(2213, 595, 99, 78)

In [71]:
# classification model for middle
x_train, x_test, y_train, y_test = train_test_split(X, Y_6, test_size = 0.2, random_state = 0)
nb_model.fit(x_train, y_train)
predictions_middle = nb_model.predict(x_test)
print(accuracy_score(y_test, predictions_middle))

0.5916247906197655


In [72]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions_end).ravel()
(tn, fp, fn, tp)

(883, 318, 1429, 355)

## Linear Support Vector Model

In [73]:
# Linear SVC with SGD
from sklearn.linear_model import SGDClassifier

In [74]:
sv_model = SGDClassifier(max_iter=1000, tol=1e-3)

In [75]:
# classification model for after
x_train, x_test, y_train, y_test = train_test_split(X, Y_1, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_after = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_after))

0.9504187604690117


In [76]:
# classification model for before
x_train, x_test, y_train, y_test = train_test_split(X, Y_2, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_before = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_before))

0.9899497487437185


In [77]:
# classification model for beginning
x_train, x_test, y_train, y_test = train_test_split(X, Y_3, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_beginning = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_beginning))

0.847571189279732


In [78]:
# classification model for culmination
x_train, x_test, y_train, y_test = train_test_split(X, Y_4, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_culmination = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_culmination))

0.8274706867671692


In [79]:
# classification model for end
x_train, x_test, y_train, y_test = train_test_split(X, Y_5, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_end = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_end))

0.9407035175879397


In [80]:
# classification model for middle
x_train, x_test, y_train, y_test = train_test_split(X, Y_6, test_size = 0.2, random_state = 0)
sv_model.fit(x_train, y_train)
predictions_middle = sv_model.predict(x_test)
print(accuracy_score(y_test, predictions_middle))

0.6093802345058627
