In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## 1. Reading input datasets

### 1.1 Reading the text data

In [2]:
text_data_sentence = pd.read_csv('./Files/textdatanew.csv', encoding='ISO-8859-1')
text_data_sentence.head(5)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


### 1.2 Reading the text features

In [3]:
text_features = pd.read_csv("text_features.csv", encoding='ISO-8859-1')
text_features.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,CollectionID,BiographyID,ParagraphNo,sadness,joy,fear,disgust,anger,score,sentiment,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,1,0.255896,0.558011,0.101166,0.111615,0.054668,0.290669,positive,...,,,Cleopatra,,,,,,,
1,a001,bio04,2,0.171629,0.257088,0.173474,0.098726,0.267978,0.0,neutral,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,


### 1.3 Reading the Response file

In [4]:
bess_tags = pd.read_csv('CBW_Bess_tags_final2.csv')
bess_tags.head()

Unnamed: 0,Content,Event,Type,para no,URI,author,biographyID,collectionID,personaName,title
0,after,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
1,culmination,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
2,middle,name,stageOfLife,2.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
3,middle,name,stageOfLife,3.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
4,middle,name,stageOfLife,4.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...


### 2. Preprocessing Data

### 2.1 Getting the top Event types

In [5]:
bess_reponse = bess_tags.loc[:,['Content','Event','Type','para no','biographyID','collectionID']]

bess_reponse= bess_reponse.fillna(' ')

bess_reponse.loc[:,'Response'] = bess_reponse.loc[:,['Content','Event']].apply(lambda x: '_'.join(x),axis = 1)

bess_reponse['Bio_col_id'] = bess_reponse['biographyID'] +"_" + bess_reponse['collectionID']
bess_reponse['Bio_col_para_id'] = bess_reponse['Bio_col_id'] +"_" + bess_reponse['para no'].astype('str')

doc_count = pd.DataFrame(bess_reponse[bess_reponse.Type.isin(['Event'])].\
                         groupby(['Response'])['Bio_col_id'].apply(lambda x: len(np.unique(x))))

#############################################################################
##########TF - IDF Approach to get the top event types ######################
#############################################################################

term_freq = pd.DataFrame(bess_reponse[bess_reponse.Type.isin(['Event'])].\
                            groupby(['Response'])['Bio_col_id'].count())

total_docs = len(bess_reponse['Bio_col_id'].unique())

In [6]:
group_by_counts = pd.concat([term_freq,doc_count],axis = 1)

group_by_counts.columns = ['Term_freq','Doc_freq']
group_by_counts['tf_idf'] = pd.DataFrame(group_by_counts['Term_freq'] * np.log(total_docs/group_by_counts['Doc_freq']) )

group_by_counts.sort_values(['tf_idf'],ascending=False)[0:20]

Unnamed: 0_level_0,Term_freq,Doc_freq,tf_idf
Response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"lover, male, named_agentType",776,75,1091.131263
hospital_locationStructure,617,71,901.378524
"sovereign, male_agentType",655,79,886.959898
"nobleman, named_agentType",853,122,784.388141
husband_agentType,1245,165,768.961337
"royalty, male_agentType",753,111,763.58334
conversation_type,1118,171,650.588288
"nursing, professional_type",281,33,625.808789
famous man_agentType,712,129,615.006161
"officer, military_agentType",540,98,614.853517


In [7]:
# bio_response = pd.DataFrame(bess_reponse.groupby(['Response'])['Bio_col_para_id'].apply(lambda x: len(np.unique(x))))
# bio_response.sort_values(['Bio_col_para_id'],ascending=False).head()

### 2.2 Creating the Respone Variable for the model

In [204]:
### Select the event for building the model
reponse_required = 'royalty, male_agentType'

In [205]:
reponse_required_to_merge = bess_reponse[bess_reponse.Response == reponse_required]

##### Merging the features and the response dataset
text_data_merge = pd.merge(text_data_sentence, reponse_required_to_merge.drop_duplicates(),\
                     how = 'left', left_on=['CollectionID','BiographyID','ParagraphNo'],
                         right_on=['collectionID','biographyID','para no'])


########## Final Data Frame #############
final_data_frame = text_data_merge.loc[:,['ParagraphText','Response']]

final_data_frame['Response_binary'] = np.where(final_data_frame.Response.isnull(),0,1)

##### Distribution of the Response variable

In [207]:
final_data_frame.Response_binary.value_counts()

0    16010
1      627
Name: Response_binary, dtype: int64

### 2.3 Prepocessing the Paragraph Text

##### 2.3.1 StopWord collection

#### Getting stop words - High Frequency and Low Frequency word list

In [208]:
from collections import Counter
from nltk.tokenize import word_tokenize

tokenized_para = final_data_frame.ParagraphText.apply(word_tokenize)

all_sent = [words for each_sent in tokenized_para for words in each_sent]

count_dict = Counter(all_sent)

high_freq_words = [word for (word,count) in count_dict.most_common(500)]

less_freq_words = []
threshold = 5

for k,v in count_dict.items():
    
    if v < threshold:
        less_freq_words.append(k)

        
######### List of all stop words ##########

stop_words = stopwords.words('english')
stop_words.extend(high_freq_words)
stop_words.extend(less_freq_words)

### 3. Model Building

#### 3.1 Creating Training and Test datasets

In [209]:
X_train, X_test, y_train, y_test = train_test_split(final_data_frame.ParagraphText ,final_data_frame.Response_binary,
                                                    test_size = 0.2, random_state = 0)

#### 3.2 Splitting the dataset into two categories

In [210]:
data_bin_1 = X_train[y_train == 1]
data_bin_0 = X_train[y_train == 0]

dict_bin_1_tokens = word_tokenize(' '.join(data_bin_1))
dict_bin_0_tokens = word_tokenize(' '.join(data_bin_0))

### Creating a dictionary for Counter 1
df_bin_1_tokens = pd.DataFrame(dict_bin_1_tokens)
dictionary_bin_1 = Counter(df_bin_1_tokens[~df_bin_1_tokens[0].isin(stop_words)][0].values)

### Creating a dictionary for Counter 1
df_bin_0_tokens = pd.DataFrame(dict_bin_0_tokens)
dictionary_bin_0 = Counter(df_bin_0_tokens[(~(df_bin_0_tokens[0].isin(stop_words)))][0].values)

In [211]:
################ Creating a dataframe of probabilites ######################
vocab_size = sum(dictionary_bin_1.values())
for k,v in dictionary_bin_1.items():
    dictionary_bin_1[k] = v/vocab_size
    
vocab_size_0 = sum(dictionary_bin_0.values())
for k_0,v_0 in dictionary_bin_0.items():
    dictionary_bin_0[k_0] = v_0/vocab_size_0

In [212]:
########### Creating a dictionary of all the words in each of the binary category 1 and 0 ##############

bin_1_value = X_test.apply(lambda x: \
                           sum([dictionary_bin_1[each] for each in word_tokenize(x)]))

bin_0_value = X_test.apply(lambda x: \
                           sum([dictionary_bin_0[each] for each in word_tokenize(x)]))

In [213]:
df_result = pd.DataFrame([bin_1_value,bin_0_value]).T
df_result.columns = ['bin_1_value','bin_0_value']

### 3.3 Language Model Result

In [214]:
keys_1_only = [each for each in dictionary_bin_1 if each not in dictionary_bin_0.keys()]

In [215]:
print("Vocab Size: ",len(keys_1_only) + len(dictionary_bin_0.keys()))

Vocab Size:  16788


In [216]:
(df_result.bin_0_value < df_result.bin_1_value).sum()

1204

###### Accuracy and Distribution

In [217]:
# 100 * (df_result.bin_0_value < df_result.bin_1_value).sum()/len(df_result.bin_1_value)

In [218]:
y_test.value_counts()

0    3194
1     134
Name: Response_binary, dtype: int64

###### Confusion Matrix

In [219]:
from sklearn.metrics import confusion_matrix

preds = (df_result.bin_0_value < df_result.bin_1_value).astype('int')

keys_1_only = [each for each in dictionary_bin_1 if each not in dictionary_bin_0.keys()]

print("Vocab Size: ", len(keys_1_only) + len(dictionary_bin_0.keys()))
print("Accuracy: ",(preds == y_test).sum()/len(y_test))
print("F1 score: ",f1_score(y_test,preds))

Vocab Size:  16788
Accuracy:  0.6706730769230769
F1 score:  0.18086696562032883


In [220]:
confusion_matrix(y_test, preds)

array([[2111, 1083],
       [  13,  121]], dtype=int64)

In [221]:
df_result[df_result.bin_0_value < df_result.bin_1_value].head()

Unnamed: 0,bin_1_value,bin_0_value
5704,0.034752,0.032477
7959,0.009278,0.008249
13407,0.002149,0.002135
4384,0.000839,0.000227
13743,0.003093,0.001797


In [222]:
#### Considering the words with higher probability in 1 and 0

primary_words = []

for each in dictionary_bin_0.keys():
    if dictionary_bin_0[each] < dictionary_bin_1[each]:
        primary_words.append(each)

###### Significant Words

In [223]:
primary_words[1:10]

['ton',
 'southern',
 'gentle-',
 'union',
 'occurred',
 'sublime',
 'hunt',
 'sovereign',
 'difference']

### 4. Modification - Including the words that are present for the binary class 1

In [224]:
data_bin_1 = X_train[y_train == 1]
data_bin_0 = X_train[y_train == 0]

dict_bin_1_tokens = word_tokenize(' '.join(data_bin_1))
dict_bin_0_tokens = word_tokenize(' '.join(data_bin_0))

### Creating a dictionary for Counter 1
df_bin_1_tokens = pd.DataFrame(dict_bin_1_tokens)
dictionary_bin_1 = Counter(df_bin_1_tokens[~df_bin_1_tokens[0].isin(stop_words)][0].values)

### Creating a dictionary for Counter 1
df_bin_0_tokens = pd.DataFrame(dict_bin_0_tokens)
dictionary_bin_0 = Counter(df_bin_0_tokens[(~(df_bin_0_tokens[0].isin(stop_words))) & \
                                           (df_bin_0_tokens[0].isin(dict_bin_1_tokens))][0].values)

In [225]:
################ Creating a dataframe of probabilites ######################

vocab_size = sum(dictionary_bin_1.values())
for k,v in dictionary_bin_1.items():
    dictionary_bin_1[k] = v/vocab_size
    
vocab_size_0 = sum(dictionary_bin_0.values())
for k_0,v_0 in dictionary_bin_0.items():
    dictionary_bin_0[k_0] = v_0/vocab_size_0

In [226]:
########### Creating a dictionary of all the words in each of the binary category 1 and 0 ##############

bin_1_value = X_test.apply(lambda x: \
                           sum([dictionary_bin_1[each] for each in word_tokenize(x)]))

bin_0_value = X_test.apply(lambda x: \
                           sum([dictionary_bin_0[each] for each in word_tokenize(x)]))

In [227]:
df_result = pd.DataFrame([bin_1_value,bin_0_value]).T
df_result.columns = ['bin_1_value','bin_0_value']

### 3.3 Language Model Result

In [228]:
#(df_result.bin_0_value < df_result.bin_1_value).sum()

In [229]:
y_test.value_counts()

0    3194
1     134
Name: Response_binary, dtype: int64

In [230]:
from sklearn.metrics import confusion_matrix

preds = (df_result.bin_0_value < df_result.bin_1_value).astype('int')

### Evaluation ###
print("Vocab Size", len(dictionary_bin_1.keys()))
print("Accuracy: ",(preds == y_test).sum()/len(y_test))
print("F1 score: ",f1_score(y_test,preds))

Vocab Size 7306
Accuracy:  0.8671875
F1 score:  0.2754098360655738


In [231]:
confusion_matrix(y_test, preds)

array([[2802,  392],
       [  50,   84]], dtype=int64)

In [232]:
df_result[df_result.bin_0_value < df_result.bin_1_value].head()

Unnamed: 0,bin_1_value,bin_0_value
16621,0.000105,7.8e-05
4384,0.000839,0.00027
13743,0.003093,0.002428
5354,0.009435,0.009319
11565,0.008491,0.008134


In [156]:
#### Considering the words with higher probability in 1 and 0

primary_words = []

for each in dictionary_bin_0.keys():
    if dictionary_bin_0[each] < dictionary_bin_1[each]:
        primary_words.append(each)

In [157]:
primary_words[1:10]

['plan',
 'union',
 'empress',
 'sublime',
 'plot',
 'hunt',
 'sovereign',
 'mistress',
 'Besides']