In [67]:
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Reading the text data

In [2]:
# Paragraph text table (the preview below shows one row per paragraph).
text_data_sentence = pd.read_csv(
    './Files/textdatanew.csv',
    encoding='ISO-8859-1',
)

In [3]:
# Preview the first five rows to check which columns were loaded.
text_data_sentence.head(5)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


### Reading the text features

In [4]:
# Load the text-feature table.
# low_memory=False makes pandas parse the file in a single pass, so every
# column gets one consistent dtype instead of being inferred chunk by chunk
# (chunked inference is what triggers DtypeWarning on mixed-type columns; the
# warning residue under this cell is presumably that warning).
text_features = pd.read_csv(
    "text_features.csv",
    encoding='ISO-8859-1',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Show the first two rows of the feature table.
text_features.head(2)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,sadness,joy,fear,disgust,anger,score,sentiment,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,1,0.255896,0.558011,0.101166,0.111615,0.054668,0.290669,positive,...,,,Cleopatra,,,,,,,
1,a001,bio04,2,0.171629,0.257088,0.173474,0.098726,0.267978,0.0,neutral,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,


### Reading the Response file

In [6]:
# Tag annotations; the response labels used below are derived from this table.
bess_tags = pd.read_csv('CBW_Bess_tags_final2.csv')

In [7]:
# Preview the first rows of the tag table.
bess_tags.head()

Unnamed: 0,Content,Event,Type,para no,URI,author,biographyID,collectionID,personaName,title
0,after,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
1,culmination,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
2,middle,name,stageOfLife,2.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
3,middle,name,stageOfLife,3.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
4,middle,name,stageOfLife,4.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...


In [8]:
# Keep only the tag fields and the paragraph / biography / collection identifiers.
bess_reponse = bess_tags.loc[
    :,
    ['Content', 'Event', 'Type', 'para no', 'biographyID', 'collectionID'],
]

In [9]:
# Replace missing values with a blank string in the text (object) columns only.
# The blank is needed so Content/Event can be joined into a label, but filling
# every column would also turn the numeric-looking 'para no' key (shown as
# 1.0, 2.0, ...) into an object column mixing floats and ' ', which is
# unreliable as a join key against the paragraph numbers. Missing paragraph
# numbers therefore stay NaN.
bess_reponse = bess_reponse.fillna(
    {column: ' ' for column in bess_reponse.select_dtypes(include=['object']).columns}
)

In [10]:
# Label each tag row as "<Content>_<Event>".
bess_reponse['Response'] = bess_reponse['Content'] + '_' + bess_reponse['Event']

In [11]:
# Document key: "<biographyID>_<collectionID>".
bess_reponse = bess_reponse.assign(
    Bio_col_id=bess_reponse['biographyID'] + "_" + bess_reponse['collectionID']
)
# Paragraph key: the document key followed by "_<para no>" (as text).
bess_reponse = bess_reponse.assign(
    Bio_col_para_id=bess_reponse['Bio_col_id'] + "_" + bess_reponse['para no'].astype('str')
)

In [12]:
# Document frequency: number of distinct documents (Bio_col_id) in which each
# response occurs, restricted to rows whose Type is 'Event'.
doc_count = (
    bess_reponse[bess_reponse['Type'] == 'Event']
    .groupby(['Response'])['Bio_col_id']
    .nunique()
    .to_frame()
)

#doc_count

In [13]:
# Term frequency: total number of 'Event' rows carrying each response.
term_freq = (
    bess_reponse.loc[bess_reponse['Type'] == 'Event']
    .groupby(['Response'])['Bio_col_id']
    .count()
    .to_frame()
)

In [14]:
# Number of distinct documents across all tag rows (not only 'Event' rows).
total_docs = bess_reponse['Bio_col_id'].drop_duplicates().shape[0]

In [15]:
# Put term and document frequencies side by side, indexed by response.
group_by_counts = pd.concat([term_freq, doc_count], axis=1)
group_by_counts.columns = ['Term_freq', 'Doc_freq']

# tf-idf = term frequency * log(total documents / document frequency)
group_by_counts['tf_idf'] = (
    group_by_counts['Term_freq'] * np.log(total_docs / group_by_counts['Doc_freq'])
)

# Ten responses with the highest tf-idf.
group_by_counts.sort_values(by='tf_idf', ascending=False).head(10)

Unnamed: 0_level_0,Term_freq,Doc_freq,tf_idf
Response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"lover, male, named_agentType",776,75,1091.131263
hospital_locationStructure,617,71,901.378524
"sovereign, male_agentType",655,79,886.959898
"nobleman, named_agentType",853,122,784.388141
husband_agentType,1245,165,768.961337
"royalty, male_agentType",753,111,763.58334
conversation_type,1118,171,650.588288
"nursing, professional_type",281,33,625.808789
famous man_agentType,712,129,615.006161
"officer, military_agentType",540,98,614.853517


### Binary Classification

In [68]:
# Number of distinct paragraphs (Bio_col_para_id) tagged with each response.
bio_response = (
    bess_reponse
    .groupby(['Response'])['Bio_col_para_id']
    .nunique()
    .to_frame()
)
bio_response.sort_values(by='Bio_col_para_id', ascending=False).head()

Unnamed: 0_level_0,Bio_col_para_id
Response,Unnamed: 1_level_1
middle_name,8459
culmination_name,2695
beginning_name,2365
evaluation_type,2286
"emphasis in typeface, punctuation_type",2140


In [22]:
# Re-display the paragraph table before it is joined with the response labels.
text_data_sentence.head()

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


#### Merging the paragraph text with the response labels

In [107]:
# Response label that the binary classifier should detect.
reponse_required = 'husband_agentType'

# Tag rows carrying exactly that response.
reponse_required_to_merge = bess_reponse.loc[
    bess_reponse['Response'] == reponse_required
]

In [108]:
# Left-join the selected response onto every paragraph.
#
# The tag rows are first reduced to one row per paragraph key:
#   * 'para no' is coerced to a number, because an earlier fillna(' ') can
#     leave it as an object column holding floats and blank strings, which does
#     not compare reliably against the integer-looking ParagraphNo key;
#   * rows without a usable paragraph number are dropped (they can never match);
#   * duplicates are dropped on the join keys rather than on whole rows, so tag
#     rows that differ only in another column (e.g. Type) cannot duplicate a
#     paragraph in the merged frame.
tagged_paragraphs = reponse_required_to_merge.copy()
tagged_paragraphs['para no'] = pd.to_numeric(tagged_paragraphs['para no'], errors='coerce')
tagged_paragraphs = (
    tagged_paragraphs
    .dropna(subset=['para no'])
    .drop_duplicates(subset=['collectionID', 'biographyID', 'para no'])
)

text_data_merge = pd.merge(
    text_data_sentence,
    tagged_paragraphs,
    how='left',
    left_on=['CollectionID', 'BiographyID', 'ParagraphNo'],
    right_on=['collectionID', 'biographyID', 'para no'],
)

# With unique right-hand keys a left join must keep exactly one row per paragraph.
assert len(text_data_merge) == len(text_data_sentence), "merge duplicated paragraphs"

In [109]:
# Modelling frame: the paragraph text plus the (possibly missing) response label.
final_data_frame = text_data_merge[['ParagraphText', 'Response']].copy()

In [110]:
# 1 when the paragraph matched the selected response in the merge, else 0.
final_data_frame['Response_binary'] = final_data_frame['Response'].notnull().astype(int)
final_data_frame.head()

Unnamed: 0,ParagraphText,Response,Response_binary
0,"A FRENCH philosopher, moralizing on the great ...",,0
1,Cleopatra was joint heir to the throne of Egyp...,,0
2,Cleopatra might have responded with a brillian...,,0
3,Caesar was then above fifty years of age. His ...,,0
4,For three years Cleopatra reigned with little ...,,0


In [111]:
# Class balance of the binary label (0 = no match for the selected response, 1 = match).
final_data_frame.Response_binary.value_counts()

0    15617
1     1020
Name: Response_binary, dtype: int64

### Text - Preprocessing

#### Getting stop words - High Frequency and Low Frequency word list

In [112]:
from collections import Counter
from nltk.tokenize import word_tokenize

In [113]:
# Tokenise every paragraph into a list of word tokens.
tokenized_para = final_data_frame['ParagraphText'].map(word_tokenize)

In [114]:
# Flatten the per-paragraph token lists into one corpus-wide token list.
all_sent = []
for paragraph_tokens in tokenized_para:
    all_sent.extend(paragraph_tokens)

In [115]:
# Corpus-wide frequency of every token.
count_dict = Counter(all_sent)

In [116]:
# The 500 most frequent tokens in the corpus, treated as stop words later on.
high_freq_words = [token for token, _ in count_dict.most_common(500)]

In [117]:
# Tokens seen fewer than `threshold` times in the whole corpus.
threshold = 5
less_freq_words = [
    token
    for token, occurrences in count_dict.items()
    if occurrences < threshold
]

In [118]:
# Final stop list: NLTK English stop words, then the high-frequency tokens,
# then the low-frequency tokens (kept as a list, in that order).
stop_words = stopwords.words('english') + high_freq_words + less_freq_words

### Creating Training and Test datasets

In [119]:
# Hold out 20% of the paragraphs for testing; random_state=0 pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    final_data_frame['ParagraphText'],
    final_data_frame['Response_binary'],
    test_size=0.2,
    random_state=0,
)

### Splitting the training set by class label

In [120]:
# Training paragraphs of the positive class (label 1) ...
data_bin_1 = X_train.loc[y_train == 1]
# ... and of the negative class (label 0).
data_bin_0 = X_train.loc[y_train == 0]

#### Bag of Words

In [121]:
# Per-class bag of words: join each class's training paragraphs into one
# string, tokenise it, and count the tokens.
dictionary_bin_1 = Counter(
    word_tokenize(' '.join(data_bin_1.tolist()))
)
dictionary_bin_0 = Counter(
    word_tokenize(' '.join(data_bin_0.tolist()))
)

In [128]:
# Class-1 score of each test paragraph: the sum, over its non-stop-word tokens,
# of how often each token occurred in the class-1 training text.
#
# The stop list is converted to a set so each membership test is a hash lookup
# rather than a scan of the whole list. A Counter returns 0 for unseen tokens,
# so no explicit "token in dictionary" check is needed.
#
# NOTE(review): these are raw counts, not normalised by the amount of training
# text per class, so the larger class scores higher almost by construction;
# the comparison cell further down reports 0 paragraphs where class 1 wins.
stop_word_lookup = set(stop_words)
bin_1_value = X_test.apply(
    lambda paragraph: sum(
        dictionary_bin_1[token]
        for token in word_tokenize(paragraph)
        if token not in stop_word_lookup
    )
)

In [129]:
# Class-0 score of each test paragraph: the sum, over its non-stop-word tokens,
# of how often each token occurred in the class-0 training text.
#
# The stop list is converted to a set so each membership test is a hash lookup
# rather than a scan of the whole list. A Counter returns 0 for unseen tokens,
# so no explicit "token in dictionary" check is needed.
stop_word_lookup = set(stop_words)
bin_0_value = X_test.apply(
    lambda paragraph: sum(
        dictionary_bin_0[token]
        for token in word_tokenize(paragraph)
        if token not in stop_word_lookup
    )
)

In [132]:
# Both class scores side by side, one row per test paragraph.
df_result = pd.concat(
    [bin_1_value, bin_0_value],
    axis=1,
    keys=['bin_1_value', 'bin_0_value'],
)

In [133]:
# Number of test paragraphs whose class-1 score is strictly greater than their class-0 score.
(df_result.bin_0_value < df_result.bin_1_value).sum()

0

In [2]:
#X_test.head()

In [134]:
# Show only the first rows rather than rendering the whole score table.
df_result.head(10)

Unnamed: 0,bin_1_value,bin_0_value
6202,111,1314
13128,137,1411
16621,9,68
7057,451,5264
5704,1107,12119
4664,150,1908
14089,158,1875
4542,424,4135
142,160,2128
15739,54,628
