In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
import scipy as sp

## 1. Reading the text data

In [4]:
text_data_sentence = pd.read_csv('./Files/textdatanew.csv', encoding='ISO-8859-1')
text_data_sentence.head(5)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


## 2. Reading the text features

In [15]:
text_features = pd.read_csv("text_features.csv", encoding='ISO-8859-1')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
text_features.head(2)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,sadness,joy,fear,disgust,anger,score,sentiment,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,1,0.255896,0.558011,0.101166,0.111615,0.054668,0.290669,positive,...,,,Cleopatra,,,,,,,
1,a001,bio04,2,0.171629,0.257088,0.173474,0.098726,0.267978,0.0,neutral,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,


### 3. Reading the Response file

In [8]:
bess_tags = pd.read_csv('CBW_Bess_tags_final2.csv')

In [9]:
bess_tags.head()

Unnamed: 0,Content,Event,Type,para no,URI,author,biographyID,collectionID,personaName,title
0,after,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
1,culmination,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
2,middle,name,stageOfLife,2.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
3,middle,name,stageOfLife,3.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
4,middle,name,stageOfLife,4.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...


## 4. Preprocessing BESS Response file

In [10]:
bess_reponse = bess_tags.loc[:,['Content','Event','Type','para no','biographyID','collectionID']]
bess_reponse= bess_reponse.fillna(' ')

### Creating a new column for the response variable
bess_reponse.loc[:,'Response'] = bess_reponse.loc[:,['Content','Event']].apply(lambda x: '_'.join(x),axis = 1)


### Concatenating columns to create new columns
bess_reponse['Bio_col_id'] = bess_reponse['biographyID'] +"_" + bess_reponse['collectionID']
bess_reponse['Bio_col_para_id'] = bess_reponse['Bio_col_id'] +"_" + bess_reponse['para no'].astype('str')

#### 4.1 Selecting the top BESS reponses for events based on TF-IDF method

In [12]:
doc_count = pd.DataFrame(bess_reponse[bess_reponse.Type.isin(['Event'])].\
                         groupby(['Response'])['Bio_col_id'].apply(lambda x: len(np.unique(x))))

term_freq = pd.DataFrame(bess_reponse[bess_reponse.Type.isin(['Event'])].\
                            groupby(['Response'])['Bio_col_id'].count())

total_docs = len(bess_reponse['Bio_col_id'].unique())

##### 4.2 Grouping by the term frequencies to get the top values

In [14]:
group_by_counts = pd.concat([term_freq,doc_count],axis = 1)

group_by_counts.columns = ['Term_freq','Doc_freq']
group_by_counts['tf_idf'] = pd.DataFrame(group_by_counts['Term_freq'] * np.log(total_docs/group_by_counts['Doc_freq']) )

group_by_counts.sort_values(['tf_idf'],ascending=False)[0:10]

Unnamed: 0_level_0,Term_freq,Doc_freq,tf_idf
Response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"lover, male, named_agentType",776,75,1091.131263
hospital_locationStructure,617,71,901.378524
"sovereign, male_agentType",655,79,886.959898
"nobleman, named_agentType",853,122,784.388141
husband_agentType,1245,165,768.961337
"royalty, male_agentType",753,111,763.58334
conversation_type,1118,171,650.588288
"nursing, professional_type",281,33,625.808789
famous man_agentType,712,129,615.006161
"officer, military_agentType",540,98,614.853517


### 5. Preparing Final Respone File

#### 5.1 Getting a distribution of all the responses

In [16]:
bio_response = pd.DataFrame(bess_reponse.groupby(['Response'])['Bio_col_para_id'].apply(lambda x: len(np.unique(x))))
bio_response.sort_values(['Bio_col_para_id'],ascending=False).head(10)

Unnamed: 0_level_0,Bio_col_para_id
Response,Unnamed: 1_level_1
middle_name,8459
culmination_name,2695
beginning_name,2365
evaluation_type,2286
"emphasis in typeface, punctuation_type",2140
present tense_type,1645
"quotation, agent's speech, unique_type",1606
description of agent or identified set of agents_type,1535
"direct address, use of we_type",1481
city_locationSetting,1180


### 5.2 Selecting the response to Analyse

In [18]:
reponse_required = 'lover, male, named_agentType'

In [21]:
reponse_required_to_merge = bess_reponse[bess_reponse.Response == reponse_required]

### Merging the response with the text data file
text_data_merge = pd.merge(text_data_sentence, reponse_required_to_merge.drop_duplicates(),\
                     how = 'left', left_on=['CollectionID','BiographyID','ParagraphNo'],
                         right_on=['collectionID','biographyID','para no'])

final_data_frame = text_data_merge.loc[:,['ParagraphText','Response']]
final_data_frame['Response_binary'] = np.where(final_data_frame.Response.isnull(),0,1)
final_data_frame.head()

Unnamed: 0,ParagraphText,Response,Response_binary
0,"A FRENCH philosopher, moralizing on the great ...",,0
1,Cleopatra was joint heir to the throne of Egyp...,,0
2,Cleopatra might have responded with a brillian...,,0
3,Caesar was then above fifty years of age. His ...,"lover, male, named_agentType",1
4,For three years Cleopatra reigned with little ...,,0


In [28]:
final_data_frame.Response_binary.value_counts()

0    16054
1      583
Name: Response_binary, dtype: int64

## 6. Text Data - Preprocessing on the Final Response file

### 6.1 Getting stop words
#### High Frequency and Low Frequency word list

In [45]:
tokenized_para = final_data_frame.ParagraphText.apply(word_tokenize)

all_sent = [words for each_sent in tokenized_para for words in each_sent]

count_dict = Counter(all_sent)
high_freq_words = [word for (word,count) in count_dict.most_common(500)]

#### Getting Low Frequency words - based on a threshold
less_freq_words = []
threshold = 5

for k,v in count_dict.items():
    
    if v < threshold:
        less_freq_words.append(k)
        
stop_words = stopwords.words('english')
stop_words.extend(high_freq_words)
stop_words.extend(less_freq_words)

#### 6.2 Bag of Words

In [46]:
bow_model = CountVectorizer(ngram_range= (1,2),stop_words=stop_words)
Para_text_bow = bow_model.fit_transform(final_data_frame.ParagraphText)

features = bow_model.get_feature_names()

## 7. Model Building

### 7.1 Splitting data into train and test

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(Para_text_bow ,final_data_frame.Response_binary,
                                                    test_size = 0.3, random_state = 0)

In [1]:
# features = bow_model.get_feature_names()
# features.extend(['Sentiment'])
# features.extend(emotional_features.columns.values)

### 7.2 Machine Learning Models

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

  from numpy.core.umath_tests import inner1d


In [35]:
#knn_model = KNeighborsClassifier(n_neighbors= 3, p = 1.5)
rf_model = RandomForestClassifier(n_estimators= 50)
lr_model = LogisticRegression(multi_class='multinomial')
#rf_model = SVC(C = 10, kernel = 'poly')

In [36]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 7.3 Reviewing the response

In [43]:
pd.DataFrame(rf_model.predict(X_test))[0].value_counts()

0    4986
1       6
Name: 0, dtype: int64

##### Almost all the values are predicited as 1. Now looking at the confusion matrix, even all the predicted ones are not correct

In [44]:
from sklearn.metrics import confusion_matrix
preds = pd.DataFrame([np.argmax(each) if each.sum() != 0 else 10 for each in rf_model.predict(X_test)])[0]

confusion_matrix(y_test,preds)

array([[   3,    0, 4808],
       [   3,    0,  178],
       [   0,    0,    0]], dtype=int64)

In [47]:
feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                  index = features,
                                  columns=['importance']).sort_values('importance',ascending=False)

feature_importances.head(20)

Unnamed: 0,importance
musset,0.003544
duel,0.002111
almachildes,0.002069
chopin,0.001784
st croix,0.001782
lovers,0.001664
george sand,0.001391
pleaded,0.001342
emmet,0.001299
liszt,0.001205
