In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split

## 1. Reading input datasets

### 1.1 Reading the text data

In [2]:
# Load the paragraph-level biography text (decoded as ISO-8859-1) and preview it.
text_data_path = '/Users/user/Documents/github/CBW/data/textdatanew.csv'
text_data_sentence = pd.read_csv(text_data_path, encoding='ISO-8859-1')
text_data_sentence.head(5)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


### 1.2 Reading the text features

In [3]:
# Load the per-paragraph feature table (decoded as ISO-8859-1) and preview two rows.
text_features_path = "/Users/user/Documents/github/CBW/data/text_features.csv"
text_features = pd.read_csv(text_features_path, encoding='ISO-8859-1')
text_features.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,CollectionID,BiographyID,ParagraphNo,sadness,joy,fear,disgust,anger,score,sentiment,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,1,0.255896,0.558011,0.101166,0.111615,0.054668,0.290669,positive,...,,,Cleopatra,,,,,,,
1,a001,bio04,2,0.171629,0.257088,0.173474,0.098726,0.267978,0.0,neutral,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,


### 1.3 Reading the Response file

In [4]:
# Load the tag annotations that the response labels are derived from and preview them.
bess_tags_path = '/Users/user/Documents/github/CBW/data/CBW_Bess_tags_final2.csv'
bess_tags = pd.read_csv(bess_tags_path)
bess_tags.head()

Unnamed: 0,Content,Event,Type,para no,URI,author,biographyID,collectionID,personaName,title
0,after,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
1,culmination,name,stageOfLife,1.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
2,middle,name,stageOfLife,2.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
3,middle,name,stageOfLife,3.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...
4,middle,name,stageOfLife,4.0,a001.bio04.bess.xml,Willis John Abbot,bio04,a001,Cleopatra,Cleopatra (B.C. 69-30): The World's Most Famou...


### 2. Preprocessing Data

### 2.1 Getting the top personaDesc types

In [5]:
# Keep only the tag columns used below and replace missing values with a single
# space so the string concatenations never meet NaN.
tag_columns = ['Content', 'Event', 'Type', 'para no', 'biographyID', 'collectionID']
bess_reponse = bess_tags.loc[:, tag_columns].fillna(' ')

# Response label = "<Content>_<Event>", e.g. "beautiful_type".
bess_reponse['Response'] = bess_reponse['Content'] + '_' + bess_reponse['Event']

# Document key (biography within a collection) and paragraph key.
bess_reponse['Bio_col_id'] = bess_reponse['biographyID'] + "_" + bess_reponse['collectionID']
bess_reponse['Bio_col_para_id'] = bess_reponse['Bio_col_id'] + "_" + bess_reponse['para no'].astype('str')

#############################################################################
########## TF-IDF approach to get the top personaDescription responses ######
#############################################################################

# Both frequencies are computed over personaDescription tags only.
persona_desc_tags = bess_reponse[bess_reponse['Type'] == 'personaDescription']
docs_per_response = persona_desc_tags.groupby('Response')['Bio_col_id']

# Document frequency: number of distinct documents in which a response occurs.
doc_count = pd.DataFrame(docs_per_response.nunique())

# Term frequency: number of tag rows carrying the response.
term_freq = pd.DataFrame(docs_per_response.count())

# Number of distinct documents across all tag types.
total_docs = bess_reponse['Bio_col_id'].nunique()

In [6]:
# Put term and document frequency side by side, indexed by response label.
group_by_counts = pd.concat([term_freq, doc_count], axis=1)
group_by_counts.columns = ['Term_freq', 'Doc_freq']

# tf-idf = term frequency * ln(total documents / document frequency)
inverse_doc_freq = np.log(total_docs / group_by_counts['Doc_freq'])
group_by_counts['tf_idf'] = group_by_counts['Term_freq'] * inverse_doc_freq

# The 20 responses with the highest tf-idf.
group_by_counts.sort_values('tf_idf', ascending=False).head(20)

Unnamed: 0_level_0,Term_freq,Doc_freq,tf_idf
Response,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beautiful_type,538,157,359.028542
dedicated or devoted_type,403,137,323.852483
self-sacrificing_type,214,73,306.688891
energetic or untiring_type,343,128,298.943309
"faith, confident in_type",181,68,272.238009
loving_type,236,97,271.134293
devout_type,176,70,259.615815
"skilled, in occupation_type",228,100,254.998601
brave_type,332,142,254.895671
happy_type,293,131,248.577619


In [7]:
# bio_response = pd.DataFrame(bess_reponse.groupby(['Response'])['Bio_col_para_id'].apply(lambda x: len(np.unique(x))))
# bio_response.sort_values(['Bio_col_para_id'],ascending=False).head()

### 2.2 Creating the Response Variable for the model

In [8]:
### Select the event for building the model
# The value is compared against the `Response` column, whose labels are built
# as "<Content>_<Event>".
reponse_required = 'beautiful_type'

In [9]:
# Tag rows carrying the selected response label. An explicit copy is taken so
# that the dtype cast below writes to an independent frame rather than to a
# slice of `bess_reponse` (assigning into the slice triggers pandas'
# SettingWithCopyWarning).
reponse_required_to_merge = bess_reponse[bess_reponse.Response == reponse_required].copy()

# Give the paragraph-number columns the same integer dtype on both sides so the
# merge keys compare equal.
text_data_sentence['ParagraphNo'] = text_data_sentence['ParagraphNo'].astype('int')
reponse_required_to_merge['para no'] = reponse_required_to_merge['para no'].astype('int')

##### Merging the features and the response dataset
# Left join: every paragraph is kept, and paragraphs without a matching tag get
# NaN in the tag columns. Exact duplicate tag rows are dropped before merging.
# NOTE(review): tag rows that differ in any column can still match the same
# paragraph more than once - confirm whether paragraphs must stay unique here.
text_data_merge = pd.merge(
    text_data_sentence,
    reponse_required_to_merge.drop_duplicates(),
    how='left',
    left_on=['CollectionID', 'BiographyID', 'ParagraphNo'],
    right_on=['collectionID', 'biographyID', 'para no'],
)

########## Final Data Frame #############
# Copy the two needed columns so adding a column never writes into a view of
# `text_data_merge`.
final_data_frame = text_data_merge.loc[:, ['ParagraphText', 'Response']].copy()

# 1 when the paragraph carries the selected response, 0 otherwise.
final_data_frame['Response_binary'] = np.where(final_data_frame.Response.isnull(), 0, 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


##### Distribution of the Response variable

In [10]:
# Number of paragraphs per Response_binary value (class balance).
final_data_frame.Response_binary.value_counts()

0    16107
1      530
Name: Response_binary, dtype: int64

### 2.3 Preprocessing the Paragraph Text

##### 2.3.1 StopWord collection

#### Getting stop words - High Frequency and Low Frequency word list

In [11]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Token counts over every paragraph of the final data frame.
tokenized_para = final_data_frame.ParagraphText.apply(word_tokenize)
all_sent = [token for para_tokens in tokenized_para for token in para_tokens]
count_dict = Counter(all_sent)

# High-frequency list: the 500 most common tokens.
high_freq_words = [token for token, _ in count_dict.most_common(500)]

# Low-frequency list: tokens seen fewer than `threshold` times.
threshold = 5
less_freq_words = [token for token, freq in count_dict.items() if freq < threshold]

######### List of all stop words ##########
# NLTK English stop words, followed by the high- and low-frequency tokens.
stop_words = stopwords.words('english') + high_freq_words + less_freq_words

### 3. Model Building

#### 3.1 Creating Training and Test datasets

In [12]:
# Hold out 20% of the paragraphs for testing; random_state is fixed at 0.
paragraph_text = final_data_frame.ParagraphText
paragraph_label = final_data_frame.Response_binary
X_train, X_test, y_train, y_test = train_test_split(
    paragraph_text, paragraph_label, test_size=0.2, random_state=0)

#### 3.2 Splitting the dataset into two categories

In [13]:
# Training paragraphs split by class.
data_bin_1 = X_train[y_train == 1]
data_bin_0 = X_train[y_train == 0]

# All tokens of each class, obtained by joining the class's paragraphs into one
# string and tokenising it.
dict_bin_1_tokens = word_tokenize(' '.join(data_bin_1))
dict_bin_0_tokens = word_tokenize(' '.join(data_bin_0))

# `stop_words` is a list, so `token not in stop_words` rescans it for every
# token. A set gives the same membership answers with constant-time lookups.
stop_word_set = set(stop_words)

# Per-class token counts with stop words removed.
dictionary_bin_1 = Counter(token for token in dict_bin_1_tokens if token not in stop_word_set)
dictionary_bin_0 = Counter(token for token in dict_bin_0_tokens if token not in stop_word_set)

In [14]:
################ Converting the token counts into relative frequencies ################
# Class 1: every count is divided by the total of all class-1 counts.
vocab_size = sum(dictionary_bin_1.values())
for token in dictionary_bin_1:
    dictionary_bin_1[token] = dictionary_bin_1[token] / vocab_size

# Class 0: same normalisation with the class-0 total.
vocab_size_0 = sum(dictionary_bin_0.values())
for token in dictionary_bin_0:
    dictionary_bin_0[token] = dictionary_bin_0[token] / vocab_size_0

In [15]:
########### Scoring every test paragraph against the class 1 and class 0 dictionaries ##############

# Set copy of the stop-word list: identical membership results, constant-time lookups.
stop_word_set = set(stop_words)

# Tokenise each test paragraph once and reuse the tokens for both classes
# (tokenising inside each scoring lambda did the same work twice).
X_test_tokens = X_test.apply(word_tokenize)

# A paragraph's score for a class is the sum of the class's relative
# frequencies over its tokens. Stop words and tokens missing from the class
# dictionary contribute nothing, so a paragraph with no usable token scores 0.
bin_1_value = X_test_tokens.apply(
    lambda tokens: sum(dictionary_bin_1[token] for token in tokens
                       if token in dictionary_bin_1 and token not in stop_word_set))

bin_0_value = X_test_tokens.apply(
    lambda tokens: sum(dictionary_bin_0[token] for token in tokens
                       if token in dictionary_bin_0 and token not in stop_word_set))

In [16]:
# One row per test paragraph, one column per class score.
score_columns = ['bin_1_value', 'bin_0_value']
df_result = pd.DataFrame([bin_1_value, bin_0_value]).transpose()
df_result.columns = score_columns

### 3.3 Language Model Result

In [17]:
# Number of test paragraphs whose class-1 score is strictly greater than their
# class-0 score.
(df_result.bin_0_value < df_result.bin_1_value).sum()

1274

###### Predicted-Positive Rate and Class Distribution

In [18]:
# Percentage of test paragraphs whose class-1 score exceeds their class-0 score.
# This is the predicted-positive rate: the true labels are not used here, so it
# is not an accuracy figure.
100 * (df_result.bin_0_value < df_result.bin_1_value).sum()/len(df_result.bin_1_value)

38.28125

In [19]:
# Number of test paragraphs per true class.
y_test.value_counts()

0    3229
1      99
Name: Response_binary, dtype: int64

###### Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix

# Predict class 1 when the class-1 score is strictly larger than the class-0 score.
preds = (df_result.bin_1_value > df_result.bin_0_value).astype('int')

# Accuracy: share of test paragraphs whose prediction equals the true label.
n_correct = (preds == y_test).sum()
n_correct / len(y_test)

0.6415264423076923

In [21]:
# Called as (true labels, predictions): rows are the true classes and columns
# the predicted classes, both in the order 0, 1.
confusion_matrix(y_test, preds)

array([[2045, 1184],
       [   9,   90]])

In [22]:
# First few test paragraphs whose class-1 score exceeds their class-0 score.
df_result[df_result.bin_0_value < df_result.bin_1_value].head()

Unnamed: 0,bin_1_value,bin_0_value
13128,0.00489,0.003756
7057,0.014719,0.014046
5704,0.033262,0.032548
4664,0.005713,0.005036
142,0.00581,0.005628


In [23]:
#### Words whose class-1 relative frequency is higher than their class-0 relative frequency

# Only words present in the class-0 dictionary are examined. A word missing
# from the class-1 Counter looks up as 0 there and therefore never qualifies.
primary_words = [word for word in dictionary_bin_0.keys()
                 if dictionary_bin_0[word] < dictionary_bin_1[word]]

###### Significant Words

In [24]:
# NOTE(review): the slice starts at index 1, so the first entry of
# primary_words is skipped and nine words are shown - confirm that skipping
# index 0 is intended.
primary_words[1:10]

['courage',
 'stuck',
 'ostensibly',
 'ton',
 'gentle-',
 'union',
 'probably',
 'hunt',
 'sovereign']

### 4. Modification - Including the words that are present for the binary class 1

In [25]:
# Training paragraphs split by class.
data_bin_1 = X_train[y_train == 1]
data_bin_0 = X_train[y_train == 0]

# All tokens of each class, obtained by joining the class's paragraphs into one
# string and tokenising it.
dict_bin_1_tokens = word_tokenize(' '.join(data_bin_1))
dict_bin_0_tokens = word_tokenize(' '.join(data_bin_0))

# Membership tests against the stop-word list and the full class-1 token list
# rescan those lists for every token. Sets give the same answers with
# constant-time lookups.
stop_word_set = set(stop_words)
bin_1_token_set = set(dict_bin_1_tokens)

# Class-1 token counts with stop words removed.
dictionary_bin_1 = Counter(token for token in dict_bin_1_tokens if token not in stop_word_set)

# Class-0 token counts, restricted to non-stop-word tokens that also occur
# somewhere in the class-1 paragraphs.
dictionary_bin_0 = Counter(token for token in dict_bin_0_tokens
                           if token not in stop_word_set and token in bin_1_token_set)

In [26]:
################ Converting the token counts into relative frequencies ################

# Class 1: every count is divided by the total of all class-1 counts.
vocab_size = sum(dictionary_bin_1.values())
for token in dictionary_bin_1:
    dictionary_bin_1[token] = dictionary_bin_1[token] / vocab_size

# Class 0: same normalisation with the class-0 total.
vocab_size_0 = sum(dictionary_bin_0.values())
for token in dictionary_bin_0:
    dictionary_bin_0[token] = dictionary_bin_0[token] / vocab_size_0

In [27]:
########### Scoring every test paragraph against the class 1 and class 0 dictionaries ##############

# Set copy of the stop-word list: identical membership results, constant-time lookups.
stop_word_set = set(stop_words)

# Tokenise each test paragraph once and reuse the tokens for both classes
# (tokenising inside each scoring lambda did the same work twice).
X_test_tokens = X_test.apply(word_tokenize)

# A paragraph's score for a class is the sum of the class's relative
# frequencies over its tokens. Stop words and tokens missing from the class
# dictionary contribute nothing, so a paragraph with no usable token scores 0.
bin_1_value = X_test_tokens.apply(
    lambda tokens: sum(dictionary_bin_1[token] for token in tokens
                       if token in dictionary_bin_1 and token not in stop_word_set))

bin_0_value = X_test_tokens.apply(
    lambda tokens: sum(dictionary_bin_0[token] for token in tokens
                       if token in dictionary_bin_0 and token not in stop_word_set))

In [28]:
# One row per test paragraph, one column per class score.
score_columns = ['bin_1_value', 'bin_0_value']
df_result = pd.DataFrame([bin_1_value, bin_0_value]).transpose()
df_result.columns = score_columns

### 4.1 Language Model Result

In [29]:
# Number of test paragraphs whose class-1 score is strictly greater than their
# class-0 score.
(df_result.bin_0_value < df_result.bin_1_value).sum()

465

In [30]:
# Number of test paragraphs per true class.
y_test.value_counts()

0    3229
1      99
Name: Response_binary, dtype: int64

In [31]:
from sklearn.metrics import confusion_matrix

# Predict class 1 when the class-1 score is strictly larger than the class-0 score.
preds = (df_result.bin_1_value > df_result.bin_0_value).astype('int')

# Accuracy: share of test paragraphs whose prediction equals the true label.
n_correct = (preds == y_test).sum()
n_correct / len(y_test)

0.8665865384615384

In [32]:
# Called as (true labels, predictions): rows are the true classes and columns
# the predicted classes, both in the order 0, 1.
confusion_matrix(y_test, preds)

array([[2824,  405],
       [  39,   60]])

In [33]:
# First few test paragraphs whose class-1 score exceeds their class-0 score.
df_result[df_result.bin_0_value < df_result.bin_1_value].head()

Unnamed: 0,bin_1_value,bin_0_value
13128,0.00489,0.004762
4384,0.000387,0.000292
10243,0.014331,0.009127
13743,0.00213,0.00208
10356,0.014283,0.014111


In [34]:
#### Words whose class-1 relative frequency is higher than their class-0 relative frequency

# Only words present in the class-0 dictionary are examined. A word missing
# from the class-1 Counter looks up as 0 there and therefore never qualifies.
primary_words = [word for word in dictionary_bin_0.keys()
                 if dictionary_bin_0[word] < dictionary_bin_1[word]]

In [35]:
# NOTE(review): the slice starts at index 1, so the first entry of
# primary_words is skipped and nine words are shown - confirm that skipping
# index 0 is intended.
primary_words[1:10]

['ostensibly',
 'gentle-',
 'union',
 'hunt',
 'sovereign',
 'recognize',
 'blood',
 'mistress',
 'rapt']