In [40]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
import scipy as sp
from sklearn.model_selection import train_test_split

# 1. Reading required input files


## 1.1 Reading the text data

In [6]:
text_data_sentence = pd.read_csv('textdatanew.csv', encoding='ISO-8859-1')
text_data_sentence.head(5)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText
0,a001,bio04,1,"A FRENCH philosopher, moralizing on the great ..."
1,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...
2,a001,bio04,3,Cleopatra might have responded with a brillian...
3,a001,bio04,4,Caesar was then above fifty years of age. His ...
4,a001,bio04,5,For three years Cleopatra reigned with little ...


## 1.2 Reading the text features

In [7]:
text_features = pd.read_csv("text_features.csv", encoding='ISO-8859-1')
text_features.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,CollectionID,BiographyID,ParagraphNo,sadness,joy,fear,disgust,anger,score,sentiment,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,1,0.255896,0.558011,0.101166,0.111615,0.054668,0.290669,positive,...,,,Cleopatra,,,,,,,
1,a001,bio04,2,0.171629,0.257088,0.173474,0.098726,0.267978,0.0,neutral,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,


## 1.3 Reading the Response file

In [11]:
bess_tags = pd.read_csv('CBW_Bess_tags_final2.csv')

## 2. Preprocessing Files

### 2.1 Preprocessing BESS Response file

In [12]:
#### Processing BESS Response file
bess_reponse = bess_tags.loc[:,['Content','Event','Type','para no','biographyID','collectionID']]
bess_reponse= bess_reponse.fillna(' ')

bess_reponse.loc[:,'Response'] = bess_reponse.loc[:,['Content','Event']].apply(lambda x: '_'.join(x),axis = 1)


### GEtting the top events from the response file
top_10_events = bess_reponse[bess_reponse.Type.isin(['Event'])].Response.value_counts()[0:10]
top_10_events

city_locationSetting            1351
husband_agentType               1245
conversation_type               1118
nobleman, named_agentType        853
father_agentType                 818
lover, male, named_agentType     776
royalty, male_agentType          753
famous man_agentType             712
writing letter_type              667
sovereign, male_agentType        655
Name: Response, dtype: int64

In [26]:
bess_response_df = bess_reponse.loc[:,['para no','biographyID','collectionID','Response','Type']].drop_duplicates()

#Creating final BESS respone file
final_response_file = bess_response_df[bess_response_df.Response.isin(top_10_events.index)]

#### Formatting the final response file

final_response_file.loc[:,'values'] = 1

final_response_wide = (final_response_file.pivot_table(index=['para no','biographyID','collectionID','Type'],\
                                       columns='Response', values='values').reset_index())

final_response_wide = final_response_wide.fillna(0)


### Merging with the text data file
text_data_sentence_final = pd.merge(text_data_sentence,final_response_wide, how = 'left', left_on = ['CollectionID','BiographyID','ParagraphNo'],\
                             right_on = ['collectionID','biographyID','para no'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [28]:
text_data_sentence_final = text_data_sentence_final[~text_data_sentence_final['para no'].isna()]
text_data_sentence_final_response = text_data_sentence_final[text_data_sentence_final.columns[-10:]]

### Final data with the text features

text_features.drop(['ParagraphText'],axis = 1,inplace = True)
text_data_sentence_final_2 = pd.merge(text_data_sentence_final,text_features,how = 'left',
                                 on=['CollectionID', 'BiographyID', 'ParagraphNo'])

In [29]:
text_data_sentence_final_2.head(3)

Unnamed: 0,CollectionID,BiographyID,ParagraphNo,ParagraphText,para no,biographyID,collectionID,Type,city_locationSetting,conversation_type,...,Number,Organization,Person,PrintMedia,Quantity,Sport,SportingEvent,TelevisionShow,Time,Vehicle
0,a001,bio04,2,Cleopatra was joint heir to the throne of Egyp...,2.0,bio04,a001,Event,1.0,0.0,...,,Roman senate,"Cleopatra, Julius Caesar, Pompey, Ptolemy",,,,,,,
1,a001,bio04,3,Cleopatra might have responded with a brillian...,3.0,bio04,a001,Event,1.0,0.0,...,,,"Apollodorus, Caesar, Cleopatra",,,,,,,
2,a001,bio04,4,Caesar was then above fifty years of age. His ...,4.0,bio04,a001,Event,1.0,0.0,...,,,"Brutus, Caesar, Cleopatra, Julius, Ptolemy",,fifty years,,,,,


### 2.2 Preprocessing Text file

#### 2.2.1 Getting stop words - High Frequency and Low Frequency word list

In [34]:
tokenized_para = text_data_sentence_final_2.ParagraphText.apply(word_tokenize)

all_sent = [words for each_sent in tokenized_para for words in each_sent]
count_dict = Counter(all_sent)
high_freq_words = [word for (word,count) in count_dict.most_common(500)]

less_freq_words = []
threshold = 5

for k,v in count_dict.items():
    
    if v < threshold:
        less_freq_words.append(k)

### Combining all the stop words
stop_words = stopwords.words('english')
stop_words.extend(high_freq_words)
stop_words.extend(less_freq_words)

### 2.3 Bag of  Words Features

In [35]:
bow_model = CountVectorizer(ngram_range= (1,2),stop_words=stop_words)
Para_text_bow = bow_model.fit_transform(text_data_sentence_final_2.ParagraphText)

#### 2.3.2 Tf-IDF

In [36]:
tf_idf_model = TfidfVectorizer(ngram_range=(1,2),stop_words = stop_words)
Para_text_tfidf = tf_idf_model.fit_transform(text_data_sentence_final.ParagraphText)

### 2.4 Combining the bag of words with other features

In [38]:
### Getting sentiment and emotional feautres

emotional_features = text_data_sentence_final_2.loc[:,'sadness':'score']
emotional_features = emotional_features.fillna(0)

## Converting Sentiment to codes
sentiment_cat_codes = pd.DataFrame(text_data_sentence_final_2['sentiment'].astype('category').cat.codes).values.shape

#### Getting the encoded version of IBM features
text_data_ibm_features = text_data_sentence_final_2.loc[:,'Anatomy':'Vehicle'].fillna(' ')
text_data_ibm_features_codes = text_data_ibm_features.apply(lambda x: x.astype('category').cat.codes)

In [58]:
#Para_text_bow,
X = sp.sparse.hstack((Para_text_bow,
                      sentiment_cat_codes,
                      emotional_features,
                      text_data_ibm_features_codes.values) ,format='csr')

## 3. Model Creation 

### 3.1 Splitting data into train and test

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X ,text_data_sentence_final_response,
                                                    test_size = 0.3, random_state = 0)

In [61]:
features = bow_model.get_feature_names()
features.extend(['Sentiment'])
features.extend(emotional_features.columns.values)
features.extend(text_data_ibm_features_codes.columns.values)

### 3.2 Machine Learning Models

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [63]:
#knn_model = KNeighborsClassifier(n_neighbors= 3, p = 1.5)
rf_model = RandomForestClassifier(n_estimators= 50)
lr_model = LogisticRegression(multi_class='multinomial')

In [64]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### 3.3 Analyzing the Response

In [47]:
pd.DataFrame([np.argmax(each) if each.sum() != 0 else 'All zeros' for each in rf_model.predict(X_test)])[0].value_counts()

All zeros    1430
1             118
0              59
2               5
9               4
5               4
7               3
4               3
3               3
6               2
Name: 0, dtype: int64

In [66]:
from sklearn.metrics import confusion_matrix

### Storing the Prediction for confusion matrix
preds = pd.DataFrame([np.argmax(each) if each.sum() != 0 else 10 for each in rf_model.predict(X_test)])[0]

In [69]:
feature_importances = pd.DataFrame(rf_model.feature_importances_,
                                  index = features,
                                  columns=['importance']).sort_values('importance',ascending=False)

feature_importances.head(10)

Unnamed: 0,importance
joy,0.007274
Location,0.007007
sadness,0.006532
Person,0.006273
anger,0.005881
fear,0.005751
disgust,0.005751
score,0.005159
london,0.004046
prince,0.002984


In [70]:
np.array(rf_model.predict_proba(X_test)).shape

(10, 1631, 2)

#### Getting the result by changing the threshold

In [73]:
result = [np.argmax([int(each[1]>0.4) for each in np.array(rf_model.predict_proba(X_test))[:,each_example,:]]) if \
     sum([int(each[1]>0.4) for each in np.array(rf_model.predict_proba(X_test))[:,each_example,:]]) > 0 else 10
            for each_example in range(1000)]

In [72]:
pd.DataFrame(result)[0].value_counts()

10    585
1     270
0     103
4      11
6       6
5       6
3       5
2       5
9       3
8       3
7       3
Name: 0, dtype: int64

In [205]:
y_test.melt()[y_test.melt().value.isin([1])].variable.value_counts()

city_locationSetting            328
conversation_type               302
husband_agentType               295
father_agentType                206
royalty, male_agentType         198
nobleman, named_agentType       196
writing letter_type             189
famous man_agentType            178
lover, male, named_agentType    168
sovereign, male_agentType       166
Name: variable, dtype: int64