In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers, CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report
import pandas as pd
import numpy as np
import imp, sys, os
sys.path.append('./src/')
import feature_utils, defines
import seaborn as sns
import glob

In [3]:
# Sentence-vector dimensionality: handed to the vector/similarity helpers in
# this notebook and embedded in the names of the CSV files they read and write.
vec_dim = 300

In [4]:
def get_and_save_doc_vectors(doc_idx,dim):
    """Compute and store the sentence vectors of one document.

    Reads ``<cwd>/<defines.PATH_TO_DFS>/<doc_idx:02d>_sent_db.csv``, passes the
    table to ``feature_utils.get_vector_per_sentence`` together with ``dim``
    and writes the result (without the index column) to
    ``<doc_idx:02d>_sent_vec<dim>_db.csv`` in the same directory.

    Parameters
    ----------
    doc_idx : int
        Numeric document index; zero-padded to two digits in the file names.
    dim : int
        Vector dimensionality requested from ``feature_utils`` and recorded in
        the output file name.

    Returns
    -------
    None
        The function works through its side effects (CSV file + log line).
    """
    dfs_dir = os.path.join(os.getcwd(), defines.PATH_TO_DFS)
    sent_db = pd.read_csv(os.path.join(dfs_dir, "{:02d}_sent_db.csv".format(doc_idx)))
    # Use the `dim` argument (not the notebook-level `vec_dim`) so the stored
    # vectors always match the dimension advertised in the file name.
    sent_vec_db = feature_utils.get_vector_per_sentence(sent_db, dim)
    sent_vec_db.to_csv(
        os.path.join(dfs_dir, "{:02d}_sent_vec{}_db.csv".format(doc_idx, dim)),
        index=False,
    )
    print("{} doc sent saved".format(doc_idx))
    
# Vectorise every document that has a "<prefix>_sent_db.csv" table; the numeric
# document index is the file-name prefix before the first underscore.
doc_db_list = glob.glob(os.path.join(os.getcwd(), defines.PATH_TO_DFS, "*_sent_db.csv"))
for doc in doc_db_list:
    doc_prefix = os.path.basename(doc).split("_")[0]
    get_and_save_doc_vectors(int(doc_prefix), vec_dim)
    
                       

2 doc sent saved
4 doc sent saved
6 doc sent saved
5 doc sent saved
7 doc sent saved
8 doc sent saved
1 doc sent saved
3 doc sent saved


## See average length of narrative/non-narrative

In [5]:
# Load every per-document sentence table and stack them into a single frame.
# Each file keeps its own row index, so index values repeat across documents.
sent_db = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in glob.glob(os.path.join('dataframes', "*_sent_db.csv"))
    ]
)

In [6]:
# Print the column names, dtypes and non-null counts of the combined frame.
sent_db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3052 entries, 0 to 151
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   text               3052 non-null   object 
 1   sent_idx_in_block  3052 non-null   float64
 2   block_idx          3052 non-null   float64
 3   is_nar             3052 non-null   float64
 4   doc_idx            3052 non-null   float64
 5   par_db_idx         3052 non-null   float64
 6   par_idx_in_doc     3052 non-null   float64
 7   par_type           3052 non-null   object 
 8   block_type         3052 non-null   object 
 9   nar_idx            3052 non-null   float64
 10  sent_len           3052 non-null   float64
 11  sent_idx_in_nar    1038 non-null   float64
 12  nar_len_in_sent    1038 non-null   float64
 13  sent_idx_out_nar   2014 non-null   float64
 14  fist_sent_in_nar   3052 non-null   bool   
 15  last_sent_in_nar   3052 non-null   bool   
 16  is_client          3052 n

### Calculate similarity between all sentences within the same document

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def get_and_save_doc_similarity(doc_idx,vec_dim):
    """Compute and store the pairwise similarity table of one document.

    Loads ``<doc_idx:02d>_sent_vec<vec_dim>_db.csv`` from
    ``<cwd>/<defines.PATH_TO_DFS>``, feeds the whole table to
    ``cosine_similarity`` and writes the resulting matrix (without the index
    column) to ``<doc_idx:02d>_sent_sim_vec<vec_dim>_db.csv`` in the same
    directory, then prints a confirmation line.
    """
    dfs_dir = os.path.join(os.getcwd(), defines.PATH_TO_DFS)
    vectors_path = os.path.join(dfs_dir, "{:02d}_sent_vec{}_db.csv".format(doc_idx, vec_dim))
    similarity_path = os.path.join(dfs_dir, "{:02d}_sent_sim_vec{}_db.csv".format(doc_idx, vec_dim))

    vectors = pd.read_csv(vectors_path)
    similarities = pd.DataFrame(cosine_similarity(vectors))
    similarities.to_csv(similarity_path, index=False)

    print("{} sim_db sent saved".format(doc_idx))

In [15]:
# Build the similarity table for every document that has a
# "<prefix>_sent_db.csv" file; the numeric document index is the file-name
# prefix before the first underscore.
doc_db_list = glob.glob(os.path.join(os.getcwd(), defines.PATH_TO_DFS, "*_sent_db.csv"))
for doc in doc_db_list:
    doc_prefix = os.path.basename(doc).split("_")[0]
    get_and_save_doc_similarity(int(doc_prefix), vec_dim)

2 sim_db sent saved
4 sim_db sent saved
6 sim_db sent saved
5 sim_db sent saved
7 sim_db sent saved
8 sim_db sent saved
1 sim_db sent saved
3 sim_db sent saved


In [143]:
# Feature set
def sent2features(sent_idx):
    """Build the feature dict for the sentence at ``sent_idx``.

    Every component of the vector returned by ``get_sent_vec(sent_idx)``
    becomes one feature keyed ``v0``, ``v1``, ...  Two positional flags are
    added on top:

    * ``BOS`` (beginning of sequence) when ``sent_idx % seq_len == 0``
    * ``EOS`` (end of sequence) when ``sent_idx % seq_len == seq_len - 1``

    NOTE(review): the modulo test presumes sequences start at multiples of
    ``seq_len`` -- confirm against how the sequence start indices are chosen.
    """
    features = {
        "v{}".format(pos): component
        for pos, component in enumerate(get_sent_vec(sent_idx))
    }

    offset_in_seq = sent_idx % seq_len
    if offset_in_seq == 0:
        features['BOS'] = True
    if offset_in_seq == seq_len - 1:
        features['EOS'] = True

    # TODO: not implemented yet -- an 'OS' (outside narrative) flag and the
    # '-1:one_before_is_nar' / '-2:two_before_is_nar' context features.
    return features

In [186]:
def sent2labels(sent_idx):
    """Return ``"nar"`` when ``get_sent_label(sent_idx)`` equals 1, otherwise ``"not_nar"``."""
    if get_sent_label(sent_idx) == 1:
        return "nar"
    return "not_nar"

In [193]:
def seq2features(seq_idx):
    """Return the feature dicts of one sequence, one dict per sentence.

    The sequence covers sentence indices ``seq_idx`` .. ``seq_idx + seq_len - 1``.
    Each element is the dict produced by ``sent2features`` for that sentence,
    i.e. the list-of-dicts layout a CRF expects for a single sequence.
    """
    # One dict per sentence. The earlier `list.extend(dict)` form added only the
    # dict's KEYS ("v0", "v1", ...) and silently dropped every feature value.
    return [sent2features(s) for s in range(seq_idx, seq_idx + seq_len)]

def seq2labels(seq_idx):
    """Return the labels of one sequence, one label per sentence.

    The result is aligned index-for-index with ``seq2features(seq_idx)``: both
    lists have exactly ``seq_len`` entries.
    """
    # Exactly one label per sentence so that len(labels) == len(features);
    # no per-vector-component repetition is needed once features are dicts.
    return [sent2labels(s) for s in range(seq_idx, seq_idx + seq_len)]

In [194]:
# Build the model inputs: one feature sequence and one label sequence per
# starting index in `indices_for_sequence`.
X = list(map(seq2features, indices_for_sequence))
y = list(map(seq2labels, indices_for_sequence))

In [196]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

# Hold out 33% of the sequences for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)