In this part I prepare feature descriptions for each word in the text corpus. These extracted features will be used to build a model that predicts whether there is an EDU break between the elements of a 4-gram. Below are the features I’m extracting:

(a) The **part-of-speech tag**,

(b) The **top syntactic tag** of the largest syntactic constituent starting from or ending with a word,

(c) The **depth** of the largest syntactic constituent starting from or ending with a word,

(d) **Position of a word in a sentence**.

POS tags and top syntactic tags are one-hot encoded. The depth and the position are normalized by dividing them by the depth of the tree and the length of the sentence, respectively, and are then rescaled to the range [-1, 1].
extract_features

In [1]:
!pip install --upgrade pandas

Collecting pandas
  Downloading pandas-0.22.0-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (14.9MB)
[K    100% |████████████████████████████████| 14.9MB 77kB/s eta 0:00:01
[?25hCollecting numpy>=1.9.0 (from pandas)
  Downloading numpy-1.14.1-cp36-cp36m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (4.7MB)
[K    100% |████████████████████████████████| 4.7MB 244kB/s ta 0:00:011
[?25hCollecting pytz>=2011k (from pandas)
  Downloading pytz-2018.3-py2.py3-none-any.whl (509kB)
[K    100% |████████████████████████████████| 512kB 1.7MB/s ta 0:00:01
[?25hRequirement already up-to-date: python-dateutil>=2 in /Users/YK/anaconda3/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: six>=1.5 in /Users/YK/anaconda3/lib/python3.6/site-packages (from python-dateutil>=2->pandas)
Installing collected packages: numpy, pytz, pandas
  Found existing installation: nu

In [None]:
!pip install gensim

In [156]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.19.5-py2.py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.19.5


In [2]:
import os

In [3]:
import nltk
from nltk import word_tokenize
import gensim
import pandas as pd
from tqdm import tqdm

In [193]:
# constructing the tree: string to sequence of its elements
def split(tree_str):
    """Split a bracketed (Penn-style) tree string into a flat list of elements.

    Every opening and closing round bracket becomes its own "(" / ")" element;
    labels and words are kept as they are, in their original order.

    Parameters
    ----------
    tree_str : str
        Bracketed tree, e.g. "(NP (DT the) (NN cat))".

    Returns
    -------
    list of str
        E.g. ["(", "NP", "(", "DT", "the", ")", "(", "NN", "cat", ")", ")"].
    """
    chunk_elements = []
    for chunk in tree_str.split():
        if chunk[0] == "(":
            # "((NP" -> "(", "(", "NP"
            label = chunk.lstrip("(")
            chunk_elements.extend("(" * (len(chunk) - len(label)))
            # a lone "(" carries no label, so nothing else is emitted for it
            if label:
                chunk_elements.append(label)
        else:
            # "cat))" -> "cat", ")", ")"
            word = chunk.rstrip(")")
            # The word is emitted even when no bracket follows it, and a chunk
            # made only of closing brackets yields just the brackets.
            if word:
                chunk_elements.append(word)
            chunk_elements.extend(")" * (len(chunk) - len(word)))
    return chunk_elements

In [264]:
# generating constituents
def constituents(tree_str):
    """Collect the constituents of a bracketed parse tree.

    The tree string is flattened with ``split`` and scanned left to right.
    Terminals are numbered from 1; terminals inside a subtree labelled
    "-NONE-" (traces) are not counted.

    Returns
    -------
    set of tuple
        One ``(label, first_terminal, last_terminal, depth)`` tuple per
        constituent that covers at least one counted terminal. ``label`` is
        the part of the node label before the first "-" (labels that start
        with "-" are kept whole). ``depth`` is the number of brackets still
        open after the constituent has been closed. Nodes labelled "ROOT" or
        "TOP" and "-NONE-" subtrees are never reported.
    """
    tokens = split(tree_str)
    token_count = len(tokens)

    open_nodes = []      # stack of (label, number of the first terminal below the node)
    found = set()
    words_seen = 0       # counted terminals so far
    trace_level = 0      # > 0 while inside a -NONE- subtree
    level = 0            # current bracket nesting
    pos = 0

    while pos < token_count:
        token = tokens[pos]

        if token == "(":
            # the element right after an opening bracket is the node label
            label = tokens[pos + 1]
            level += 1
            if label == "-NONE-":
                trace_level += 1
            elif not label.startswith("-"):
                # keep only the first part of the label, e.g. "NP-SBJ" -> "NP"
                label = label.split("-")[0]
            open_nodes.append((label, words_seen + 1))
            # skip the bracket and its label
            pos += 2
            continue

        if token == ")":
            label, first_word = open_nodes.pop()
            level -= 1
            if label == "-NONE-":
                trace_level -= 1
            elif first_word <= words_seen and label not in ("ROOT", "TOP"):
                # first_word > words_seen would mean the subtree held nothing
                # but ignored terminals (traces), so such nodes are skipped
                found.add((label, first_word, words_seen, level))
        elif trace_level == 0:
            # an ordinary terminal outside of any -NONE- subtree
            words_seen += 1

        pos += 1

    return found

#### Here is the main function:

In [235]:
# Tokens that can close a sentence, and quote tokens that may follow them.
_SENTENCE_END_TOKENS = frozenset(["!", "?", "."])
_CLOSING_QUOTE_TOKENS = frozenset(["''", "'", "\""])
# Round brackets are replaced so they cannot be confused with tree brackets.
_BRACKET_PLACEHOLDERS = {"(": "OPENING_ROUND_BRACE", ")": "CLOSING_ROUND_BRACE"}


def _ends_sentence(edu):
    """Return True if the token list ends with ".", "!" or "?", optionally followed by closing quotes."""
    last = len(edu) - 1
    while last >= 0 and edu[last] in _CLOSING_QUOTE_TOKENS:
        last -= 1
    return last >= 0 and edu[last] in _SENTENCE_END_TOKENS


def _group_edus_into_sentences(tokenized_edus):
    """Concatenate tokenized EDUs into sentences and record, per sentence, the index of each EDU's last token."""
    sentences, boundary_indices = [], []
    sentence, edu_boundary = [], []
    for edu in tokenized_edus:
        sentence.extend(edu)
        edu_boundary.append(len(sentence) - 1)
        if _ends_sentence(edu):
            sentences.append(sentence)
            boundary_indices.append(edu_boundary)
            sentence, edu_boundary = [], []
    # keep a trailing sentence that has no closing punctuation
    if sentence:
        sentences.append(sentence)
        boundary_indices.append(edu_boundary)
    return sentences, boundary_indices


def _largest_constituents(tree):
    """Map each terminal number to the (label, depth) of the largest constituent starting / ending there.

    When several constituents cover the same span (unary chains), the one with
    the smallest depth is chosen, so the label and the depth always describe
    the same node.
    """
    begins, ends = {}, {}
    for label, first, last, depth in constituents(tree):
        begin_key = (last, -depth)    # maximise: furthest end, then smallest depth
        if first not in begins or begin_key > begins[first][0]:
            begins[first] = (begin_key, label, depth)
        end_key = (first, depth)      # minimise: earliest start, then smallest depth
        if last not in ends or end_key < ends[last][0]:
            ends[last] = (end_key, label, depth)
    return ({k: (v[1], v[2]) for k, v in begins.items()},
            {k: (v[1], v[2]) for k, v in ends.items()})


def extract_features(file_name, word2vec_model, subfolder):
    """Build the per-token feature table for one RST file and save it as CSV.

    Steps: read the ``.edus`` file (one EDU per line), tokenize it, join the
    EDUs into sentences, POS-tag them with nltk, parse them with the Stanford
    lexparser script, and combine POS tag, top syntactic tags, depths, the EDU
    break flag and the word vector of every token into one row.

    Parameters
    ----------
    file_name : str
        Base name of the file, without the ".edus" extension.
    word2vec_model
        Word-vector model supporting ``word in model``, ``model[word]`` and
        ``model.vector_size`` (e.g. gensim ``KeyedVectors``).
    subfolder : str
        Subfolder of the treebank directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        One row per token. The same table is written to
        ``edu_segmentation/<file_name>.csv``.

    Raises
    ------
    RuntimeError
        If the parser command exits with a non-zero status.

    Notes
    -----
    Side effects: writes ``.text``, ``.penn`` and ``.csv`` files into
    ``edu_segmentation/`` and sets the ``FILE_NAME`` environment variable.
    """
    # Reading the EDU file
    edus_path = os.path.join("../nlp_project/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/" + subfolder,
                             file_name) + ".edus"
    with open(edus_path) as rst_file:
        lines = rst_file.readlines()

    # tokenizing the EDUs (lines that produce no tokens are skipped)
    tokenized_edus = []
    for edu in lines:
        tokens = [_BRACKET_PLACEHOLDERS.get(token, token) for token in word_tokenize(edu)]
        if tokens:
            tokenized_edus.append(tokens)

    # Remember boundary indices, combine EDUs of a sentence
    sentences, boundary_indices = _group_edus_into_sentences(tokenized_edus)

    # getting a POS-tag from nltk
    pos_sentences = [nltk.pos_tag(sentence) for sentence in sentences]

    # writing the text in a file for syntactic parsing
    with open("edu_segmentation/" + file_name + ".text", "w") as text_file:
        text_file.write("\n".join(" ".join(sentence) for sentence in sentences))

    # the file name is passed through the environment so the shell quotes it
    os.environ["FILE_NAME"] = file_name

    # performing syntactic parsing
    status = os.system("./stanford-parser-full-2017-06-09/lexparser.sh edu_segmentation/\"$FILE_NAME\".text > edu_segmentation/\"$FILE_NAME\".penn")
    if status != 0:
        raise RuntimeError("syntactic parsing of %s failed (exit status %s)" % (file_name, status))

    with open("edu_segmentation/" + file_name + ".penn") as penn_file:
        trees = penn_file.read().split("\n\n")

    # NOTE(review): tree i is assumed to belong to sentence i and to contain
    # exactly the tokens written to the .text file; a parser that re-tokenizes
    # or re-splits the text leads to a KeyError / IndexError below - confirm
    # the parser configuration.
    largest = [_largest_constituents(tree) for tree in trees]

    vector_size = word2vec_model.vector_size
    data = []
    for sentence_no, pos_sentence in enumerate(pos_sentences):
        begins, ends = largest[sentence_no]
        # indices of the tokens that are the last token of an EDU
        break_positions = set(boundary_indices[sentence_no])

        for token_no, (word, pos_tag) in enumerate(pos_sentence):
            # constituents number the terminals starting from 1
            top_tag_beg, depth_beg = begins[token_no + 1]
            top_tag_end, depth_end = ends[token_no + 1]

            feature_vectors = [file_name,
                               sentence_no,
                               token_no,
                               word,
                               token_no in break_positions,
                               pos_tag,
                               top_tag_beg,
                               top_tag_end,
                               depth_beg,
                               depth_end]

            # words unknown to the model get an all-zero vector
            if word in word2vec_model:
                feature_vectors.extend(word2vec_model[word])
            else:
                feature_vectors.extend([0] * vector_size)

            data.append(feature_vectors)

    df = pd.DataFrame(data, columns=["file_name",
                                     "sentence_no",
                                     "token_no",
                                     "word",
                                     "edu_break",
                                     "POS-tag",
                                     "top_tag_beg",
                                     "top_tag_end",
                                     "depth_beg",
                                     "depth_end"] + ["c_%.3d" % i for i in range(vector_size)])

    df.to_csv("edu_segmentation/" + file_name + ".csv", index=False)

    return df

In [237]:
# Load Google's pre-trained Word2Vec vectors from the binary word2vec file.
# The result is kept in the notebook-level name `model`, which
# compute_features hands to extract_features.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [222]:
# Getting a list of RST files
def get_file_names(ls_result, target_ext):
    """Return the names, with the extension removed, of all entries whose extension is ``target_ext``.

    ``ls_result`` is an iterable of file names (e.g. the result of
    ``os.listdir``); ``target_ext`` includes the leading dot, e.g. ".rst".
    The order of ``ls_result`` is preserved.
    """
    split_names = (os.path.splitext(entry) for entry in ls_result)
    return [stem for stem, extension in split_names if extension == target_ext]

In [260]:
# computing features in each file
def compute_features(subfolder, file_names, word2vec_model=None):
    """Run ``extract_features`` for every file name, showing a progress bar.

    Parameters
    ----------
    subfolder : str
        Treebank subfolder the files live in (passed on to ``extract_features``).
    file_names : list of str
        Base names of the files to process.
    word2vec_model : optional
        Word-vector model passed on to ``extract_features``. When omitted, the
        notebook-level ``model`` is used, so existing calls keep working.
    """
    if word2vec_model is None:
        # fall back to the embeddings loaded in the notebook
        word2vec_model = model
    for file_name in tqdm(file_names):
        extract_features(file_name, word2vec_model, subfolder)

In [14]:
# creating a csv file for saving results
def generate_csv(subfolder, target_ext=".edus"):
    """Compute and save the features of every file in a treebank subfolder.

    Parameters
    ----------
    subfolder : str
        Subfolder of the treebank directory, e.g. "TRAINING".
    target_ext : str, optional
        Extension used to select the files. It defaults to ".edus" because
        that is the file ``extract_features`` opens for each base name.
    """
    ls_result = os.listdir("rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/" + subfolder)
    # os.listdir returns entries in arbitrary order; sort for reproducible runs
    file_names = get_file_names(sorted(ls_result), target_ext)
    compute_features(subfolder, file_names)

## Example of generating features for one file

In [265]:
# Extract the features of a single training file; the resulting table is
# written to edu_segmentation/wsj_0603.out.csv.
compute_features("TRAINING", ["wsj_0603.out"])

100%|██████████| 1/1 [00:06<00:00,  6.02s/it]


In [267]:
# columns starting with c_ are the components of a word vector
pd.read_csv("edu_segmentation/wsj_0603.out.csv").head()

Unnamed: 0,file_name,sentence_no,token_no,word,edu_break,POS-tag,top_tag_beg,top_tag_end,depth_beg,depth_end,...,c_290,c_291,c_292,c_293,c_294,c_295,c_296,c_297,c_298,c_299
0,wsj_0603.out,0,0,THE,False,DT,S,DT,1,5,...,-0.012695,-0.141602,-0.118164,0.081543,-0.279297,-0.519531,-0.099609,-0.032715,0.148438,0.012573
1,wsj_0603.out,0,1,FINANCIAL,False,NNP,NNP,NNP,5,5,...,-0.064941,-0.056152,-0.110352,-0.079102,-0.067871,-0.068359,-0.265625,0.19043,0.071289,0.349609
2,wsj_0603.out,0,2,ACCOUNTING,False,NNP,NNP,NNP,5,5,...,-0.072266,-0.070312,-0.1875,0.072754,-0.230469,-0.172852,-0.244141,0.176758,-0.059082,-0.05249
3,wsj_0603.out,0,3,STANDARDS,False,NNP,NNPS,NNPS,5,5,...,-0.076172,-0.25,-0.165039,-0.05957,-0.316406,-0.291016,-0.125,0.167969,0.150391,-0.095703
4,wsj_0603.out,0,4,BOARD,False,NNP,NNP,NNP,5,5,...,-0.150391,-0.172852,0.100586,0.208008,-0.149414,-0.375,-0.330078,0.063965,0.071289,-0.097168


In [None]:
import os
import pandas as pd

In [168]:
def add_rel_feature(vectorized_table, col_name):
    """Add a per-sentence relative version of ``col_name`` to the table, in place.

    Two columns are added to ``vectorized_table``:

    * ``sent_max_<col_name>`` - the maximum of ``col_name`` within each
      (file_name, sentence_no) group;
    * ``rel_<col_name>`` - ``col_name`` divided by that maximum and mapped
      linearly so that 0 becomes -1 and the maximum becomes 1.

    If the maximum of a sentence is 0 (for example a one-token sentence), the
    division would be 0/0 and produce NaN; such sentences are divided by 1
    instead.

    Returns nothing; the table passed in is modified.
    """
    max_col = "sent_max_" + col_name
    sent_max = vectorized_table.groupby(["file_name", "sentence_no"])[col_name].transform("max")
    vectorized_table[max_col] = sent_max

    # avoid dividing by zero when every value of the sentence is 0
    denominator = sent_max.where(sent_max != 0, 1)
    vectorized_table["rel_" + col_name] = (vectorized_table[col_name] / denominator - 0.5) * 2

In [238]:
def create_table(path, file_names, ohe=True, sent_start_no=0):
    """Combine the per-file feature CSVs into one table of model features.

    Parameters
    ----------
    path : str
        Directory containing the ``<file>.csv`` files.
    file_names : list of str
        Base names of the files to read (".csv" is appended).
    ohe : bool, optional
        If True, "POS-tag", "top_tag_beg" and "top_tag_end" are one-hot
        encoded; otherwise they are kept as they are.
    sent_start_no : int, optional
        First value of the global sentence numbering.

    Returns
    -------
    pandas.DataFrame
        Table with "word" as first column, a "sent_word_indices" column of the
        form "<global sentence number>_<token number>" and the relative
        features "rel_depth_beg", "rel_depth_end" and "rel_token_no". The
        word-vector columns ("c_...") and the intermediate columns, including
        "edu_break", are not part of the result.
    """
    dfs = []
    for file in file_names:
        df = pd.read_csv(os.path.join(path, file + ".csv"))  # for safety if there's a / at the end of the path
        # The word-vector components are not part of the result, so they are
        # dropped right away instead of being carried through every step.
        df = df.drop([c for c in df.columns if c.split("_", 1)[0] == "c"], axis=1)
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)

    if ohe:
        # transforming into vector form
        vectorized_table = pd.get_dummies(df,
                                          columns=["POS-tag",
                                                   "top_tag_beg",
                                                   "top_tag_end"])
    else:
        vectorized_table = df

    # Global sentence numbers: sorting by both keys makes the numbering
    # deterministic and keeps the sentences of a file in their order.
    sent_global_no_df = (vectorized_table[["file_name", "sentence_no"]]
                         .drop_duplicates()
                         .sort_values(by=["file_name", "sentence_no"])
                         .reset_index(drop=True))
    sent_global_no_df["sentence_global_no"] = list(range(sent_start_no,
                                                         sent_start_no + len(sent_global_no_df)))

    vectorized_table = pd.merge(vectorized_table,
                                sent_global_no_df,
                                on=["file_name", "sentence_no"])

    # adding a column with sentence index + word index
    vectorized_table["sent_word_indices"] = (vectorized_table["sentence_global_no"].astype(str)
                                             + "_"
                                             + vectorized_table["token_no"].astype(str))

    # "word" goes first; copy so that the assignments below act on an independent frame
    vectorized_table = vectorized_table[["word"] + [c for c in vectorized_table.columns if c != "word"]].copy()

    vectorized_table["depth_beg"] = vectorized_table["depth_beg"] - 1
    vectorized_table["depth_end"] = vectorized_table["depth_end"] - 1

    # adding relative features to the table
    add_rel_feature(vectorized_table, "depth_beg")
    add_rel_feature(vectorized_table, "depth_end")
    add_rel_feature(vectorized_table, "token_no")

    vectorized_table = vectorized_table.drop(["sentence_no",
                                              "sentence_global_no",
                                              "depth_beg",
                                              "depth_end",
                                              "token_no",
                                              "edu_break",
                                              "sent_max_depth_beg",
                                              "sent_max_depth_end",
                                              "sent_max_token_no"], axis=1)

    return vectorized_table

In [65]:
# Base names (extension removed) of the ".rst" files found in the TRAINING and
# TEST folders of the treebank.
# NOTE(review): os.listdir returns entries in arbitrary order, and the ".rst"
# extension differs from the ".edus" files read by extract_features - confirm
# that both refer to the same set of documents.
train_ls_result = os.listdir("rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TRAINING/")
train_file_names = get_file_names(train_ls_result, ".rst")
test_ls_result = os.listdir("rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/TEST/")
test_file_names = get_file_names(test_ls_result, ".rst")

In [76]:
# Lookup table that flags every file as training (True) or test (False).
# It allows training and test files to be processed in one table (so that both
# end up with the same columns) and to be separated again afterwards.
file_name_df = pd.DataFrame({"file_name": train_file_names + test_file_names,
                             "is_train": [True]*len(train_file_names) + [False]*len(test_file_names)})

In [246]:
# Data set 1: training and test files are processed in one table and then
# divided into a training and a test part with the is_train flag.

vectorized_table = create_table("edu_segmentation/", train_file_names + test_file_names)

vectorized_table__split = vectorized_table.merge(file_name_df, on="file_name")

train_data_set = vectorized_table__split[vectorized_table__split["is_train"]].drop(columns="is_train")
test_data_set = vectorized_table__split[~vectorized_table__split["is_train"]].drop(columns="is_train")

# 1 - when test and train files are processed together
train_data_set.to_csv("data_set_1__train.csv", index=False)
test_data_set.to_csv("data_set_1__test.csv", index=False)

In [247]:
# Data set 2: training and test tables are built separately (without one-hot
# encoding), concatenated, encoded together and divided again.
train_table = create_table("edu_segmentation/", train_file_names, ohe=False)

# the test sentences continue the numbering right after the last training sentence
test_table = create_table(
    "edu_segmentation/",
    test_file_names,
    ohe=False,
    sent_start_no=max(int(sw.split("_")[0]) for sw in train_table.sent_word_indices) + 1,
)

train_table["is_train"] = True
test_table["is_train"] = False

table = pd.concat([train_table, test_table])

vectorized_table_2 = pd.get_dummies(table, columns=["POS-tag", "top_tag_beg", "top_tag_end"])

# identifying columns first, relative features last, everything else in between
id_columns = ["sent_word_indices", "word", "file_name"]
rel_columns = ["rel_depth_beg", "rel_depth_end", "rel_token_no"]
other_columns = [c for c in vectorized_table_2.columns
                 if c not in id_columns and c not in rel_columns]
vectorized_table_2 = vectorized_table_2[id_columns + other_columns + rel_columns]

train_data_set_2 = vectorized_table_2[vectorized_table_2["is_train"]].drop(columns="is_train")
test_data_set_2 = vectorized_table_2[~vectorized_table_2["is_train"]].drop(columns="is_train")

train_data_set_2.to_csv("data_set_2__train.csv", index=False)
test_data_set_2.to_csv("data_set_2__test.csv", index=False)

In [250]:
# Preview of the saved test set, indexed by "<sentence number>_<token number>".
pd.read_csv("data_set_2__test.csv").set_index("sent_word_indices").head(39)

Unnamed: 0_level_0,word,file_name,POS-tag_#,POS-tag_$,POS-tag_'',POS-tag_(,POS-tag_),"POS-tag_,",POS-tag_.,POS-tag_:,...,top_tag_end_WHNP,top_tag_end_WHPP,top_tag_end_WP,top_tag_end_WP$,top_tag_end_WRB,top_tag_end_X,top_tag_end_``,rel_depth_beg,rel_depth_end,rel_token_no
sent_word_indices,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7103_0,Friday,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1.0,-0.666667,-1.0
7103_1,",",wsj_0602.out,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,-0.666667,-0.666667,-0.928571
7103_2,October,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.666667,0.0,-0.857143
7103_3,27,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-0.785714
7103_4,",",wsj_0602.out,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.0,0.0,-0.714286
7103_5,1989,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,-0.333333,-0.642857
7103_6,The,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-0.333333,0.333333,-0.571429
7103_7,key,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.333333,0.333333,-0.5
7103_8,U.S.,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.333333,0.666667,-0.428571
7103_9,and,wsj_0602.out,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.666667,0.666667,-0.357143
