In [None]:
!pip install gensim

In [156]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.19.5-py2.py3-none-any.whl (51kB)
[K    100% |████████████████████████████████| 61kB 1.9MB/s ta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.19.5


In [130]:
import os

In [157]:
import nltk
from nltk import word_tokenize
import gensim
import pandas
import pandas as pd
from tqdm import tqdm

In [202]:
# constructing the tree: string to sequence of its elements
def split(tree_str):
    """Tokenize a bracketed tree string into a flat list of elements.

    The string is cut on whitespace; every chunk is then broken into its
    leading "(" characters, its label/word, and its trailing ")" characters,
    each bracket becoming a separate list element.

    Parameters
    ----------
    tree_str : str
        Bracketed tree, e.g. ``"(S (NP dog) (VP runs))"``.

    Returns
    -------
    list of str
        Elements in reading order, e.g.
        ``["(", "S", "(", "NP", "dog", ")", "(", "VP", "runs", ")", ")"]``.
        A chunk made only of brackets contributes just those brackets, and a
        chunk without any bracket is kept as a single element.
    """
    chunk_elements = []
    for chunk in tree_str.split():
        # Peel the brackets off both ends; only leading "(" and trailing ")"
        # are structural, anything in between belongs to the label itself.
        without_open = chunk.lstrip("(")
        label = without_open.rstrip(")")
        chunk_elements.extend("(" * (len(chunk) - len(without_open)))
        if label:
            chunk_elements.append(label)
        chunk_elements.extend(")" * (len(without_open) - len(label)))
    return chunk_elements

In [203]:
# generating constituents
def constituents(tree_str):
    """Collect the labelled spans found in a bracketed parse tree.

    The tree is tokenized with ``split`` and scanned once from left to right.
    Each closing bracket produces a tuple
    ``(label, first_terminal, last_terminal, depth)`` where the terminal
    positions are 1-based and ``depth`` is the number of brackets still open
    after the node has been closed.

    Not reported:
      * nodes labelled ``-NONE-`` (their terminals are not counted either),
      * nodes whose subtree contains no counted terminal at all,
      * nodes labelled ``ROOT`` or ``TOP``.

    Labels that do not start with "-" are cut at their first "-"
    (``NP-SBJ`` becomes ``NP``); labels starting with "-" are kept whole.

    Returns a set of such tuples.
    """
    tokens = split(tree_str)
    token_count = len(tokens)

    open_nodes = []      # stack of (label, position of the first terminal the node may cover)
    spans = set()
    terminals_seen = 0   # number of counted terminals so far
    trace_nesting = 0    # number of enclosing -NONE- subtrees
    level = 0            # number of currently open brackets
    position = 0

    while position < token_count:
        current = tokens[position]

        if current == "(":
            # The element right after an opening bracket is the node label.
            label = tokens[position + 1]
            level += 1
            if label == "-NONE-":
                trace_nesting += 1
            elif not label.startswith("-"):
                # keep only the first part of a composite label
                label = label.split("-")[0]
            open_nodes.append((label, terminals_seen + 1))
            # step over both the bracket and the label
            position += 2
            continue

        position += 1

        if current != ")":
            # A terminal: it only counts outside of -NONE- subtrees.
            if trace_nesting == 0:
                terminals_seen += 1
            continue

        label, first_terminal = open_nodes.pop()
        level -= 1
        if label == "-NONE-":
            # leaving a -NONE- subtree
            trace_nesting -= 1
            continue
        if first_terminal > terminals_seen:
            # Every terminal below this node was ignored, so the subtree held
            # nothing but traces; there is no span to report.
            continue
        if label != "ROOT" and label != "TOP":
            spans.add((label, first_terminal, terminals_seen, level))

    return spans

In [276]:
def extract_features(file_name, word2vec_model, subfolder):
    """Build the per-token feature table of one treebank document and save it as CSV.

    Pipeline:
      1. read ``<file_name>.edus`` (one EDU per line) from the treebank
         ``subfolder`` and tokenize every EDU with NLTK;
      2. glue consecutive EDUs into sentences, remembering the index of the
         last token of each EDU inside its sentence;
      3. POS-tag the sentences with NLTK;
      4. write the sentences to ``edu_segmentation/<file_name>.text``, run the
         Stanford ``lexparser.sh`` script on it and read the trees back from
         ``edu_segmentation/<file_name>.penn``;
      5. for every token position derive the widest constituent starting there
         and the widest constituent ending there (label and depth);
      6. append the word embedding of the token (zeros if unknown);
      7. write everything to ``edu_segmentation/<file_name>.csv``.

    Side effects: sets ``os.environ["FILE_NAME"]``, runs a shell command and
    creates the ``.text``, ``.penn`` and ``.csv`` files named above.

    NOTE(review): rows are matched to parse trees purely by sentence and token
    position. This assumes the parser emits one tree per line of the ``.text``
    file and keeps the NLTK tokenization -- confirm against the parser setup.

    Parameters
    ----------
    file_name : str
        Document name without the ``.edus`` extension.
    word2vec_model
        Embedding lookup supporting ``word in model`` and
        ``model.get_vector(word)`` (e.g. a gensim ``KeyedVectors``). Its
        ``vector_size`` attribute sets the number of embedding columns; 300 is
        assumed when the attribute is missing.
    subfolder : str
        Sub-directory of ``RSTtrees-WSJ-main-1.0/`` that holds the document.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``file_name``, ``sentence_no``,
        ``token_no``, ``word``, ``edu_break``, ``POS-tag``, ``top_tag_beg``,
        ``top_tag_end``, ``depth_beg``, ``depth_end`` and ``c_000`` ...
    """
    # Reading the EDU file
    edus_path = os.path.join("../nlp_project/rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/" + subfolder,
                             file_name) + ".edus"
    with open(edus_path) as rst_file:
        lines = rst_file.readlines()

    # tokenizing the EDUs; literal round braces are renamed, presumably so they
    # cannot be mistaken for tree brackets once the text has been parsed
    brace_names = {"(": "OPENING_ROUND_BRACE", ")": "CLOSING_ROUND_BRACE"}
    tokenized_edus = [[brace_names.get(token, token) for token in word_tokenize(edu)]
                      for edu in lines]

    # Remember boundary indices, combine EDUs of a sentence
    boundary_indices = []
    edu_boundary = []
    sentences = []
    sentence = []
    for edu in tokenized_edus:
        if not edu:
            # blank line in the .edus file: nothing to add
            continue
        sentence.extend(edu)
        edu_boundary.append(len(sentence) - 1)
        if _ends_sentence(edu):
            sentences.append(sentence)
            boundary_indices.append(edu_boundary)
            sentence = []
            edu_boundary = []
    if sentence:
        # The document ended without sentence-final punctuation; keep the
        # remaining EDUs as a last sentence instead of dropping them.
        sentences.append(sentence)
        boundary_indices.append(edu_boundary)

    # getting a POS-tag from nltk
    pos_sentences = [nltk.pos_tag(sentence) for sentence in sentences]

    # writing the text in a file for syntactic parsing
    with open("edu_segmentation/" + file_name + ".text", "w") as text_file:
        text_file.write("\n".join([" ".join(sentence) for sentence in sentences]))

    # The file name reaches the shell through the environment and is quoted
    # there, so it is never spliced into the command text itself.
    os.environ["FILE_NAME"] = file_name

    # performing syntactic parsing
    os.system("./stanford-parser-full-2017-06-09/lexparser.sh edu_segmentation/\"$FILE_NAME\".text > edu_segmentation/\"$FILE_NAME\".penn")

    with open("edu_segmentation/" + file_name + ".penn") as penn_file:
        trees = penn_file.read().split("\n\n")

    # For every tree: token position -> (constituents starting there, constituents ending there)
    one_word_constituents = []
    for tree in trees:
        tree_dict = {}
        for constituent in constituents(tree):
            tree_dict.setdefault(constituent[1], ([], []))[0].append(constituent)
            tree_dict.setdefault(constituent[2], ([], []))[1].append(constituent)
        one_word_constituents.append(tree_dict)

    # Top syntactic tag and its depth for every position. Both values are taken
    # from the SAME constituent: the widest one, and among equally wide ones
    # (unary chains) the one with the smallest depth.
    top_tags = []
    top_depths = []
    for tree_dict in one_word_constituents:
        tags_dict = {}
        depths_dict = {}
        # e.g. 1 --> ([("NP", 1, 6, 3), ("S", 1, 44, 1)], [("NNP", 1, 1, 4)])
        for position, (starting_here, ending_here) in tree_dict.items():
            top_begin = max(starting_here, key=lambda c: (c[2], -c[3]))
            top_end = min(ending_here, key=lambda c: (c[1], c[3]))
            tags_dict[position] = (top_begin[0], top_end[0])
            depths_dict[position] = (top_begin[3], top_end[3])
        top_tags.append(tags_dict)
        top_depths.append(depths_dict)

    vector_size = getattr(word2vec_model, "vector_size", 300)

    data = []
    for sentence_no, pos_sentence in enumerate(pos_sentences):
        # positions (0-based) of the last token of every EDU in this sentence
        sentence_boundaries = set(boundary_indices[sentence_no])
        for token_no, (word, POS_tag) in enumerate(pos_sentence):
            # is there an EDU break right after this token?
            edu_break = token_no in sentence_boundaries

            # constituent positions are counted starting from 1
            top_tag_beg, top_tag_end = top_tags[sentence_no][token_no + 1]
            depth_beg, depth_end = top_depths[sentence_no][token_no + 1]

            feature_vectors = [file_name, sentence_no, token_no, word, edu_break, POS_tag,
                               top_tag_beg, top_tag_end, depth_beg, depth_end]

            if word in word2vec_model:
                feature_vectors.extend(list(word2vec_model.get_vector(word)))
            else:
                # unknown word: all-zero embedding
                feature_vectors.extend([0] * vector_size)

            data.append(feature_vectors)

    df = pandas.DataFrame(data, columns=["file_name",
                                         "sentence_no",
                                         "token_no",
                                         "word",
                                         "edu_break",
                                         "POS-tag",
                                         "top_tag_beg",
                                         "top_tag_end",
                                         "depth_beg",
                                         "depth_end"] + ["c_%.3d" % i for i in range(vector_size)])

    df.to_csv("edu_segmentation/" + file_name + ".csv", index=False)

    return df


def _ends_sentence(edu_tokens):
    """Return True if the tokens end with ".", "!" or "?", optionally followed by closing quote tokens."""
    sentence_final = ("!", "?", ".")
    # NLTK's Treebank-style tokenizer usually renders a closing double quote
    # as the token ''; plain " and ' are accepted as well.
    closing_quotes = ("''", "'", "\"")
    last = len(edu_tokens) - 1
    while last >= 0 and edu_tokens[last] in closing_quotes:
        last -= 1
    return last >= 0 and edu_tokens[last] in sentence_final

In [None]:
# Read Google's pre-trained Word2Vec vectors (stored in the binary word2vec
# format) into the module-level `model` used by the feature extraction.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [257]:
def get_file_names(ls_result, target_ext):
    """Return the names in ``ls_result`` that end with ``target_ext``, extension removed.

    Parameters
    ----------
    ls_result : iterable of str
        File names, e.g. the result of ``os.listdir``.
    target_ext : str
        Extension to look for, including the dot (``".csv"``, ``".edus"``).
        Extensions of any length are supported.

    Returns
    -------
    list of str
        Matching names in their original order, each without ``target_ext``.
    """
    ext_len = len(target_ext)
    # Slice by the real extension length instead of a fixed four characters.
    return [fn[:len(fn) - ext_len] for fn in ls_result if fn.endswith(target_ext)]

In [260]:
def compute_features(subfolder, file_names):
    """Run ``extract_features`` on every name in ``file_names`` with a progress bar.

    The embeddings are taken from the module-level ``model``; ``subfolder`` is
    handed through to ``extract_features`` unchanged. Nothing is returned.
    """
    progress = tqdm(file_names, total=len(file_names))
    for current_name in progress:
        extract_features(current_name, model, subfolder)

In [None]:
def generate_csv(subfolder):
    """Extract features for every ``.edus`` document found in a treebank subfolder.

    Lists ``rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/<subfolder>``,
    keeps the files ending in ``.edus`` and passes their names (without that
    extension, in sorted order) to ``compute_features``.

    NOTE(review): ``extract_features`` reads the same documents through a path
    prefixed with ``../nlp_project/``; both paths only agree when the notebook
    runs inside ``nlp_project`` -- confirm the working directory.
    """
    ls_result = os.listdir("rst_discourse_treebank/data/RSTtrees-WSJ-main-1.0/" + subfolder)
    # extract_features appends ".edus" itself, so it needs the names without it.
    edus_ext = ".edus"
    file_names = sorted(fn[:-len(edus_ext)] for fn in ls_result if fn.endswith(edus_ext))
    compute_features(subfolder, file_names)

In [None]:
# Collect the per-document feature tables from edu_segmentation/ and stack
# them into one DataFrame.
ls_result = os.listdir("edu_segmentation/")
# os.listdir gives no ordering guarantee; sort so the row order is reproducible.
csv_file_names = sorted(get_file_names(ls_result, ".csv"))

dfs = [pd.read_csv("edu_segmentation/" + csv_name + ".csv") for csv_name in csv_file_names]

# ignore_index gives the combined frame one continuous index instead of
# repeating the 0..n-1 index of every single file.
df = pd.concat(dfs, ignore_index=True)

In [303]:
# One-hot encode the categorical feature columns, then store the resulting
# table as the final data set.
vectorized_table = pd.get_dummies(
    df,
    columns=[
        "token_no",
        "POS-tag",
        "top_tag_beg",
        "top_tag_end",
        "depth_beg",
        "depth_end",
    ],
)
vectorized_table.to_csv("data_set.csv", index=False)