In [12]:
import pandas as pd
from pathlib import Path
import os
import re
import pickle
import unidecode
from collections import Counter

# Root folder that holds the pickled sentence ids and every input sub-folder.
# The original author's location stays the default; set the ATM_SYSTEM_PATH
# environment variable to run the notebook from another machine or folder.
PATH = os.environ.get('ATM_SYSTEM_PATH') or 'C:\\Users\\sanne\\Documents\\Master\\Applied Text Mining\\System\\'
if not PATH.endswith(('\\', '/')):
    PATH += '\\'  # the code below builds locations with plain `PATH + name`

# Sub-folders of PATH; each one is handed to load_files() from main().
VADER = 'Sent_polarity3\\'                 # loaded as vader_data
ALLEN = 'conll-allen\\conll-allen-nlp\\'   # tab-separated files, loaded as conll_allen
OPINION = 'Opinions\\output\\'             # loaded as opinion_data

In [13]:
def get_relevant_sent_ids():
    '''Load the pickled mapping of relevant sentence ids and normalise its keys.

    The pickle at PATH + "all_relevant_sent_ids.pickle" is the saved output of a
    script that extracts relevant frames using FrameNet from the sentences
    provided in the NAF files.

    Every '&' and every apostrophe in a key is replaced by '_' (necessary for
    the grouping function). The renamed keys are written into a NEW dict:
    popping and inserting keys while iterating over the same dict, as was done
    before, changes the dict during iteration and can raise RuntimeError or
    visit keys unpredictably.

    Returns a dict with the normalised keys and the untouched values.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(PATH + "all_relevant_sent_ids.pickle", "rb") as pickle_file:
        loaded_sent_ids = pickle.load(pickle_file)
    return {
        key.replace('&', '_').replace('\'', '_'): sent_ids
        for key, sent_ids in loaded_sent_ids.items()
    }

def change_format_for_grouping(data):
    '''Renumber the rows of `data` in place and expose the numbering as a column.

    After the call the frame has a fresh 0..n-1 index named 'ind' and a new
    leading column 'index' holding the same numbers. The frame is modified in
    place and also returned.
    '''
    # Throw the old index away so the rows are numbered 0..n-1 ...
    data.reset_index(inplace=True, drop=True)
    # ... then copy that numbering into a leading 'index' column.
    data.reset_index(inplace=True, drop=False)
    named_index = data.index.set_names(['ind'])
    data.index = named_index
    return data

In [48]:
def _read_csv_skipping_bad_lines(path_to_file, delimiter):
    '''Read one delimited file, skipping malformed rows, on old and new pandas.'''
    # 'ANSI' is only resolvable as a codec on Windows; kept as in the original data setup.
    try:
        # Spelling used by pandas >= 1.3; `error_bad_lines` no longer exists in pandas 2.x.
        return pd.read_csv(path_to_file, encoding = 'ANSI', on_bad_lines='skip', sep=delimiter)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
        return pd.read_csv(path_to_file, encoding = 'ANSI', error_bad_lines=False, sep=delimiter)


def load_files(foldername, delimiter= ',', make_dict = 'no'): 
    '''Loads every CSV file directly inside PATH + foldername into one dataframe.

    Each file's rows get a 'file' column holding the file name up to its first
    '.', with a trailing '-opinions' removed. Sub-directories are ignored and
    malformed rows are skipped. Files are read in sorted path order so repeated
    runs produce the same row order.

    Keyword arguments:
    delimiter -- column separator passed to pandas (default ',')
    make_dict -- default 'no'. If 'yes', a dictionary with the identifiers as
                 keys and the words as values is built per file and stored
                 under the file name in all_identifiers_dict.

    Returns the dataframe of all files (each file keeps its own row index) and
    the dictionary of all files (empty unless make_dict == 'yes'). An empty
    folder yields an empty dataframe.'''
    basepath = Path(PATH + foldername)
    frames = []   # collected per file and concatenated once (concat in a loop is quadratic)
    all_identifiers_dict = dict()
    for path_to_file in sorted(basepath.iterdir()):
        if not path_to_file.is_file():  # skip sub-directories
            continue
        data = _read_csv_skipping_bad_lines(path_to_file, delimiter)
        filename = path_to_file.name.split('.')[0]
        if filename.endswith('-opinions'):
            filename = filename[:-len('-opinions')]
        data['file'] = filename
        if make_dict == 'yes':
            all_identifiers_dict[filename] = make_identifiers_dict_per_file(data)
        frames.append(data)
    total_data = pd.concat(frames) if frames else pd.DataFrame()
    return total_data, all_identifiers_dict

def find_most_frequent(polarity_list_sent):
    '''Returns the most frequent polarity of a list of polarities from one sentence.

    If the two most common labels occur equally often (e.g. as many positive
    as negative), 'neutral' is returned. A list with a single distinct label
    returns that label.

    Raises IndexError when no polarity is given at all (same exception type
    as before, now with an explicit message).'''
    ranked = Counter(polarity_list_sent).most_common(2)
    if not ranked:
        raise IndexError('find_most_frequent() needs at least one polarity label')
    # Explicit length check instead of a bare `except:` around the tie test.
    if len(ranked) == 2 and ranked[0][1] == ranked[1][1]:
        return 'neutral'
    return ranked[0][0]

def build_perspective_graph(input, conll_allen, conll_allen_dict, all_relevant_sent_ids, counter = 'no'):
    '''Builds the knowledge graph dictionary {file: {sent_id: sentence dict}}.

    Loops through the concatenated input frame; for every row whose sentence
    id is listed (as a string) in all_relevant_sent_ids[file] it looks up the
    source / cue / content of the sentence and, when one is found, stores it
    together with the stripped polarity and sentence text under
    file_dict[sent_id].

    Arguments:
    input -- frame with 'file', 'sent_id', 'polarity' and a text column
             ('sent_text' when counter == 'no', otherwise 'text'). It is
             handed to add_to_knowledge_graph_if_last, which relies on the
             layout produced by change_format_for_grouping.
    conll_allen, conll_allen_dict -- passed on to get_source_cue_content
    all_relevant_sent_ids -- mapping file -> collection of sentence-id strings
    counter -- 'yes' when a sentence can occur on several rows with different
               polarities (NAF opinion files); the stored polarity is then the
               majority vote over ALL rows seen so far for that sentence, with
               'neutral' on a tie. With any other value the polarity of the
               row itself is used.

    Fixes compared with the earlier version: the polarity list is no longer
    overwritten with the None return value of add_to_knowledge_graph_if_last
    (which silently reduced the vote to "last row wins"), repeated sentence
    rows no longer crash when counter == 'no', and the full graph is no
    longer printed on every row.

    Returns the knowledge graph dictionary.'''
    knowledge_graph = dict()
    file_dicts = dict()       # file -> {sent_id: sentence dict}; one dict object per file
    polarity_votes = dict()   # (file, sent_id) -> polarities seen so far for that sentence
    text_column = 'sent_text' if counter == 'no' else 'text'

    for index, row in input.iterrows():
        file = row['file']
        sent_id = row['sent_id']
        sent_text = row[text_column]
        polarity = row['polarity']
        file_dict = file_dicts.setdefault(file, dict())

        if str(sent_id) in all_relevant_sent_ids[file]:
            if counter == 'yes':
                votes = polarity_votes.setdefault((file, sent_id), [])
                votes.append(polarity)
                polarity = find_most_frequent(votes)

            sent_dict = get_source_cue_content(file, sent_id, conll_allen, conll_allen_dict)
            if sent_dict:   # empty dict: no source-cue-content triple for this sentence
                sent_dict['polarity'] = polarity.strip()
                sent_dict['sent_text'] = sent_text.strip()
                file_dict[sent_id] = sent_dict

        # Called once for every row (relevant or not); the helper itself decides
        # whether this row closes the file and, if so, publishes file_dict.
        add_to_knowledge_graph_if_last(knowledge_graph, file_dict, input, index, file)

    # The graph can be cached with pickle.dump(knowledge_graph, ...) here if needed.
    return knowledge_graph


def add_word_to_id_in_dict(id, row, identifiers_dict):
    '''Appends the word of `row` to the text collected for the id number in `id`.

    Arguments:
    id -- source or cue identifier; the number is whatever follows its last
          '-'. '_' and non-string values (e.g. NaN) mean "no identifier" and
          leave the dictionary untouched. (The name shadows the builtin but
          is kept because it is part of the signature.)
    row -- mapping / Series with a 'word' entry
    identifiers_dict -- {id number: collected text}; updated in place

    The word gets a trailing space and every character other than ASCII
    letters, digits, space, newline and '.' is removed. A non-string word
    (pandas reads some tokens as NaN) is stored as 'NaN', matching
    add_content_to_ids_in_dict, instead of raising TypeError.

    Returns identifiers_dict.'''
    # `==` instead of `is`: identity comparison with a string literal is unreliable.
    if not isinstance(id, str) or id == '_':
        return identifiers_dict
    number = id.split('-')[-1]
    word = row['word']
    if not isinstance(word, str):
        word = 'NaN'
    # The character filter leaves only ASCII, so the former latin1/cp1252
    # re-encoding steps around it could not change the result and were dropped.
    token_word = re.sub(r'[^a-zA-Z0-9 \n.]', '', word + ' ')
    identifiers_dict[number] = (identifiers_dict.get(number) or '') + token_word
    return identifiers_dict

def add_content_to_ids_in_dict(content_id, row, identifiers_dict):
    '''Appends the word of `row` to the text collected for a content id number.

    Arguments:
    content_id -- content identifier; '_' and non-string values leave the
                  dictionary untouched. Two shapes are handled:
                  * combined, containing '#', e.g.
                    I-content-21648#B-content-21676:21674-Cue_21675-Source
                    the identifier is split on '#' and ':' and the number of
                    the LAST part starting with 'B' or 'I' is used (21676)
                  * single, e.g.
                    B-content-21485:21483-Source_21484-Cue_21486-Source
                    the number before the first ':' is used (21485)
    row -- mapping / Series with a 'word' entry
    identifiers_dict -- {id number: collected text}; updated in place

    The word gets a trailing space and every character other than ASCII
    letters, digits, space, newline and '.' is removed; a non-string word
    (NaN in the NAF-derived files) is stored as 'NaN'. A combined identifier
    without any 'B'/'I' part is ignored instead of failing on an unbound
    variable.

    Returns identifiers_dict.'''
    # `==` instead of `is`: identity comparison with a string literal is unreliable.
    if not isinstance(content_id, str) or content_id == '_':
        return identifiers_dict

    content_number = None
    if '#' in content_id:
        # NOTE(review): only the last B/I part receives the word; confirm that
        # the other content ids of a combined identifier should not get it too.
        for part in re.split('#|:', content_id):
            if part.startswith(('B', 'I')):
                content_number = part.split('-')[-1]
    else:
        content_number = content_id.split(':')[0].split('-')[-1]
    if content_number is None:
        return identifiers_dict

    word = row['word']
    if isinstance(word, str):
        # The filter leaves only ASCII, which made the former latin1/cp1252
        # re-encoding before it redundant.
        content_word = re.sub(r'[^a-zA-Z0-9 \n.]', '', word + ' ')
    else:
        content_word = 'NaN'  #mistake in NAF files where word is type NaN

    identifiers_dict[content_number] = (identifiers_dict.get(content_number) or '') + content_word
    return identifiers_dict


def make_identifiers_dict_per_file(data):
    '''Collects, for one file, the words belonging to every source, cue and content id.

    Walks over the rows of `data` and feeds the 'attr_source' and 'attr_cue'
    identifiers to add_word_to_id_in_dict and the 'attr_content' identifier to
    add_content_to_ids_in_dict, in that order.

    Returns the resulting {id number: words} dictionary of the file.'''
    collected = dict()
    for _, token_row in data.iterrows():
        for column in ('attr_source', 'attr_cue'):
            collected = add_word_to_id_in_dict(token_row[column], token_row, collected)
        collected = add_content_to_ids_in_dict(token_row['attr_content'], token_row, collected)
    return collected

def _split_triple_ids(identifier):
    '''Returns the (source, cue, content) id numbers named in one identifier ('' when absent).'''
    source_wanted = cue_wanted = content_wanted = ''
    for piece in re.split(':|_', identifier):
        if 'content' in piece:
            content_wanted = piece.split('-')[-1]    #B-content-21485
        elif 'Source' in piece:
            # NOTE(review): with several Source pieces the LAST one wins, as before;
            # the old comment claimed the first one was taken - confirm the intent.
            source_wanted = piece.split('-')[0]      #21483-Source
        elif 'Cue' in piece:
            cue_wanted = piece.split('-')[0]         #21484-Cue
    return source_wanted, cue_wanted, content_wanted


def _words_for_id(file_ids, wanted):
    '''Looks up the stripped words of an id number; 'Unknown' when the id is absent.'''
    words = file_ids.get(wanted) if wanted else None
    return 'Unknown' if words is None else words.strip()


def get_source_cue_content(file, sent_id, conll_allen, conll_allen_dict):
    '''Finds the source, cue and content words of one sentence.

    Goes through the 'attr_content' values of the conll_allen rows whose
    'file' and 'sent_id' match. Only identifiers that contain ':' and 'Cue'
    (only triples with a cue) and no '#' are used, e.g.
    B-content-21485:21483-Source_21484-Cue_21486-Source. The id numbers are
    taken from the identifier and looked up in conll_allen_dict[file].

    Combined identifiers containing '#' are skipped, exactly as before.
    NOTE(review): the old code picked a B-...-Cue part out of them but then
    always continued with the next row - confirm skipping is intended.

    Any of the three ids that is missing from the identifier or from the
    id-dictionary yields 'Unknown'. This now includes the cue, which used to
    fail on None.strip(). Non-string identifiers (NaN) are skipped.

    Raises KeyError when a usable identifier is found but `file` has no entry
    in conll_allen_dict.

    Output = a dictionary with 'source', 'cue' and 'content' as keys, taken
    from the last usable row of the sentence; it is empty when the sentence
    has no usable identifier.'''
    sentence_dict = dict()
    in_sentence = (conll_allen['file'] == file)&(conll_allen['sent_id'] == sent_id)
    for identifier in conll_allen.loc[in_sentence, 'attr_content']:
        if not isinstance(identifier, str):
            continue
        if '#' in identifier or ':' not in identifier or 'Cue' not in identifier:
            continue

        # Ids are parsed afresh for every row, so nothing leaks in from earlier rows.
        source_wanted, cue_wanted, content_wanted = _split_triple_ids(identifier)
        file_ids = conll_allen_dict[file]
        sentence_dict['source'] = _words_for_id(file_ids, source_wanted)
        sentence_dict['cue'] = _words_for_id(file_ids, cue_wanted)
        sentence_dict['content'] = _words_for_id(file_ids, content_wanted)
    return sentence_dict

def make_output_csv(knowledge_graph, name):
    '''Writes the knowledge graph to the CSV file `name`, one row per (file, sentence id).

    The nested {file: {sent_id: sentence dict}} structure is flattened to
    {(file, sent_id): sentence dict}; the pair becomes the row index and the
    keys of the sentence dicts become the columns.'''
    rows = dict()
    for file_name, sentences in knowledge_graph.items():
        for sent_id, sentence in sentences.items():
            rows[(file_name, sent_id)] = sentence
    table = pd.DataFrame.from_dict(rows, orient='index')
    table.to_csv(name, index=True)

def add_to_knowledge_graph_if_last(knowledge_graph, file_dict, opinion_data, index, file): 
    '''Stores file_dict under `file` in the knowledge graph when this row closes the file.

    A row closes its file when `index` equals the value in the first column of
    the last row of opinion_data, or when the row labelled index+1 belongs to
    a different file. The second check is only made when the first one fails.
    Nothing is returned.'''
    is_final_row = index == opinion_data.iloc[-1, 0]
    if is_final_row or file != opinion_data.loc[index+1, 'file']:
        knowledge_graph[file] = file_dict

In [1]:
def main():
    '''Runs the whole pipeline and writes both knowledge graphs to CSV.

    Loads the relevant sentence ids, the opinion and VADER polarity data and
    the tab-separated conll-allen files (with their identifier dictionary),
    builds one knowledge graph from the opinion data (counter = 'yes') and one
    from the VADER data, and saves them as First_Knowledge_Graph.csv and
    Second_Knowledge_Graph.csv.'''
    relevant_ids = get_relevant_sent_ids()

    # The identifier dictionaries of these two loads are not needed.
    opinion_data, _ = load_files(OPINION)
    vader_data, _ = load_files(VADER)
    for polarity_frame in (opinion_data, vader_data):
        change_format_for_grouping(polarity_frame)   # renumbers the frame in place

    conll_allen, conll_allen_dict = load_files(ALLEN, delimiter = '\t', make_dict = 'yes')
    # Optional caching: conll_allen and conll_allen_dict can be pickled here
    # ('conll_allen_fullcontent.pickle', 'conll_allen_dict_fullcontent.pickle')
    # and loaded back from PATH ("conll_allen.pickle", "conll_allen_dict.pickle")
    # instead of being rebuilt on every run.

    first_knowledge_graph = build_perspective_graph(
        opinion_data, conll_allen, conll_allen_dict, relevant_ids, counter = 'yes')
    second_knowledge_graph = build_perspective_graph(
        vader_data, conll_allen, conll_allen_dict, relevant_ids)

    make_output_csv(first_knowledge_graph, 'First_Knowledge_Graph.csv')
    make_output_csv(second_knowledge_graph, 'Second_Knowledge_Graph.csv')

main()