In [25]:
from lxml import etree
import FeatureExtraction as FE
import numpy

In [26]:
# Location of the text file handed to FE.sexual_predator_ids() to label authors
# (path is relative to the notebook's working directory).
sexual_predator_ids_file = (
    '../../dataset/training/'
    'pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt'
)

In [55]:
def create_csv(input_file_path, output_file_name, batch_size, predator_ids_file=None):
    """Write one CSV row of chat-based features per author found in an XML corpus.

    The XML file is parsed with lxml, grouped into an author -> conversation
    nodes mapping by ``FE.extract_author_conversation_node_dictionary_from_XML``
    and every author (in sorted order) is turned into one comma-separated line.
    The last column is ``'1'`` when the author is contained in the collection
    returned by ``FE.sexual_predator_ids`` and ``'0'`` otherwise.

    Parameters
    ----------
    input_file_path : str
        Path of the XML corpus to parse.
    output_file_name : str
        Path of the CSV file to create (an existing file is truncated).
    batch_size : int
        Number of data rows buffered in memory between writes. Values that
        are falsy or smaller than 1 are treated as 1. This only affects
        buffering; the file content is the same for every batch size.
    predator_ids_file : str, optional
        Path passed to ``FE.sexual_predator_ids``. When omitted, the
        notebook-level ``sexual_predator_ids_file`` is used.

    Notes
    -----
    Values are joined with plain commas and are not quoted or escaped.
    """
    if predator_ids_file is None:
        # Keep the historical behaviour of reading the notebook-level default.
        predator_ids_file = sexual_predator_ids_file

    tree = etree.parse(input_file_path)
    author_conversation_node_dictionary = FE.extract_author_conversation_node_dictionary_from_XML(tree)
    # Drop the local reference early, as the original did, to limit memory use.
    del tree

    # NOTE(review): 'autor' looks like a typo for 'author', but it is the
    # emitted column name and is therefore left untouched - confirm with
    # the consumers of this CSV before renaming it.
    output_string_list = ['autor', 'number of conversation', 'percent of conversations started by the author',
                          'difference between two preceding lines[s]', 'number of messages sent',
                          'average percent of lines in conversation', 'average percent of characters in conversation',
                          'number of characters sent by the author', 'mean time of messages sent',
                          'number of unique contacted authors', 'avg number of unique authors interacted with per conversation',
                          'total unique authors and unique per chat difference',
                          'conversation num and total unique authors difference',
                          'average question marks per conversations', 'total question marks', 'is sexual predator']

    sexual_predator_ids_list = FE.sexual_predator_ids(predator_ids_file)

    rows_per_flush = batch_size
    if not rows_per_flush or rows_per_flush < 1:
        rows_per_flush = 1

    # Lines waiting to be written; the header goes out with the first batch.
    pending_lines = [','.join(output_string_list) + "\n"]
    pending_rows = 0

    # The context manager closes the file even if a feature extractor raises.
    with open(output_file_name, 'w+') as output_file_csv:
        for author in sorted(author_conversation_node_dictionary):
            conversation_nodes = author_conversation_node_dictionary[author]
            conversation_count = len(conversation_nodes)

            # These two values feed several columns; compute them once per
            # author (assumes the FE helpers have no side effects).
            total_unique_authors = FE.number_of_unique_authors_interacted_with(author, conversation_nodes)
            avg_unique_authors_per_chat = FE.avg_number_of_unique_authors_interacted_with_per_chat(
                author, conversation_nodes)

            output_list = [author,
                           conversation_count,
                           FE.average_trough_all_conversations(author, conversation_nodes,
                                                               FE.is_starting_conversation),
                           FE.average_trough_all_conversations(
                               author, conversation_nodes,
                               FE.avg_time_between_message_lines_in_seconds_for_author_in_conversation),
                           FE.number_of_messages_sent_by_the_author(author, conversation_nodes),
                           FE.average_trough_all_conversations(author, conversation_nodes,
                                                               FE.percentage_of_lines_in_conversation),
                           FE.average_trough_all_conversations(author, conversation_nodes,
                                                               FE.percentage_of_characters_in_conversation),
                           FE.number_of_characters_sent_by_the_author(author, conversation_nodes),
                           FE.mean_time_of_messages_sent(author, conversation_nodes),
                           total_unique_authors,
                           avg_unique_authors_per_chat,
                           # Argument order kept exactly as in the original call.
                           FE.difference_unique_authors_per_chat_and_total_unique(
                               total_unique_authors,
                               avg_unique_authors_per_chat,
                           ),
                           FE.difference_unique_authors_and_conversations(
                               total_unique_authors,
                               conversation_count
                           ),
                           FE.avg_question_marks_per_conversation(author, conversation_nodes),
                           FE.total_question_marks_per_conversation(author, conversation_nodes),
                           '1' if author in sexual_predator_ids_list else '0'
                           ]
            pending_lines.append(','.join(map(str, output_list)) + '\n')
            pending_rows += 1

            # Flush after exactly `rows_per_flush` rows (the previous counter
            # logic only flushed after batch_size + 1 rows).
            if pending_rows >= rows_per_flush:
                output_file_csv.writelines(pending_lines)
                pending_lines = []
                pending_rows = 0

        # Write whatever is left (including the header when there are no rows).
        output_file_csv.writelines(pending_lines)

In [56]:
# XML corpus that is passed to create_csv() as its input file
# (path is relative to the notebook's working directory).
file_path = (
    '../../dataset/training/'
    'pan12-sexual-predator-identification-training-corpus-2012-05-01.xml'
)

In [None]:
# Build the training feature table; rows are buffered 1000 at a time before
# being written to the CSV.
create_csv(
    input_file_path=file_path,
    output_file_name='../../csv/chat_based_features_training.csv',
    batch_size=1000,
)

In [58]:
def number_of_unique_authors_interacted_with(author, conversation_nodes):
    """Count the distinct author ids appearing across the given conversations.

    Every ``.//message//author`` element of every conversation node
    contributes its ``.text`` to one shared set; the size of that set is
    returned.

    Note: ``author`` is accepted but not used, so no id is filtered out of
    the set - an author that appears in the messages is counted as well.
    """
    seen_ids = set()
    for node in conversation_nodes:
        for author_element in node.xpath('.//message//author'):
            seen_ids.add(author_element.text)

    return len(seen_ids)

In [59]:
def avg_number_of_unique_authors_interacted_with_per_chat(author, conversation_nodes):
    """Average number of distinct author ids per conversation.

    For each conversation node the distinct ``.text`` values of its
    ``.//message//author`` elements are counted; the counts are summed and
    divided by the number of conversations.

    Returns ``0.0`` when ``conversation_nodes`` is empty instead of raising
    ``ZeroDivisionError``.

    Note: ``author`` is accepted but not used, so no id is excluded from
    the per-conversation counts.
    """
    conversation_count = len(conversation_nodes)
    if conversation_count == 0:
        # No conversations: define the average as zero rather than dividing by 0.
        return 0.0

    unique_counts = [
        len({author_element.text for author_element in node.xpath('.//message//author')})
        for node in conversation_nodes
    ]
    return sum(unique_counts) / conversation_count

In [60]:
def difference_unique_authors_per_chat_and_total_unique(uniq_per_chat, total_unique):
    """Return the absolute difference between the two given numbers."""
    gap = uniq_per_chat - total_unique
    return abs(gap)

In [61]:
def difference_unique_authors_and_conversations(total_unique, number_conversatio):
    """Return the absolute difference between the two given numbers."""
    gap = total_unique - number_conversatio
    return abs(gap)

In [76]:
def avg_question_marks_per_conversation(author, conversation_nodes):
    """Average number of messages containing ``'?'`` per conversation.

    For each conversation the message texts returned by
    ``FE.message_texts_in_conversation`` are scanned and every non-``None``
    text containing at least one ``'?'`` counts once (it is a per-message
    count, not a count of ``'?'`` characters). The sum of these counts is
    divided by the total number of conversations; conversations without any
    ``.//message//text`` element contribute zero but still count in the
    denominator.

    Returns ``0.0`` when ``conversation_nodes`` is empty instead of raising
    ``ZeroDivisionError``.

    Note: ``author`` is accepted but not used in this function.
    """
    conversation_count = len(conversation_nodes)
    if conversation_count == 0:
        # No conversations: define the average as zero rather than dividing by 0.
        return 0.0

    messages_with_question = 0
    for conversation_node in conversation_nodes:
        # Skip conversations that have no text elements at all.
        if len(conversation_node.xpath('.//message//text')) == 0:
            continue

        messages_with_question += sum(
            1
            for message in FE.message_texts_in_conversation(conversation_node)
            if message is not None and '?' in message
        )

    return messages_with_question / conversation_count

In [77]:
def total_question_marks_per_conversation(author, conversation_nodes):
    """Total number of messages containing ``'?'`` over all conversations.

    Conversations without any ``.//message//text`` element are skipped. For
    the others, each non-``None`` text returned by
    ``FE.message_texts_in_conversation`` that contains at least one ``'?'``
    adds one to the total (a per-message count, not a character count).

    Note: ``author`` is accepted but not used in this function.
    """
    total = 0
    for node in conversation_nodes:
        has_text_elements = len(node.xpath('.//message//text')) != 0
        if has_text_elements:
            total += sum(
                1
                for text in FE.message_texts_in_conversation(node)
                if text is not None and '?' in text
            )

    return total