In [1]:
import lxml.etree
from xml.etree import ElementTree
import csv
import pandas as pd
import os
from os import listdir, path
from tqdm import tqdm

In [21]:
def parse_xml(file_path, file_name):
    tree = ElementTree.parse(file_path)
    root = tree.getroot()

    sentences = {}
    non_role_annotation_counts = {}
    unique_repr_texts = {}

    roles_scores = {
        'one_Harasser': ('Harasser', '1'),
        'two_Harasser': ('Harasser', '2'),
        'one_Victim' : ('Victim', '1'),
        'two_Victim' : ('Victim', '2'),
        'one_Bystander_defender' : ('Bystander_defender', '1'),
        'two_Bystander_defender' : ('Bystander_defender', '2'),
        'one_Bystander_assistant' : ('Bystander_assistant', '1'),
        'two_Bystander_assistant' : ('Bystander_assistant', '2')
    }

    for sentence in root.findall(".//sentence"):
        sentence_id = sentence.get('id').split('.')[1]
        sentences[sentence_id] = {
            'sentence_id' : f"s.{sentence_id}",
            'text': " ".join(word.text for word in sentence.findall(".//word") if word.text),
            'Cyberbullying': 0,
        }
        non_role_annotation_counts[sentence_id] = 0
        unique_repr_texts[sentence_id] = set()

    all_labels = [
        'General_insult', 'Assertive_selfdef', 'Curse_Exclusion', 'Threat_Blackmail', 
        'General_defense', 'Other_language', 'Powerless_selfdef', 'Encouraging_harasser',
        'Harmless_sexual_talk', 'Good_characteristics', 'Sarcasm', 'Other', 'Defamation',
        'Attacking_relatives', 'Sexism', 'Racism', 'Sexual_harassment'
    ]

    for annotation in root.findall(".//annotation"):
        sentence_id = annotation.get('words').split('.')[1]
        repr_text = annotation.get('repr').replace("&#182;", "¶")
        if repr_text in unique_repr_texts[sentence_id]:
            continue
        unique_repr_texts[sentence_id].add(repr_text)
        sentences[sentence_id]['Cyberbullying'] = 1

        role_found = False
        for role, (role_desc, score) in roles_scores.items():
            if role in annotation.keys():
                sentences[sentence_id]['role'] = role_desc
                sentences[sentence_id]['harmful_score'] = score
                sentences[sentence_id]['words0'] = repr_text
                role_found = True
                break
        
        if not role_found:
            index = non_role_annotation_counts[sentence_id]
            words_key = f'words{index + 1}'
            label_key = f'label{index + 1}'
            non_role_annotation_counts[sentence_id] += 1

            sentences[sentence_id][words_key] = repr_text

            for label in all_labels:
                if label in annotation.keys():
                    sentences[sentence_id][label_key] = label
                    break
    
    corrected_df = pd.DataFrame.from_dict(sentences, orient='index')
    corrected_df['file_name'] = file_name

    # column in desired order
    dynamic_columns = []
    max_annotations = max(non_role_annotation_counts.values())
    for i in range(1, max_annotations +1):
        if any(f'words{i}' in col for col in corrected_df.columns):
            dynamic_columns.extend([f'words{i}', f'label{i}'])

    base_columns = ['file_name', 'sentence_id', 'text', 'Cyberbullying', 'role', 'harmful_score', 'words0']
    all_columns = base_columns + dynamic_columns
    corrected_df = corrected_df.reindex(columns = all_columns)

    return corrected_df

def process_all_xml_files(directory_path):
    all_dfs = [] #List to store df from each files
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.xml'):
            file_path = os.path.join(directory_path, file_name)
            df = parse_xml(file_path, file_name)
            all_dfs.append(df)

    concatenated_df = pd.concat(all_dfs, ignore_index=True)
    concatenated_df = concatenated_df[concatenated_df['text'].astype(str).str.strip() != '']
    return concatenated_df

xml_directory_path = '/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/xml_folder'

final_df = process_all_xml_files(xml_directory_path)

csv_file_path = '/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/concatenated_results.csv'

final_df.to_csv(csv_file_path, index=False)

print(f'Concatenated CSV file has been saved to: {csv_file_path}')

Concatenated CSV file has been saved to: /Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/concatenated_results.csv


In [3]:
import pandas as pd

# Update 'Cyberbullying' to 0 where 'label1' is 'Other_language'
final_df.loc[final_df['label1'] == 'Other_language', 'Cyberbullying'] = 0

# Save the updated DataFrame back to CSV if needed
final_df.to_csv(csv_file_path, index=False)

  data = pd.read_csv(csv_file_path)


In [None]:
import pandas as pd

# Since the label columns are iterated (label1, label2, ...), 
# let's prepare to iterate through them
max_label_number = 20  # Update this with the maximum number of label columns you have
label_columns = [f'label{i}' for i in range(1, max_label_number + 1)]

# Create a dictionary to store the counts for each unique label
label_counts = {}

# Iterate over each row in the DataFrame
for _, row in final_df.iterrows():
    # Iterate through each label column in the row
    for label_col in label_columns:
        # Check if the label column exists in the DataFrame
        if label_col in final_df.columns:
            # If the label is not NaN (i.e., if it exists in the row)
            if not pd.isna(row[label_col]):
                # Get the label from the row
                label = row[label_col]
                # If the label is already in the dictionary, increment its count
                if label in label_counts:
                    label_counts[label] += 1
                # Otherwise, add the label to the dictionary with a count of 1
                else:
                    label_counts[label] = 1

# Now, let's create new columns for each label with binary values (1 or 0)
for label in label_counts.keys():
    # Initialize the column with zeros
    final_df[label] = 0
    # Iterate again over each row to set the value to 1 where the label is present
    for idx, row in final_df.iterrows():
        for label_col in label_columns:
            if label_col in final_df.columns and label == row[label_col]:
                final_df.at[idx, label] = 1
                break  # Stop checking other label columns for this row if we've found a match

# Show a summary of the updated DataFrame
final_df.head()

In [7]:
final_df = pd.read_csv('/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/concatenated_results1.csv')

  final_df = pd.read_csv('/Users/tszeyenthen/Python Study/jupyter notebbok/Cyberbullying/fyp/amica-cyberbullying-distribute/askfm-cyberbullying-data/concatenated_results1.csv')


In [10]:
import pandas as pd

# Correctly identify NaN values in 'role' column and update 'Cyberbullying' accordingly
final_df.loc[final_df['role'].isna(), 'Cyberbullying'] = 0

# Save the updated DataFrame back to CSV
final_df.to_csv('concatenated_results1.csv', index=False)