# Data Cleaning

In [None]:
import pandas as pd
import nltk
import re

## Primary dataset

### Import data

In [None]:
# Load the raw primary dataset and keep only the columns used downstream.
# 'Text' is selected (rather than 'Cleaned Text') because the following cells
# lower-case df['Text'] and compute 'Cleaned Text' from it; without 'Text'
# in the selection those cells raise KeyError.
df = pd.read_csv('raw_data')
# Explicit copy so that later column assignments act on an independent frame.
df = df[['Organization Name', 'Text', 'True Organization']].copy()

In [None]:
# Lower-case the text and the gold-label column, then discard every row in
# which either of the two is missing.
df = (
    df
    .assign(**{
        'Text': df['Text'].str.lower(),
        'True Organization': df['True Organization'].str.lower(),
    })
    .dropna(subset=['True Organization', 'Text'])
)

### Clean data

In [None]:
def remove_whitespaces(text):
    '''Collapse runs of one repeated whitespace character and trim the ends.

    A run of the *same* whitespace character (e.g. several spaces or several
    newlines in a row) is reduced to a single occurrence. Adjacent but
    different whitespace characters (e.g. a space followed by a tab) are
    left as they are. Leading and trailing whitespace is stripped.
    '''
    run_of_same_whitespace = r'(\s)\1+'
    return re.sub(run_of_same_whitespace, r'\1', text).strip()

def control_char_repetition(text, max_repeat=3):
    """Cap consecutive repetitions of characters and words at ``max_repeat``.

    Three passes are applied in order:

    1. Any single character (anything ``.`` matches, so digits and spaces
       too, but not newlines) occurring more than ``max_repeat`` times in a
       row is cut down to ``max_repeat`` occurrences.
    2. A word (``\\w+``) repeated more than ``max_repeat`` times, separated
       by whitespace, is cut down to ``max_repeat`` occurrences joined by
       single spaces.
    3. The same for any non-whitespace sequence of at least two characters
       that starts and ends on a word boundary (e.g. ``ha-ha``).

    Args:
        text: The string to process.
        max_repeat: Maximum number of consecutive occurrences to keep.

    Returns:
        The text with excessive repetitions shortened.
    """
    kept_chars = r"\1" * max_repeat
    kept_tokens = " ".join([r"\1"] * max_repeat)

    # Character level: one char followed by at least max_repeat copies of itself.
    text = re.sub(rf"(.)\1{{{max_repeat},}}", kept_chars, text)

    # Token level. The trailing \b after the backreference is essential:
    # without it a repeat may match the *prefix* of a longer word, so
    # "is is island" would be rewritten to "is island".
    text = re.sub(rf"(\b\w+\b)(?:\s+\1\b){{{max_repeat},}}", kept_tokens, text)
    text = re.sub(rf"(\b\S{{2,}}\b)(?:\s+\1\b){{{max_repeat},}}", kept_tokens, text)
    return text

def clean_text(text):
    '''Normalise a raw text string for downstream use.

    The text is lower-cased, runs of a repeated whitespace character are
    collapsed (and the ends trimmed), character/word repetitions are capped
    at two occurrences, and the result is tokenized with
    ``nltk.word_tokenize`` and re-joined with single spaces.
    '''
    normalised = control_char_repetition(
        remove_whitespaces(text.lower()),
        max_repeat=2,
    )
    return " ".join(nltk.word_tokenize(normalised))

# Add the normalised text next to the lower-cased source text.
df['Cleaned Text'] = df['Text'].apply(clean_text)

# First row before and after cleaning, kept for manual inspection.
comparison = df.loc[:, ['Text', 'Cleaned Text']].iloc[:1]

# One single-element list per row (same shape as values.tolist() on a
# one-column frame).
formatted_data = [[cleaned] for cleaned in df['Cleaned Text'].tolist()]

# Print a short preview of the cleaned output.
for example in formatted_data[:5]:
    print(example[0])

### Save as cleaned csv

In [None]:
# Persist the cleaned primary dataset (the index is written as the first column).
df.to_csv(path_or_buf='final_data.csv')

## Second dataset

### Import data

In [None]:
# Load the raw second dataset with pandas' default CSV settings.
seconddata = pd.read_csv(filepath_or_buffer='raw_seconddata.csv')

In [None]:
# Lower-case the text and the organisation label so they can be compared
# case-sensitively later on.
seconddata['Text'] = seconddata['Text'].str.lower()
seconddata['Organisation Name'] = seconddata['Organisation Name'].str.lower()

# Drop rows in which either the label or the text is missing.
seconddata = seconddata.dropna(subset=['Organisation Name', 'Text'])

# Use the same label column name as the primary dataset.
seconddata = seconddata.rename(columns={'Organisation Name': 'True Organization'})

# Where 'Afkorting Uitgebreid' is filled in (presumably the spelled-out form
# of an abbreviated organisation name - confirm against the data), it takes
# precedence over the original label. The merged label is lower-cased again
# because the fallback column was not normalised above; otherwise it could
# never be found in the lower-cased 'Text' by a case-sensitive search.
seconddata['True Organization'] = (
    seconddata['Afkorting Uitgebreid']
    .combine_first(seconddata['True Organization'])
    .str.lower()
)

# Keep only the columns used downstream; explicit copy so that later column
# assignments act on an independent frame.
seconddata = seconddata[['Text', 'True Organization']].copy()

### Clean data

In [None]:
# Add the normalised text next to the lower-cased source text.
seconddata['Cleaned Text'] = seconddata['Text'].apply(clean_text)

# First row before and after cleaning, kept for manual inspection.
comparison = seconddata[['Text', 'Cleaned Text']].head(1)
# print(comparison)

# Build the preview from the full frame, as the primary dataset cell does.
# Building it from `comparison` (a single row) made the [:5] slice below
# unable to show more than one example.
formatted_data = seconddata[['Cleaned Text']].values.tolist()

for example in formatted_data[:5]:
    print(example[0])


In [None]:
# filter out texts without organization name in it
def is_org_in_text(row):
    """Return True when the row's organisation name occurs in its text.

    The name is matched literally (regex metacharacters are escaped) and as
    a whole token: it must not be directly preceded or followed by a word
    character. Lookarounds are used instead of ``\\b`` because ``\\b`` needs
    a word character on one side, so names that start or end with
    punctuation (e.g. ``a.s.r.``) could never match before a space or at
    the end of the text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            string fields ``'Text'`` and ``'True Organization'``.

    Returns:
        bool: True if the organisation name is found, False otherwise.
        The search is case-sensitive.
    """
    org_pattern = rf"(?<!\w){re.escape(row['True Organization'])}(?!\w)"
    return re.search(org_pattern, row['Text']) is not None

# Flag each row by whether its organisation name occurs in its text, then
# keep the flagged rows only and remove the helper column from the result.
seconddata['Contains_Org'] = seconddata.apply(is_org_in_text, axis=1)
filtered_seconddata = (
    seconddata
    .loc[seconddata['Contains_Org'] == True]
    .drop(columns=['Contains_Org'])
)

### Save as cleaned csv

In [None]:
# Persist the filtered second dataset (the index is written as the first column).
filtered_seconddata.to_csv(path_or_buf='final_seconddata.csv')