# Preprocessing of Ruddit dataset
This notebook preprocesses the [Ruddit](https://github.com/hadarishav/Ruddit) dataset, a collection of Reddit comments annotated for offensiveness. Since the original Ruddit repository only contains the IDs of the posts and comments, this notebook uses the [Ruddit dataset shared on Kaggle](https://www.kaggle.com/datasets/rajkumarl/ruddit-jigsaw-dataset), which already includes the texts extracted from Reddit. To run this notebook, first create the `data` directory inside the `src` folder (the same directory that contains this notebook file), then download the dataset from Kaggle and place the `ruddit` directory in the `data/` directory.

After running this notebook, the preprocessed CSV files for the train, validation and test splits (`train_pair.csv`, `val_pair.csv` and `test_pair.csv`) will be saved in the `data/ruddit/Preprocessed` directory.

In [None]:
import os
import random

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Render every figure at 300 dpi so the plots stay sharp when exported
plt.rcParams.update({'figure.dpi': 300})

In [None]:
# Relative path to the Ruddit CSV that already contains the comment texts
# (the Kaggle download described in the introduction above).
RUDDIT_PATH = "data/ruddit/Dataset/ruddit_with_text.csv"

In [None]:
# Read the raw Ruddit table from disk
df = pd.read_csv(filepath_or_buffer=RUDDIT_PATH)

# Summarise columns, dtypes and non-null counts of the loaded frame
df.info()

# Delete rows with txt == '[deleted]'

In [None]:
# Drop the url column and rename txt -> text.
# `df` is rebound instead of mutated in place, and errors="ignore" makes the
# drop a no-op when the column is already gone, so re-running this cell does
# not fail with a KeyError.
df = df.drop(columns="url", errors="ignore").rename(columns={'txt': 'text'})

# Show number of deleted posts
deleted = df["text"] == '[deleted]'
print("Deleted posts: ", deleted.sum())

# Drop rows with deleted text. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df = df.loc[~deleted].copy()

# Normalize the Offensiveness Score

In [None]:
# Rescale the offensiveness score with (score + 1) / 2 and store it as `label`.
# NOTE(review): this maps onto [0, 1] only if the raw scores lie in [-1, 1] —
# confirm against the Min/Max printed below.
# A new frame is built with assign/drop rather than mutating `df` in place,
# which avoids SettingWithCopyWarning on the previously filtered frame.
df = (
    df
    .assign(label=(df["offensiveness_score"] + 1.) / 2.)
    .drop(columns="offensiveness_score")
)

# Summary statistics of the rescaled score
print("Offensive Score:")
print(f"Mean: {df['label'].mean()}")
print(f"Std: {df['label'].std()}")
print(f"Min: {df['label'].min()}")
print(f"Max: {df['label'].max()}")

# Plotting the Distribution of the Offensiveness Score

In [None]:
# Plot the distribution of the offensive score.
# A KDE curve is a probability density estimate, so the y-axis is labelled
# "Density" rather than "Count".
fig, ax = plt.subplots()
sns.kdeplot(df['label'], fill=True, ax=ax)
ax.set_title("Distribution of Offensive Score")
ax.set_xlabel("Offensive Score")
ax.set_ylabel("Density")
plt.show()

# Split the data into train, validation and test sets

In [None]:
# First cut: keep 60% of the rows for training, hold out the remaining 40%
train_df, val_test_df = train_test_split(
    df,
    test_size=0.4,
    random_state=42,
)

# Second cut: divide the held-out rows evenly into validation and test
val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,
    random_state=42,
)



# Plot the data distribution of the train, validation and test sets

In [None]:
# Overlay the label density of each split on one set of axes to check that
# the three splits follow a similar distribution.
fig, ax = plt.subplots()
for split_name, split_df in (
    ("Train Set", train_df),
    ("Validation Set", val_df),
    ("Test Set", test_df),
):
    sns.kdeplot(split_df["label"], label=split_name, ax=ax)

ax.set_title("Distribution of Offensive Score")
ax.set_xlabel("Offensive Score")
# A KDE curve is a probability density estimate, not a count
ax.set_ylabel("Density")
ax.legend()
plt.show()

# Create Pairs

In [None]:
def create_sentence_pairs(df, n=5, random_state=42):
    """Pair every row of ``df`` with randomly chosen other rows of ``df``.

    For each row (the "anchor"), up to ``n`` distinct partner rows are drawn
    without replacement from the remaining rows, and one output row is
    produced per (anchor, partner) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column and a numeric ``label`` column.
    n : int, default 5
        Number of partners to draw per anchor. When the frame has fewer than
        ``n + 1`` rows, every anchor is paired with all other rows instead.
    random_state : int or None, default 42
        Seed for the partner sampling, so repeated runs produce the same
        pairs. Pass ``None`` for non-deterministic sampling.

    Returns
    -------
    pandas.DataFrame
        Columns ``text1``, ``text2``, ``bin_label`` and ``dif_label``:

        * ``bin_label`` is 0 when ``text1`` has the strictly higher label and
          1 otherwise (ties are therefore labelled 1).
        * ``dif_label`` is ``label1 - label2``.

        An empty input yields an empty frame with these four columns.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    columns = ['text1', 'text2', 'bin_label', 'dif_label']

    # Work on plain lists: partners are chosen by position, so duplicate
    # index labels in `df` cannot remove more than the anchor row itself,
    # and no per-row copy of the frame is needed.
    texts = df['text'].tolist()
    labels = df['label'].tolist()
    n_rows = len(texts)

    # An anchor has at most n_rows - 1 distinct partners
    n_partners = min(n, n_rows - 1)

    # One generator shared by all anchors, so each anchor gets a different
    # (but reproducible) draw
    rng = random.Random(random_state)

    data = []
    for pos in range(n_rows):
        text1 = texts[pos]
        label1 = labels[pos]

        # Sample from the n_rows - 1 slots left once the anchor is removed,
        # then shift slots at or after the anchor by one to skip over it.
        for slot in rng.sample(range(n_rows - 1), n_partners):
            partner = slot + 1 if slot >= pos else slot
            label2 = labels[partner]

            data.append({
                'text1': text1,
                'text2': texts[partner],
                # 0: text1 has the higher label; 1: text2 is higher or equal
                'bin_label': 0 if label1 > label2 else 1,
                'dif_label': label1 - label2,
            })

    # Explicit columns keep the schema stable even when no pairs were created
    return pd.DataFrame(data, columns=columns)


# Save the train, validation, and test sets to csv files

In [None]:
# Directory that receives the preprocessed pair files
PREPROCESSED_DIR = "data/ruddit/Preprocessed"

# Build the pairwise dataset for each split
train_pair_df = create_sentence_pairs(train_df)
val_pair_df = create_sentence_pairs(val_df)
test_pair_df = create_sentence_pairs(test_df)

# (display name, output file name, dataframe) for every split
pair_splits = [
    ("Train Set", "train_pair.csv", train_pair_df),
    ("Validation Set", "val_pair.csv", val_pair_df),
    ("Test Set", "test_pair.csv", test_pair_df),
]

# Print the count of bin_label in each dataframe
for split_name, file_name, pair_df in pair_splits:
    print(f"{split_name}:")
    print(pair_df['bin_label'].value_counts())

# Create the output folder if it does not exist; exist_ok avoids the separate
# existence check and the race between checking and creating.
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

# Save the dataframes to csv files
for split_name, file_name, pair_df in pair_splits:
    pair_df.to_csv(os.path.join(PREPROCESSED_DIR, file_name), index=False)