# Environment Setup

In [None]:
# Import Libraries and Set Random State for Reproducibility
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Fixed seed handed to the train_test_split calls below via their random_state argument
random_state = 42

# Data Analysis

## Remove Duplicates from Original Dataset

In [None]:
# Read the complete tab-separated dataset from disk into a DataFrame
# (read_table uses "\t" as its default delimiter) and preview the first rows
PATH = "../data/available_metahate.tsv"
df = pd.read_table(PATH)
df.head(5)

In [None]:
# Show how many rows and columns the DataFrame currently holds
shape_report = f"Num Rows: {df.shape[0]}, Num Cols: {df.shape[1]}"
print(shape_report)

In [None]:
# Sanity check: keep only the first row for each distinct "text" value,
# then renumber the remaining rows from 0 so the index has no gaps
df = (
    df
    .drop_duplicates(subset=["text"], keep="first")
    .reset_index(drop=True)
)
df.head(5)

In [None]:
# Show the row and column counts again so they can be compared with the earlier output
shape_report = f"Num Rows: {df.shape[0]}, Num Cols: {df.shape[1]}"
print(shape_report)

## Investigate Class Imbalance

In [None]:
# Count how often each value of the "label" column occurs; the resulting
# Series is displayed so any class imbalance is visible at a glance
label_column = df["label"]
label_counts = label_column.value_counts()
label_counts

In [None]:
# Turn the counts Series into a two-column DataFrame by moving its index
# into a regular column (drop=False keeps the index values)
label_counts = label_counts.reset_index(drop=False)
label_counts

In [None]:
# Give the two columns explicit names that the plotting cell refers to
renamed_columns = ["label", "count"]
label_counts.columns = renamed_columns
label_counts.head(5)

In [None]:
# Bar chart of the per-label counts to visualise the class imbalance.
# sns.barplot returns the Axes it drew on, so the title and axis labels are
# set through that object instead of the pyplot state machine.
ax = sns.barplot(data=label_counts, x="label", y="count", color="skyblue")
ax.set_title("Class Distribution")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()

# Create Train-Dev-Test Splits

In [None]:
# Down-sample to a 3% subset, stratified on the "label" column; the
# remaining 97% returned by train_test_split is discarded
sample_fraction = 0.03
df, _ = train_test_split(
    df,
    train_size=sample_fraction,
    stratify=df["label"],
    shuffle=True,
    random_state=random_state,
)
df.head(5)

In [None]:
# Stage 1: stratified 80/20 cut into the training set and a temporary hold-out
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    shuffle=True,
    random_state=random_state,
)

# Stage 2: halve the hold-out (again stratified) into dev and test,
# i.e. 10% of the sampled data each
dev_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    shuffle=True,
    random_state=random_state,
)

# Renumber the rows of every split from 0, discarding the old index
train_df, dev_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, dev_df, test_df)
)

# Report the train / dev / test sizes
print(*map(len, (train_df, dev_df, test_df)))

# Save Train-Dev-Test Splits

In [None]:
# Write each split to its own CSV file, omitting the index column.
# DataFrame.to_csv does not create missing parent directories and raises
# OSError when the target folder is absent, so make sure it exists first.
# NOTE(review): the raw dataset is read from "../data/" while the splits are
# written to "data/" relative to the working directory - confirm this is intended.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

train_df.to_csv(output_dir / "train_data.csv", index=False)
dev_df.to_csv(output_dir / "dev_data.csv", index=False)
test_df.to_csv(output_dir / "test_data.csv", index=False)