In [5]:
import pandas as pd


# Read the three already-cleaned comment exports, one DataFrame per platform.
# The paths are listed in the same order as the names they are unpacked into.
news_comments, youtube_comments, tiktok_comments = map(
    pd.read_csv,
    (
        '../data/news_comments_clean.csv',
        '../data/youtube_comments_clean.csv',
        '../data/tiktok_comments_clean.csv',
    ),
)

In [7]:
def _prepare_for_labeling(comments, source):
    """Reduce one platform's cleaned export to the two columns used for labeling.

    Parameters
    ----------
    comments : pandas.DataFrame
        Frame as loaded from one of the ``*_comments_clean.csv`` files.
    source : str
        Value written to the ``source`` column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ``comment`` and ``source``.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If the frame has neither a ``clean_text`` nor a ``comment`` column.
    """
    # Drop columns whose name starts with "Unnamed" (the name pandas gives to
    # header-less CSV columns, typically an index saved by an earlier export).
    prepared = comments.loc[:, ~comments.columns.str.contains('^Unnamed')]

    # Keep only the first occurrence of any repeated column name.
    prepared = prepared.loc[:, ~prepared.columns.duplicated()]

    if 'clean_text' in prepared.columns:
        # 'clean_text' is the text we want to label. Remove any pre-existing
        # 'comment' column first; otherwise the rename would leave two columns
        # called 'comment' and the selection below would return both of them.
        prepared = (
            prepared
            .drop(columns='comment', errors='ignore')
            .rename(columns={'clean_text': 'comment'})
        )
    elif 'comment' not in prepared.columns:
        raise KeyError(
            f"{source} data has neither a 'clean_text' nor a 'comment' column; "
            f"found: {list(prepared.columns)}"
        )

    # Keep the text only and tag every row with the platform it came from.
    return prepared[['comment']].assign(source=source)


# Normalise each platform's frame to the shared (comment, source) layout.
news_comments = _prepare_for_labeling(news_comments, 'news')
youtube_comments = _prepare_for_labeling(youtube_comments, 'youtube')
tiktok_comments = _prepare_for_labeling(tiktok_comments, 'tiktok')

# Concatenate all DataFrames into one
all_comments = pd.concat(
    [news_comments, youtube_comments, tiktok_comments],
    ignore_index=True,
)

# Shuffle the rows so the platforms are interleaved; the fixed random_state
# makes the order reproducible across runs.
all_comments = all_comments.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined DataFrame to a new CSV file
all_comments.to_csv('../data/all_comments_for_labeling.csv', index=False)


print("✅ File labeling created succesfully!")

✅ File labeling created succesfully!
