In [13]:
import pandas as pd

# Load the already-cleaned comment CSVs (news, YouTube, TikTok — in that order)
cleaned_csv_paths = (
    '../data/cleaned/news_comments_clean.csv',
    '../data/cleaned/youtube_comments_clean.csv',
    '../data/cleaned/tiktok_comments_clean.csv',
)
news_comments, youtube_comments, tiktok_comments = [
    pd.read_csv(csv_path, encoding='utf-8', delimiter=',')
    for csv_path in cleaned_csv_paths
]

In [15]:
# 🔧 Function to safely clean columns before processing
def clean_columns(df):
    """Drop CSV index artifacts and repeated columns from a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of ``df`` without any column whose label starts
        with ``'Unnamed'`` and with only the first occurrence of each
        repeated label kept.
    """
    # Compare labels as text so integer or mixed-type headers do not break
    # the `.str` accessor (it only accepts string-like values).
    labels_as_text = df.columns.astype(str)
    # Remove unnamed columns (index column after to_csv)
    is_unnamed = labels_as_text.str.startswith('Unnamed')
    # Remove duplicated columns (every occurrence after the first)
    is_repeat = df.columns.duplicated()
    keep = ~(is_unnamed | is_repeat)
    # Return a real copy: callers add columns to the result, and writing into
    # a slice of the original frame triggers SettingWithCopyWarning.
    return df.loc[:, keep].copy()

# 🔧 Clean, normalize and tag every per-platform dataframe in a single pass
# (one loop instead of three copy-pasted stanzas keeps the sources consistent)
frames_by_source = {
    'news': news_comments,
    'youtube': youtube_comments,
    'tiktok': tiktok_comments,
}

prepared_frames = {}
for source_name, raw_frame in frames_by_source.items():
    # Apply column cleaning, then rename 'clean_text' into 'comment' if it exists
    # (rename() leaves the frame unchanged when the label is absent)
    frame = clean_columns(raw_frame).rename(columns={'clean_text': 'comment'})

    # Fail with context instead of an opaque KeyError from the column selection below
    if 'comment' not in frame.columns:
        raise KeyError(
            f"'{source_name}' data has neither a 'comment' nor a 'clean_text' column; "
            f"found columns: {list(frame.columns)}"
        )

    # Insert the 'source' column via assign(), which builds a new frame rather than
    # writing into a possible slice, then
    # 📌 keep only 'comment' and 'source' columns
    prepared_frames[source_name] = frame.assign(source=source_name)[['comment', 'source']]

# Rebind the original names so later cells keep working with the prepared frames
news_comments = prepared_frames['news']
youtube_comments = prepared_frames['youtube']
tiktok_comments = prepared_frames['tiktok']

# 🔗 Concatenate all datasets safely
all_comments = pd.concat([news_comments, youtube_comments, tiktok_comments], ignore_index=True)

# 🔀 Shuffle dataset (fixed seed so the order is reproducible)
all_comments = all_comments.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the combined DataFrame to a new CSV file.
# Writing is opt-in (it was disabled in the original cell); the message below
# reports what actually happened instead of always claiming the file was created.
SAVE_COMBINED = False
OUTPUT_PATH = '../data/all_comments_for_labeling.csv'

if SAVE_COMBINED:
    all_comments.to_csv(OUTPUT_PATH, index=False)
    print("✅ Merging data completed successfully. File 'all_comments_for_labeling.csv' created.")
else:
    print(
        f"✅ Merging data completed successfully ({len(all_comments)} comments). "
        "No file written; set SAVE_COMBINED = True to create 'all_comments_for_labeling.csv'."
    )

✅ Merging data completed successfully. File 'all_comments_for_labeling.csv' created.
