# YouTube Comments Preprocessing Notebook

This notebook demonstrates how to preprocess YouTube comment data related to mask-wearing during the COVID-19 pandemic. It includes the following steps:
1. Importing the dataset
2. Removing URLs from comments
3. Plotting the yearly distribution of comments
4. Filtering comments based on specific keywords
5. Reformatting comments into the Alpaca prompt format

### Import Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from langdetect import detect

### Define Helper Functions

In [None]:
def remove_urls(text):
    """Strip URLs from a comment.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.`` at a word boundary (matched
    case-insensitively). Requiring the boundary and the ``://`` / ``.``
    keeps ordinary words that merely contain those letters (for example
    "awwwesome") intact.

    Args:
        text: The comment text. Non-string values, such as the NaN that
            pandas uses for an empty CSV cell, are treated as an empty
            comment.

    Returns:
        The text with every URL removed. Whitespace around a removed URL is
        left untouched. Non-string input yields ``''``.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty comment is the
        # most useful stand-in for the downstream string filters.
        return ''
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

def is_thai(text):
    """Return True when langdetect classifies ``text`` as Thai.

    Args:
        text: The comment text to classify.

    Returns:
        True if ``detect(text)`` returns the language code ``'th'``.
        False for any other language, or when detection fails.
    """
    try:
        return detect(text) == 'th'
    except Exception:
        # Detection failure (e.g. text with no usable features, or a
        # non-string value) is treated as "not Thai". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running filter can be interrupted.
        return False

def contains_keywords(text, keywords):
    """Report whether at least one of ``keywords`` occurs in ``text``.

    Membership is tested with the ``in`` operator, so for string input this
    is a case-sensitive substring match; callers that want case-insensitive
    matching must normalise ``text`` themselves.

    Args:
        text: The value to search in.
        keywords: An iterable of candidate keywords.

    Returns:
        True as soon as one keyword is found, otherwise False (including
        when ``keywords`` is empty).
    """
    for candidate in keywords:
        if candidate in text:
            return True
    return False

def show_yearly_distribution(df):
    """Display a bar chart of how many rows fall in each calendar year.

    The year is derived from the ``event_year`` column, which is parsed with
    ``pd.to_datetime``. The input frame is not modified: the years are
    computed on a separate Series. Overwriting ``df['event_year']`` with
    integer years would break a second call, because ``pd.to_datetime``
    interprets plain integers as epoch offsets rather than as years.

    Args:
        df: DataFrame with an ``event_year`` column holding datetimes or
            values parseable by ``pd.to_datetime``.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    years = pd.to_datetime(df['event_year']).dt.year

    # One row per year with its number of occurrences, ordered by year.
    yearly_counts = years.value_counts().reset_index()
    yearly_counts.columns = ['event_year', 'count']
    yearly_counts = yearly_counts.sort_values(by='event_year')

    fig = px.bar(yearly_counts, x='event_year', y='count', title='Yearly Distribution of Data Count',
                 labels={'event_year': 'Year', 'count': 'Count'}, text='count')

    fig.update_layout(
        xaxis=dict(tickmode='linear'),  # one tick per year instead of auto-spaced ticks
        yaxis=dict(title='Count'),
        title=dict(x=0.5)  # centre the title
    )
    fig.show()

### Load Dataset

In [None]:
# Placeholder: replace with the location of the comments CSV file.
file_path = 'PATH_TO_YOUR_COMMENT_DATA'

df = pd.read_csv(file_path)
# Parse the event_year column into pandas datetimes up front.
df = df.assign(event_year=pd.to_datetime(df['event_year']))
df.head()

### Remove URLs from Comments

In [None]:
# Run remove_urls over every comment and store the cleaned text back in place.
df['comment_text'] = df['comment_text'].map(remove_urls)

### Show Yearly Distribution

In [None]:
# Render a bar chart of the number of rows per year in the loaded dataset.
show_yearly_distribution(df)

### Filter Comments Sample Code

In [None]:
# A comment is kept when it contains at least one of these substrings.
us_mask_keywords = ["wear", "mask"]
# NOTE(review): other Thai spellings such as "มาสก์" or "หน้ากาก" may also
# be relevant -- confirm this list covers the intended comments.
thai_mask_keywords = ["ใส่", "มาส์ก"]

# --- US comments: case-insensitive keyword match -------------------------
us_df = df[df['country'] == 'US']
us_keep = us_df['comment_text'].apply(
    # Non-string cells (e.g. NaN) have no .lower(); treat them as no match.
    lambda text: isinstance(text, str)
    and contains_keywords(text.lower(), us_mask_keywords)
)
# astype(bool) keeps the mask a valid boolean indexer even when the
# selection above is empty.
us_df = us_df[us_keep.astype(bool)].reset_index(drop=True)
us_df.head()

# --- Thai comments: keyword match AND detected as Thai -------------------
thai_df = df[df['country'] == 'TH']
thai_keep = thai_df['comment_text'].apply(
    # The cheap substring test runs first so that language detection is only
    # attempted on comments that already mention a keyword. The selected
    # rows are the same as filtering by language first, then by keyword.
    lambda text: isinstance(text, str)
    and contains_keywords(text, thai_mask_keywords)
    and is_thai(text)
)
thai_df = thai_df[thai_keep.astype(bool)].reset_index(drop=True)
thai_df.head()

# Persist both filtered subsets without the positional index column.
us_df.to_csv('filtered_mask_comments_us.csv', index=False)
thai_df.to_csv('filtered_mask_comments_th.csv', index=False)

### Alpaca Prompt Format

In [None]:
def reformat_dataset(df):
    """Build an Alpaca-style (instruction, input) table from comment rows.

    Every output row carries the same fixed instruction, an ``input`` prompt
    that embeds the row's ``comment_text``, and the row's ``event_date``,
    ``news_publisher`` and ``comment_text`` values copied through.

    NOTE(review): the prompt wording refers to "tweets"; confirm this is
    intended for the data passed in.

    Args:
        df: DataFrame with ``comment_text``, ``event_date`` and
            ``news_publisher`` columns.

    Returns:
        A new DataFrame with columns ``instruction``, ``input``,
        ``event_date``, ``news_publisher`` and ``comment_text``, one row per
        input row, on a fresh default index.
    """
    output_columns = ['instruction', 'input', 'event_date', 'news_publisher', 'comment_text']

    # The instruction text is the same for every row, so build it once.
    instruction = (
        "You are a helpful assistant tasked with analyzing the stances of tweet authors in tweets related to a specific topic. "
        "Determine whether each tweet expresses a Favorable, Against, or Neutral stance."
    )

    records = [
        [
            instruction,
            f"Identify the stance of tweet: `{row['comment_text']}` on topic of `face masks`",
            row['event_date'],
            row['news_publisher'],
            row['comment_text'],
        ]
        for _, row in df.iterrows()
    ]

    return pd.DataFrame(records, columns=output_columns)