### Import Files and Merge

In [1]:
%pip install pandas numpy matplotlib seaborn "pyarrow==15.0.0"

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import pyarrow.parquet as pq
import seaborn as sns
import os
import re
import glob

In [3]:
# Import files
FOLDER_PATH = './datasets'
COMMENT_FILE = os.path.join(FOLDER_PATH, 'merged_comments.csv')
comments_df = pd.read_csv(COMMENT_FILE)
VIDEO_FILE = os.path.join(FOLDER_PATH, 'videos.csv')
videos_df = pd.read_csv(VIDEO_FILE)

In [4]:
# Helper functions
# textOriginal
def clean_text(text):
    """Takes raw comment text and prepares it for NLP."""
    if not isinstance(text, str): return ""
    text = text.lower()
    text = re.sub(r'http\S+', '', text)      # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)    # Remove punctuation, numbers, etc.
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# topicCategory
def parse_topics(topic_string):
    """Extracts clean topic names from the Wikipedia URL list."""
    if pd.isna(topic_string): return []
    try:
        topics = re.findall(r'/wiki/([^,\'\]]+)', topic_string)
        return [topic.replace('_', ' ') for topic in topics]
    except:
        return []

# PublishedAt & UpdatedAt
def parse_duration(duration_str):
    """Converts ISO 8601 duration string (e.g., 'PT1M30S') into total seconds (e.g., 90)."""
    if not isinstance(duration_str, str) or 'P' not in duration_str: return None
    try:
        duration_str = duration_str.replace('P', '').replace('T', '')
        hours = re.search(r'(\d+)H', duration_str)
        minutes = re.search(r'(\d+)M', duration_str)
        seconds = re.search(r'(\d+)S', duration_str)
        total_seconds = 0
        if hours: total_seconds += int(hours.group(1)) * 3600
        if minutes: total_seconds += int(minutes.group(1)) * 60
        if seconds: total_seconds += int(seconds.group(1))
        return total_seconds
    except:
        return None

In [5]:
# Define processed chunks path
PROCESSED_DIR = os.path.join(FOLDER_PATH, 'processed_chunks/')
os.makedirs(PROCESSED_DIR, exist_ok=True)
    
# Process merged_comments.csv into chunks
chunk_size = 100000
comment_chunks_iterator = pd.read_csv(COMMENT_FILE, chunksize=chunk_size)
chunk_num = 0

print("\nStarting the main processing loop. This will take several minutes...")

for chunk in comment_chunks_iterator:
    chunk_num += 1
    print(f"  > Processing chunk {chunk_num}...")

    # A. Merge the comment chunk with our videos lookup table
    merged_chunk = pd.merge(chunk, videos_df, on='videoId', how='left', suffixes=('_comment', '_video'))

    # B. Clean up missing text to prevent errors
    text_cols = ['textOriginal', 'title', 'description', 'tags']
    for col in text_cols:
        if col in merged_chunk.columns:
            # MODIFICATION 1: Avoid 'inplace=True' which is being deprecated. This is safer.
            merged_chunk[col] = merged_chunk[col].fillna("")

    # C. Convert date columns from text to actual datetime objects
    merged_chunk['publishedAt_comment'] = pd.to_datetime(merged_chunk['publishedAt_comment'], errors='coerce')
    merged_chunk['publishedAt_video'] = pd.to_datetime(merged_chunk['publishedAt_video'], errors='coerce')

    # D. Create new features using our helper functions
    merged_chunk['comment_length'] = merged_chunk['textOriginal'].str.len()
    merged_chunk['is_reply'] = merged_chunk['parentCommentId'].notna()
    merged_chunk['cleaned_text'] = merged_chunk['textOriginal'].apply(clean_text)
    merged_chunk['video_topics'] = merged_chunk['topicCategories'].apply(parse_topics)
    merged_chunk['duration_seconds'] = merged_chunk['contentDuration'].apply(parse_duration)

    # MODIFICATION 2 (THE FIX): Convert the 'video_topics' list into a simple string.
    # We join the list with commas. If a cell is not a list, it becomes an empty string.
    merged_chunk['video_topics'] = merged_chunk['video_topics'].apply(
        lambda x: ', '.join(x) if isinstance(x, list) else ''
    )

    # E. Save the final, processed chunk to a new Parquet file
    output_file = os.path.join(PROCESSED_DIR, f'processed_chunk_{chunk_num}.parquet')
    merged_chunk.to_parquet(output_file, index=False) # Added index=False as it's good practice

print(f"\n✅ SUCCESS! Processing is complete.")
print(f"Saved {chunk_num} fully processed Parquet files to the folder: '{PROCESSED_DIR}'")


Starting the main processing loop. This will take several minutes...


  > Processing chunk 1...
  > Processing chunk 2...
  > Processing chunk 3...
  > Processing chunk 4...
  > Processing chunk 5...
  > Processing chunk 6...
  > Processing chunk 7...
  > Processing chunk 8...
  > Processing chunk 9...
  > Processing chunk 10...
  > Processing chunk 11...
  > Processing chunk 12...
  > Processing chunk 13...
  > Processing chunk 14...
  > Processing chunk 15...
  > Processing chunk 16...
  > Processing chunk 17...
  > Processing chunk 18...
  > Processing chunk 19...
  > Processing chunk 20...
  > Processing chunk 21...
  > Processing chunk 22...
  > Processing chunk 23...
  > Processing chunk 24...
  > Processing chunk 25...
  > Processing chunk 26...
  > Processing chunk 27...
  > Processing chunk 28...
  > Processing chunk 29...
  > Processing chunk 30...
  > Processing chunk 31...
  > Processing chunk 32...
  > Processing chunk 33...
  > Processing chunk 34...
  > Processing chunk 35...
  > Processing chunk 36...
  > Processing chunk 37...
  > Proces

### Append Text Columns into a New Column

In [6]:
# Define the path to your processed chunks
PROCESSED_DIR = 'datasets/processed_chunks/'

# We only need to inspect the first file to see the schema
first_chunk_file = os.path.join(PROCESSED_DIR, 'processed_chunk_1.parquet')

# Check if the file exists before trying to read it
if os.path.exists(first_chunk_file):
    # Read the metadata from the Parquet file
    parquet_file = pq.ParquetFile(first_chunk_file)
    
    # Print the column names from the file's schema
    print("Columns in the Parquet file:")
    print(parquet_file.schema.names)
else:
    print(f"Error: The file '{first_chunk_file}' was not found.")
    print("Please make sure your first processing script ran successfully.")

Columns in the Parquet file:
['kind_comment', 'commentId', 'channelId_comment', 'videoId', 'authorId', 'textOriginal', 'parentCommentId', 'likeCount_comment', 'publishedAt_comment', 'updatedAt', 'quarter', 'kind_video', 'publishedAt_video', 'channelId_video', 'title', 'description', 'tags', 'defaultLanguage', 'defaultAudioLanguage', 'contentDuration', 'viewCount', 'likeCount_video', 'favouriteCount', 'commentCount', 'topicCategories', 'comment_length', 'is_reply', 'cleaned_text', 'video_topics', 'duration_seconds']


In [7]:
# Define the directory where your processed chunks are stored
PROCESSED_DIR = 'datasets/processed_chunks/'

# Get a list of all the chunked Parquet files
parquet_files = sorted(glob.glob(os.path.join(PROCESSED_DIR, 'processed_chunk_*.parquet')))

print(f"Found {len(parquet_files)} Parquet files to update in place.")

# Loop through each file chunk
for i, file_path in enumerate(parquet_files):
    print(f"  > Updating chunk {i+1}/{len(parquet_files)}: {os.path.basename(file_path)}")
    
    # Load one chunk (this is memory-safe)
    df_chunk = pd.read_parquet(file_path)

    # --- Your concatenation logic (same as before) ---
    columns_to_concat = ['textOriginal', 'title', 'description', 'tags', 'video_topics']
    
    for col in columns_to_concat:
        if col in df_chunk.columns:
            df_chunk[col] = df_chunk[col].fillna("").astype(str)
        else:
            df_chunk[col] = ""

    # Create the new 'textAvailable' column
    df_chunk["textAvailable"] = (
        df_chunk["textOriginal"] + " " +
        df_chunk["title"] + " " +
        df_chunk["description"] + " " +
        df_chunk["tags"] + " " +
        df_chunk["video_topics"]
    ).str.strip()
    
    # --- Overwrite the original file with the updated DataFrame ---
    # We save back to the *exact same file_path*
    df_chunk.to_parquet(file_path, index=False)

print(f"\n✅ SUCCESS! All {len(parquet_files)} chunk files have been updated with the 'textAvailable' column.")

Found 48 Parquet files to update in place.
  > Updating chunk 1/48: processed_chunk_1.parquet


  > Updating chunk 2/48: processed_chunk_10.parquet
  > Updating chunk 3/48: processed_chunk_11.parquet
  > Updating chunk 4/48: processed_chunk_12.parquet
  > Updating chunk 5/48: processed_chunk_13.parquet
  > Updating chunk 6/48: processed_chunk_14.parquet
  > Updating chunk 7/48: processed_chunk_15.parquet
  > Updating chunk 8/48: processed_chunk_16.parquet
  > Updating chunk 9/48: processed_chunk_17.parquet
  > Updating chunk 10/48: processed_chunk_18.parquet
  > Updating chunk 11/48: processed_chunk_19.parquet
  > Updating chunk 12/48: processed_chunk_2.parquet
  > Updating chunk 13/48: processed_chunk_20.parquet
  > Updating chunk 14/48: processed_chunk_21.parquet
  > Updating chunk 15/48: processed_chunk_22.parquet
  > Updating chunk 16/48: processed_chunk_23.parquet
  > Updating chunk 17/48: processed_chunk_24.parquet
  > Updating chunk 18/48: processed_chunk_25.parquet
  > Updating chunk 19/48: processed_chunk_26.parquet
  > Updating chunk 20/48: processed_chunk_27.parquet
  

In [8]:
# Define the path to your processed chunks directory
PROCESSED_DIR = 'datasets/processed_chunks/'

# Choose any chunk file to inspect, for example, the first one
file_to_check = os.path.join(PROCESSED_DIR, 'processed_chunk_1.parquet')

print(f"Inspecting file: {os.path.basename(file_to_check)}")

# Load just this single, small chunk
df_one_chunk = pd.read_parquet(file_to_check)

# 1. Check the columns
print("\n✅ Columns in the updated chunk:")
print(df_one_chunk.columns)

# 2. Check the top 5 rows
print("\n✅ Top 5 rows of the updated chunk:")
df_one_chunk.head()

Inspecting file: processed_chunk_1.parquet



✅ Columns in the updated chunk:
Index(['kind_comment', 'commentId', 'channelId_comment', 'videoId', 'authorId',
       'textOriginal', 'parentCommentId', 'likeCount_comment',
       'publishedAt_comment', 'updatedAt', 'quarter', 'kind_video',
       'publishedAt_video', 'channelId_video', 'title', 'description', 'tags',
       'defaultLanguage', 'defaultAudioLanguage', 'contentDuration',
       'viewCount', 'likeCount_video', 'favouriteCount', 'commentCount',
       'topicCategories', 'comment_length', 'is_reply', 'cleaned_text',
       'video_topics', 'duration_seconds', 'textAvailable'],
      dtype='object')

✅ Top 5 rows of the updated chunk:


Unnamed: 0,kind_comment,commentId,channelId_comment,videoId,authorId,textOriginal,parentCommentId,likeCount_comment,publishedAt_comment,updatedAt,...,likeCount_video,favouriteCount,commentCount,topicCategories,comment_length,is_reply,cleaned_text,video_topics,duration_seconds,textAvailable
0,youtube#comment,1781382,14492,74288,2032536,PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...,,0,2023-08-15 21:48:52+00:00,2023-08-15 21:48:52+00:00,...,307922.0,0.0,5901.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,49,False,please lesbian flag i beg you you would rock it,"Lifestyle (sociology), Physical attractiveness",29.0,PLEASE LESBIAN FLAG I BEG YOU \n\nYou would ro...
1,youtube#comment,289571,14727,79618,3043229,Apply mashed potato juice and mixed it with curd,3198066.0,0,2023-10-02 13:08:22+00:00,2023-10-02 13:08:22+00:00,...,55043.0,0.0,164.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,48,True,apply mashed potato juice and mixed it with curd,"Lifestyle (sociology), Physical attractiveness",60.0,Apply mashed potato juice and mixed it with cu...
2,youtube#comment,569077,3314,51826,917006,69 missed calls from mars👽,,0,2024-05-31 12:03:12+00:00,2024-05-31 12:03:12+00:00,...,313755.0,0.0,4226.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,26,False,missed calls from mars,"Lifestyle (sociology), Physical attractiveness",20.0,69 missed calls from mars👽 How To Make Small E...
3,youtube#comment,2957962,5008,58298,1853470,Baaa,,0,2024-02-13 15:48:37+00:00,2024-02-13 15:48:37+00:00,...,11349.0,0.0,286.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,4,False,baaa,"Lifestyle (sociology), Physical attractiveness",25.0,Baaa 20sec beauty test: BLUSH PLACEMENT for YO...
4,youtube#comment,673093,21411,1265,2584166,you look like raven from phenomena raven no cap,,0,2020-02-15 22:28:44+00:00,2020-02-15 22:28:44+00:00,...,504342.0,0.0,19920.0,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,47,False,you look like raven from phenomena raven no cap,"Lifestyle (sociology), Physical attractiveness",1025.0,you look like raven from phenomena raven no ca...
