In [4]:
import pandas as pd
from IPython.display import display

# Load the video metadata export into memory.
df = pd.read_csv("filtered_main_data_copy.csv")

# Steps 1 & 2: keep only the columns used downstream, then replace every
# missing value with an empty string so later string handling never sees NaN.
relevant_columns = ['id', 'title', 'description', 'topic_categories', 'category', 'thumbnail', 'tags']
df_filtered = df.loc[:, relevant_columns].fillna('')

# Step 3: Define "distracting" keywords
# Matched as lowercase substrings of "<title> <description>".
distracting_keywords = [
    "funny", "prank", "game", "meme", "entertainment", "fail", "drama",
    "reaction", "challenge", "vlog", "epic", "weird", "random", "amazing", "slander",
    "brainrot", "gyatt", "skibidi", "toilet", "skibidi toilet", "skibidi toilet challenge"
]

# Category IDs that mark a row as distracting regardless of its text.
# NOTE(review): the original comment described these as "Entertainment, Gaming,
# Vlogs". The thumbnails suggest YouTube data; in the YouTube category list 17
# appears to be Sports (People & Blogs is 22) - confirm which ID was intended.
DISTRACTING_CATEGORY_IDS = frozenset({24, 20, 17})


def _category_id(value):
    """Convert a raw category cell to an integer ID, or None if it is not one.

    Accepts ints, integral floats (24.0) and numeric strings ('24'); returns
    None for empty strings, NaN, infinities, non-integral numbers and anything
    else that cannot be read as a whole number.
    """
    try:
        number = float(value)
        whole = int(number)  # raises for NaN (ValueError) and inf (OverflowError)
    except (TypeError, ValueError, OverflowError):
        return None
    return whole if whole == number else None


# Step 4: Function to classify as "distracting" or "not distracting"
def is_distracting(row):
    """Heuristically label one record as distracting (1) or not (0).

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row['title']``, ``row['description']`` and
        ``row['category']`` (for example a DataFrame row or a plain dict).

    Returns
    -------
    int
        1 if any entry of ``distracting_keywords`` occurs in the lowercased
        title/description text, or if the category resolves to one of
        ``DISTRACTING_CATEGORY_IDS``; otherwise 0.

    Notes
    -----
    Keyword matching is plain substring containment, so a keyword also matches
    inside longer words (e.g. "fail" matches "failure").
    """
    text = str(row['title']).lower() + " " + str(row['description']).lower()

    # Check for keywords in title/description
    if any(keyword in text for keyword in distracting_keywords):
        return 1  # Distracting

    # Check for distracting categories. The value is coerced first so that IDs
    # stored as strings or floats ('24', 24.0) are treated the same as 24.
    if _category_id(row['category']) in DISTRACTING_CATEGORY_IDS:
        return 1  # Distracting

    return 0  # Not Distracting

# Label every row with the heuristic: 1 = distracting, 0 = not distracting.
df_filtered['distracting'] = df_filtered.apply(is_distracting, axis=1)

# Step 5: Convert Categorical Features to Numeric
# Each listed text column gets a companion column holding pandas category codes.
encoded_columns = {
    'title': 'title_numeric',
    'description': 'description_numeric',
    'topic_categories': 'topic_categories_numeric',
}
for source_column, code_column in encoded_columns.items():
    as_category = df_filtered[source_column].astype('category')
    df_filtered[code_column] = as_category.cat.codes

# Save preprocessed data for modeling.
# NOTE(review): this is the same file name the cell reads its input from, so
# the input file is overwritten on every run - confirm this is intended.
preprocessed_path = "filtered_main_data_copy.csv"
df_filtered.to_csv(preprocessed_path, index=False)


display(df_filtered)

Unnamed: 0,id,title,description,topic_categories,category,thumbnail,tags,distracting,title_numeric,description_numeric,topic_categories_numeric
0,B0RVWU_nROk,I used a flip phone for 30 days,Go check out my habits course: https://slowgro...,,1,https://i.ytimg.com/vi/B0RVWU_nROk/hqdefault.jpg,,0,2233,1258,0
1,E9se1YqCRaA,Why we’re leaving California.,Head to http://squarespace.com/mattdavella to ...,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,1,https://i.ytimg.com/vi/E9se1YqCRaA/hqdefault.jpg,,0,4745,1357,175
2,AvdW0YKw0XE,We sold all our stuff (seriously).,Watch the trailer to my new Netflix film!! 😮\n...,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,1,https://i.ytimg.com/vi/AvdW0YKw0XE/hqdefault.jpg,,0,4570,4076,164
3,gp3SPyGXKuo,How meditation can change your life.,Get Sam's app here: https://wakingup.com/\nLea...,['https://en.wikipedia.org/wiki/Society'],1,https://i.ytimg.com/vi/gp3SPyGXKuo/hqdefault.jpg,,0,1986,1200,268
4,b2WYP057IOI,What I eat every week to stay healthy.,"Go to http://squarespace.com for a free trial,...","['https://en.wikipedia.org/wiki/Food', 'https:...",1,https://i.ytimg.com/vi/b2WYP057IOI/hqdefault.jpg,,0,4599,1268,87
...,...,...,...,...,...,...,...,...,...,...,...
5126,bCZ24rvV3sI,So I Did the Covid-19 Test at Home.,Get Tested! both for Corona and STD's!,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,24,https://i.ytimg.com/vi/bCZ24rvV3sI/hqdefault.jpg,"['cover-19', 'corona', 'coronates', 'results',...",1,3662,1205,147
5127,Jg3QxzQ5gfs,Getting My First Tattoo! | Post Quarantine Vlog,So they let us out and ya girl went and got a ...,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,24,https://i.ytimg.com/vi/Jg3QxzQ5gfs/hqdefault.jpg,"['Tattoo', 'vlog', 'firstTattoo', 'neck tattoo...",1,1650,3059,175
5128,eCR6yHNnenI,A Week in the Life of an Oxford student ONLINE,Other Oxford related Videos:\n\nMoving into Ox...,['https://en.wikipedia.org/wiki/Lifestyle_(soc...,24,https://i.ytimg.com/vi/eCR6yHNnenI/hqdefault.jpg,"['onlineUniversity', 'OxfordUniversity', 'A we...",1,353,2683,164
5129,d7b3s18HojI,WHO'S MOST LIKELY TO (part 2!),Watch part 1 next: https://youtu.be/mIoZPS0YsWg,['https://en.wikipedia.org/wiki/Entertainment'],24,https://i.ytimg.com/vi/d7b3s18HojI/hqdefault.jpg,"[""who's most likely to"", 'funny', 'blavkgirls'...",1,4537,4056,66
