In [2]:
import pandas as pd
import numpy as np

# Load the raw dataset through a relative path instead of a machine-specific
# absolute one, so the notebook runs on any checkout of the project.
# NOTE(review): assumes the kernel's working directory is a sibling of `data/`,
# which the '../data/' path used when saving the cleaned file already relies on.
df = pd.read_csv('../data/youtube-top-100-songs-2025.csv')

# Normalise the headers: trim surrounding whitespace, lower-case, and turn
# spaces into underscores so every column can be addressed as snake_case.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')


In [3]:
# Keep only the first occurrence of each fully identical row, then report the
# resulting (rows, columns) shape.
df = df.drop_duplicates(keep='first')
print("Rows after removing duplicates:", df.shape)


Rows after removing duplicates: (100, 13)


In [4]:
# Numeric columns: coerce to numbers (unparseable entries become NaN), then
# replace every NaN with 0.
numeric_cols = ['view_count', 'duration', 'channel_follower_count']
for col in numeric_cols:
    as_number = pd.to_numeric(df[col], errors='coerce')
    df[col] = as_number.fillna(0)

# Text columns: substitute the placeholder 'Unknown' for missing entries.
text_cols = [
    'title', 'fulltitle', 'description', 'categories', 'tags',
    'live_status', 'thumbnail', 'channel', 'channel_url',
]
df[text_cols] = df[text_cols].fillna('Unknown')


In [5]:
# Helper function to convert duration_string (like '4:05', '01:20:15' or a bare '45') into seconds
def convert_duration(x):
    """Convert a clock-style duration string into a number of seconds.

    Parameters
    ----------
    x : str
        Duration written as 'SS', 'MM:SS' or 'HH:MM:SS'.

    Returns
    -------
    int or float
        The total number of seconds as an int, or ``np.nan`` when ``x`` is not
        a string, has more than three colon-separated parts, or contains a
        part that is not an integer.
    """
    try:
        parts = [int(p) for p in x.split(':')]
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x has no .split (e.g. NaN / None);
        # TypeError: x is bytes, whose .split rejects a str separator;
        # ValueError: a part is not an integer.
        # Only parsing failures are swallowed; anything else (including
        # KeyboardInterrupt) propagates instead of being silently hidden.
        return np.nan

    if not 1 <= len(parts) <= 3:
        return np.nan

    # Fold left-to-right in base 60: each step promotes the running total to
    # the next smaller unit, so 1, 2 and 3 parts are handled by one loop.
    seconds = 0
    for part in parts:
        seconds = seconds * 60 + part
    return seconds

# Parse the textual duration of every row into seconds.
df['duration_fixed'] = df['duration_string'].apply(convert_duration)

# Treat a duration of 0 as missing, fall back to the parsed value, and use 0
# only when neither source provides one.
df['duration'] = (
    df['duration']
    .replace(0, np.nan)
    .fillna(df['duration_fixed'])
    .fillna(0)
)


In [6]:
# Duration expressed in minutes (assumes `duration` holds seconds).
df['duration_min'] = df['duration'] / 60

# Bucket each song by length. Intervals are right-closed, so a duration of
# exactly 0 lies outside the first bin and is left as NaN. The last edge is
# np.inf rather than a finite cap so that arbitrarily long videos still land
# in '>10 min' instead of silently becoming NaN.
df['length_category'] = pd.cut(df['duration_min'],
                               bins=[0, 2, 4, 6, 10, np.inf],
                               labels=['<2 min','2–4 min','4–6 min','6–10 min','>10 min'])

# Number of '|'-separated entries in `tags`; the 'Unknown' placeholder counts as 0 tags.
df['tag_count'] = df['tags'].apply(lambda x: len(x.split('|')) if x!='Unknown' else 0)

# Views per follower; the +1 keeps the division defined when the follower count is 0.
df['popularity_score'] = df['view_count'] / (df['channel_follower_count'] + 1)


In [7]:
# Write the cleaned frame to disk, leaving the row index out of the file.
output_path = '../data/youtube_trending_songs_clean.csv'
df.to_csv(output_path, index=False)
print("✅ Cleaned file saved as: youtube_trending_songs_clean.csv")


✅ Cleaned file saved as: youtube_trending_songs_clean.csv
