In [4]:
import pandas as pd

# Load each source, tag it with its class (1 = trending, 0 = non-trending)
# and reduce it to the three columns used downstream.
df_trending = (
    pd.read_csv("../data/trending.csv")
    .assign(label=1)
    [['video_title', 'video_description', 'label']]
)

# The non-trending sheet names its columns differently; rename them to the
# trending schema so the two frames can be stacked.
df_non_trending = (
    pd.read_excel("../data/non_trending.xlsx")
    .assign(label=0)
    [['title', 'description', 'label']]
    .rename(columns={'title': 'video_title', 'description': 'video_description'})
)

# Stack both sources and discard rows lacking a title or a description.
df_combined = (
    pd.concat([df_trending, df_non_trending], ignore_index=True)
    .dropna(subset=['video_title', 'video_description'])
)

# Single free-text field (title, one space, description) used as model input.
df_combined = df_combined.assign(
    text=lambda frame: frame['video_title'] + " " + frame['video_description']
)

# Quick look at the merged frame.
print("Combined Dataset Preview:")
print(df_combined.head())

# Sample counts: labels are 0/1, so their sum is the number of trending rows.
total_samples = len(df_combined)
trending_samples = df_combined['label'].sum()
print("Total samples:", total_samples)
print("Trending videos:", trending_samples)
print("Non-trending videos:", total_samples - trending_samples)

Combined Dataset Preview:
                                         video_title  \
0                     BTS: Boy with Luv (Live) - SNL   
1          Star Wars: The Rise of Skywalker – Teaser   
2  Gordon Ramsay Enters An Indian Cooking Competi...   
3                         We Got Married...(Pt. 2/4)   
4             BTS Eat Churros on The Morning Mash Up   

                                   video_description  label  \
0  Musical guest BTS performs "Boy with Luv" on S...      1   
1  Every generation has a legend. Watch the brand...      1   
2  As Gordon's trip in Malaysia comes towards an ...      1   
3  The Day i Committed To My Bestfriend!!\n\n\nFO...      1   
4  The Morning Mash Up crew gifted BTS with their...      1   

                                                text  
0  BTS: Boy with Luv (Live) - SNL Musical guest B...  
1  Star Wars: The Rise of Skywalker – Teaser Ever...  
2  Gordon Ramsay Enters An Indian Cooking Competi...  
3  We Got Married...(Pt. 2/4) The Day 

In [6]:
# Write the merged frame to disk, leaving the row index out of the file.
combined_csv_path = "../data/combined_entertainment_data.csv"
df_combined.to_csv(combined_csv_path, index=False)

In [7]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Read the combined CSV back from disk.
# Note: this always rebinds df_combined, whether or not it is already in memory.
combined_csv_path = "../data/combined_entertainment_data.csv"
df_combined = pd.read_csv(combined_csv_path)

# Step 2: Define text cleaning function
def clean_text(text):
    """Normalise free text into lowercase ASCII alphanumeric tokens.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) yield ``""`` instead of the
         literal words "none" / "nan".
      2. The value is stringified and lowercased.
      3. URLs (``http://``, ``https://`` or ``www.`` at a word boundary) are
         dropped. The boundary keeps words such as "awwww" intact.
      4. ``@mentions`` are dropped. For hashtags only the ``#`` symbol is
         removed; the tag word itself is kept.
      5. Apostrophes are deleted so contractions stay one token
         ("don't" -> "dont"); every other non ``[a-z0-9]`` character becomes a
         space so words separated only by punctuation are not fused
         ("married...(pt" -> "married pt").
      6. Runs of whitespace collapse to one space and the ends are trimmed.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        str: The cleaned text, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"\b(?:https?://|www\.)\S+", " ", text)  # remove URLs
    text = re.sub(r"@\w+", " ", text)                       # remove @mentions
    text = re.sub(r"['\u2019]", "", text)                   # keep contractions joined
    text = re.sub(r"[^a-z0-9\s]", " ", text)                # other symbols (incl. '#') -> space
    return re.sub(r"\s+", " ", text).strip()                # collapse whitespace

# Step 3: Run every raw text through clean_text and keep the result alongside it
df_combined['clean_text'] = df_combined['text'].map(clean_text)

# Step 4: Model inputs (cleaned text) and targets (0/1 label)
X_text, y = df_combined['clean_text'], df_combined['label']

# Step 5: TF-IDF configuration
tfidf_options = {
    'stop_words': 'english',   # drop common English stopwords
    'max_features': 5000,      # cap the vocabulary at 5000 features
    'ngram_range': (1, 2),     # unigrams and bigrams
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Step 6: Learn the vocabulary and vectorise the text in one pass
X = vectorizer.fit_transform(X_text)

# Step 7: Report the resulting shapes
print("✅ TF-IDF matrix shape:", X.shape)
print("✅ Labels shape:", y.shape)

# Persist the frame, now including the clean_text column
cleaned_csv_path = "../data/cleaned_entertainment_data.csv"
df_combined.to_csv(cleaned_csv_path, index=False)

✅ TF-IDF matrix shape: (405511, 5000)
✅ Labels shape: (405511,)


In [8]:
# Rows per class in the combined dataset.
label_counts = df_combined['label'].value_counts()
print(label_counts)

label
1    404464
0      1047
Name: count, dtype: int64


In [None]:
import pandas as pd
import random
import nlpaug.augmenter.word as naw

# Rows wanted per class in the balanced output, and the seed shared by every
# random step in this cell.
TARGET_PER_CLASS = 2000
SEED = 42
random.seed(SEED)  # makes the choice of source rows repeatable across runs

# Load the cleaned dataset. A cleaned text that ended up empty is stored as an
# empty CSV field and is read back as NaN by pandas; such rows carry no text to
# augment or vectorise, so they are dropped here.
df = pd.read_csv("../data/cleaned_entertainment_data.csv")
df = df.dropna(subset=["clean_text"])

# Check class distribution
label_0 = df[df['label'] == 0]
label_1 = df[df['label'] == 1]

print(f"Label 0 count: {len(label_0)}, Label 1 count: {len(label_1)}")

# Augmenter: synonym replacement using WordNet (1 to 5 words per text)
aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_max=5)

# Number of synthetic samples to generate; never negative, even if label 0
# already has TARGET_PER_CLASS rows or more.
num_augmented = max(0, TARGET_PER_CLASS - len(label_0))

print(f"🔄 Generating {num_augmented} synthetic samples for label=0...")

source_texts = label_0["clean_text"].tolist()
if num_augmented and not source_texts:
    raise ValueError("No label=0 rows with text are available to augment.")

augmented_texts = []
failed_augmentations = 0
for _ in range(num_augmented):
    original_text = random.choice(source_texts)
    try:
        augmented = aug.augment(original_text)
    except Exception:
        # Best effort: fall back to the unmodified text, but keep count so the
        # number of exact duplicates is visible. KeyboardInterrupt still stops
        # the loop because only Exception is caught.
        failed_augmentations += 1
        augmented = original_text
    # Depending on the nlpaug version, augment() returns either a string or a
    # list of strings; store a plain string in both cases.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else original_text
    augmented_texts.append(augmented)

if failed_augmentations:
    print(f"⚠️ {failed_augmentations} augmentations failed; original text reused for those rows.")

# Create synthetic DataFrame (non-text columns hold placeholder markers)
synthetic_df = pd.DataFrame({
    "video_title": ["synthetic_title"] * num_augmented,
    "video_description": ["synthetic_description"] * num_augmented,
    "text": ["synthetic_text"] * num_augmented,
    "clean_text": augmented_texts,
    "label": [0] * num_augmented
})

# Combine original label=0 and synthetic rows, then take TARGET_PER_CLASS of them
combined_label_0 = pd.concat([label_0, synthetic_df], ignore_index=True).sample(
    n=TARGET_PER_CLASS, random_state=SEED
)

# Downsample label=1 to TARGET_PER_CLASS (or keep all rows if there are fewer)
label_1_downsampled = label_1.sample(
    n=min(TARGET_PER_CLASS, len(label_1)), random_state=SEED
)

# Final balanced dataset, shuffled
balanced_df = pd.concat([combined_label_0, label_1_downsampled], ignore_index=True).sample(
    frac=1, random_state=SEED
)

# Save to CSV
balanced_df.to_csv("../data/balanced_data_2000_augmented.csv", index=False)

print(f"✅ Synthetic data added. Final dataset shape: {balanced_df.shape}")
print("✅ Saved as 'balanced_data_2000_augmented.csv'")