# Data Preprocessing

In this notebook, I load the synthetic video metadata dataset, fill in missing values, encode the categorical `engagement_level` column, perform a train-test split, and save the processed dataset. These steps prepare the dataset for model training.


In [4]:
# Importing required libraries for data cleaning and preprocessing
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [5]:
# Read the synthetic video metadata CSV from the raw data folder
# (path is relative to the notebook's working directory).
data_path = "../data/raw/synthetic_videos.csv"
df = pd.read_csv(data_path)

# Preview the top five rows as a quick check of the columns that were read
df.head(n=5)


Unnamed: 0,duration,views,likes,comments,engagement_level
0,107,864410,66724,3768,High
1,440,274483,22623,3115,High
2,275,833154,63556,9591,High
3,111,571053,53267,3036,High
4,76,818806,41116,11894,Medium


In [6]:
# Count the missing entries in each column to gauge data quality
df.isna().sum()


duration            0
views               0
likes               0
comments            0
engagement_level    0
dtype: int64

In [7]:
# Impute gaps: every numeric column gets its own mean, the categorical
# column gets its most frequent value.
# NOTE(review): the fill statistics are computed over the whole frame, i.e.
# before the train/test split done later in this notebook — confirm that is
# acceptable, or derive them from the training portion only.
for numeric_column in ('duration', 'views', 'likes', 'comments'):
    df[numeric_column] = df[numeric_column].fillna(df[numeric_column].mean())

# mode() returns the most frequent value(s); [0] takes the first of them
most_common_level = df['engagement_level'].mode()[0]
df['engagement_level'] = df['engagement_level'].fillna(most_common_level)

# Re-count missing entries per column to confirm the result of the fill
df.isnull().sum()


duration            0
views               0
likes               0
comments            0
engagement_level    0
dtype: int64

In [8]:
# Map each 'engagement_level' category to an integer code and store the
# codes in a new 'engagement_label' column (the original column is kept).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['engagement_level'])
df['engagement_label'] = encoded_labels

# Show the original categories next to their integer codes
df[['engagement_level', 'engagement_label']].head()


Unnamed: 0,engagement_level,engagement_label
0,High,0
1,High,0
2,High,0
3,High,0
4,Medium,2


In [9]:
# Separate the numeric input columns (X) from the encoded target (y)
feature_columns = ['duration', 'views', 'likes', 'comments']
X = df[feature_columns]
y = df['engagement_label']

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Row/column counts of the two feature sets
(X_train.shape, X_test.shape)


((8000, 4), (2000, 4))

In [11]:
# Save the cleaned full dataset and the train/test splits in the processed folder.
processed_dir = Path("../data/processed")

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object to write; each one is saved as "<stem>.csv" without the
# index column, in the order listed here.
outputs = {
    "cleaned_full_dataset": df,
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for file_stem, table in outputs.items():
    table.to_csv(processed_dir / f"{file_stem}.csv", index=False)

print("Processed data saved successfully ")


Processed data saved successfully 
