In [20]:
# Install necessary libraries
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn



In [21]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler # Standardization and Normalization
from sklearn.preprocessing import OneHotEncoder, LabelEncoder # Encoding categorical variables
from sklearn.impute import SimpleImputer # Handling missing values
from sklearn.model_selection import train_test_split # Splitting data into training and testing sets
from sklearn.linear_model import LinearRegression # Linear Regression model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error # Model evaluation metrics    



In [23]:
# Load the raw dataset.
# The CSV location can be overridden with the YOUTUBE_DATA_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    'YOUTUBE_DATA_CSV',
    '/Users/surya/Documents/AIMI - MINI PROJECT/ML - Youtube Data Analysis/Youtube_ML/USvideos.csv',
)
df = pd.read_csv(DATA_PATH)

# Work on a copy so `df` keeps the untouched raw rows (a later cell looks up a
# raw record in `df` directly).
video_data = df.copy()
video_data.head()


Unnamed: 0,video_id,date,views,likes,comments,watch_time_minutes,video_length_minutes,subscribers,category,device,country,ad_revenue_usd
0,vid_3092,2024-09-24 10:50:40.993199,9936,1221.0,320.0,26497.214184,2.862137,228086,Entertainment,TV,IN,203.178237
1,vid_3459,2024-09-22 10:50:40.993199,10017,642.0,346.0,15209.747445,23.738069,736015,Gaming,Tablet,CA,140.880508
2,vid_4784,2024-11-21 10:50:40.993199,10097,1979.0,187.0,57332.658498,26.200634,240534,Education,TV,CA,360.134008
3,vid_4078,2025-01-28 10:50:40.993199,10034,1191.0,242.0,31334.517771,11.77034,434482,Entertainment,Mobile,UK,224.638261
4,vid_3522,2025-04-28 10:50:40.993199,9889,1858.0,477.0,15665.666434,6.635854,42030,Education,Mobile,CA,165.514388


In [24]:
# Exploratory Data Analysis (EDA)
# Only the last expression of a cell is rendered, so intermediate summaries are
# printed explicitly instead of being left as bare (silently discarded) expressions.

# Count fully duplicated rows first so the number removed can be reported.
n_duplicates = video_data.duplicated().sum()
video_data = video_data.drop_duplicates()
print(f"Removed {n_duplicates} duplicate rows; shape is now {video_data.shape}")
print(video_data['category'].value_counts())

# Drop `video_id` (presumably a per-row identifier rather than a feature —
# confirm). errors='ignore' keeps the cell re-runnable: without it a second run
# raises KeyError because the column is already gone.
video_data = video_data.drop(columns=['video_id'], errors='ignore')

# Spot-check one raw record by index label in the untouched `df`.
df.loc[12444]

video_id                                    vid_33
date                    2024-12-24 10:50:40.993199
views                                        10016
likes                                        752.0
comments                                     364.0
watch_time_minutes                    45740.641484
video_length_minutes                      14.12091
subscribers                                 769084
category                                      Tech
device                                      Mobile
country                                         CA
ad_revenue_usd                          287.014268
Name: 12444, dtype: object

In [25]:
# Split the raw `date` string ("YYYY-MM-DD HH:MM:SS.ffffff") on the space into
# a date part and a time part. Both new columns are still plain strings here;
# they are parsed with pd.to_datetime in a later cell.
# The split is computed once and reused for both columns.
date_parts = video_data['date'].str.split(' ')
video_data = (
    video_data
    .assign(
        trending_date=date_parts.str[0],
        publish_time=date_parts.str[1],
    )
    .drop(columns=['date'])
)
video_data

Unnamed: 0,views,likes,comments,watch_time_minutes,video_length_minutes,subscribers,category,device,country,ad_revenue_usd,trending_date,publish_time
0,9936,1221.0,320.0,26497.214184,2.862137,228086,Entertainment,TV,IN,203.178237,2024-09-24,10:50:40.993199
1,10017,642.0,346.0,15209.747445,23.738069,736015,Gaming,Tablet,CA,140.880508,2024-09-22,10:50:40.993199
2,10097,1979.0,187.0,57332.658498,26.200634,240534,Education,TV,CA,360.134008,2024-11-21,10:50:40.993199
3,10034,1191.0,242.0,31334.517771,11.770340,434482,Entertainment,Mobile,UK,224.638261,2025-01-28,10:50:40.993199
4,9889,1858.0,477.0,15665.666434,6.635854,42030,Education,Mobile,CA,165.514388,2025-04-28,10:50:40.993199
...,...,...,...,...,...,...,...,...,...,...,...,...
122395,9853,1673.0,147.0,42075.704885,25.490195,210818,Education,Tablet,US,280.986396,2024-12-14,10:50:40.993199
122396,10128,1709.0,63.0,57563.703040,16.229133,878860,Music,Desktop,UK,354.612981,2024-07-13,10:50:40.993199
122397,10267,700.0,,27549.714659,23.822365,576756,Tech,Tablet,CA,203.643106,2024-06-10,10:50:40.993199
122398,10240,1616.0,106.0,56967.384382,7.753099,585138,Music,Mobile,UK,351.525811,2024-12-22,10:50:40.993199


In [26]:
# Missing-value handling
# Numeric columns are filled with the column mean; text (object) columns are
# filled with their most frequent value.
numerical_cols = video_data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = video_data.select_dtypes(include=['object']).columns

num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# fit_transform hands back a float array, so integer columns in the numeric
# selection come back as floats (a later cell casts some of them to int again).
# NOTE(review): the numeric selection includes `ad_revenue_usd`, and both
# imputers are fitted on the full frame; if a train/test split follows, this
# lets test-set statistics influence the fill values — confirm intent.
video_data[numerical_cols] = num_imputer.fit_transform(video_data[numerical_cols])
# NOTE(review): the object selection also covers the `trending_date` and
# `publish_time` string columns, so a missing date/time would be filled with
# the most common one — confirm that is acceptable.
video_data[categorical_cols] = cat_imputer.fit_transform(video_data[categorical_cols])

# Per-column count of remaining nulls.
video_data.isnull().sum()

views                   0
likes                   0
comments                0
watch_time_minutes      0
video_length_minutes    0
subscribers             0
category                0
device                  0
country                 0
ad_revenue_usd          0
trending_date           0
publish_time            0
dtype: int64

In [27]:
# Feature engineering: expand the date and time strings into numeric parts.
# Each string column is parsed exactly once; previously the time column was
# re-parsed for every extracted component.
trending_ts = pd.to_datetime(video_data['trending_date'])
publish_ts = pd.to_datetime(video_data['publish_time'])

# NOTE(review): every row shown in the earlier output carries the same time of
# day (10:50:40), so Hour/Minutes/Seconds may be constant columns — verify
# before using them as model features.
video_data = (
    video_data
    .assign(
        Year=trending_ts.dt.year,
        Month=trending_ts.dt.month,
        Day=trending_ts.dt.day,
        Hour=publish_ts.dt.hour,
        Minutes=publish_ts.dt.minute,
        Seconds=publish_ts.dt.second,
    )
    # The source string columns are no longer needed once decomposed.
    .drop(columns=['trending_date', 'publish_time'])
    # Imputation left these columns as floats; casting to int truncates any
    # fractional (mean-imputed) value.
    .astype({'likes': int, 'views': int})
)
video_data

Unnamed: 0,views,likes,comments,watch_time_minutes,video_length_minutes,subscribers,category,device,country,ad_revenue_usd,Year,Month,Day,Hour,Minutes,Seconds
0,9936,1221,320.000000,26497.214184,2.862137,228086.0,Entertainment,TV,IN,203.178237,2024,9,24,10,50,40
1,10017,642,346.000000,15209.747445,23.738069,736015.0,Gaming,Tablet,CA,140.880508,2024,9,22,10,50,40
2,10097,1979,187.000000,57332.658498,26.200634,240534.0,Education,TV,CA,360.134008,2024,11,21,10,50,40
3,10034,1191,242.000000,31334.517771,11.770340,434482.0,Entertainment,Mobile,UK,224.638261,2025,1,28,10,50,40
4,9889,1858,477.000000,15665.666434,6.635854,42030.0,Education,Mobile,CA,165.514388,2025,4,28,10,50,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,9853,1673,147.000000,42075.704885,25.490195,210818.0,Education,Tablet,US,280.986396,2024,12,14,10,50,40
122396,10128,1709,63.000000,57563.703040,16.229133,878860.0,Music,Desktop,UK,354.612981,2024,7,13,10,50,40
122397,10267,700,274.349842,27549.714659,23.822365,576756.0,Tech,Tablet,CA,203.643106,2024,6,10,10,50,40
122398,10240,1616,106.000000,56967.384382,7.753099,585138.0,Music,Mobile,UK,351.525811,2024,12,22,10,50,40
