In [232]:
#Importing the correct functions/libraries 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn import set_config
import pandas as pd
import numpy as np


# Try reading the CSV file with different encodings to find the correct one
encodings_to_try = ['utf-8', 'latin1', 'utf-16']
for encoding in encodings_to_try:
    try:
        youtube_data = pd.read_csv('Global_YouTube_Statistics.csv', encoding=encoding)
        break
    except UnicodeDecodeError:
        continue
        
print(youtube_data.shape)
#Displaying the data 
youtube_data

(995, 28)


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
1,2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
2,3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
4,5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,...,700000.0,2017.0,Feb,12.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280
991,992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,...,300000.0,2018.0,Sep,14.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
992,993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,...,1000.0,2006.0,Sep,11.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973
993,994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,...,100000.0,2012.0,May,9.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501


In [234]:
# Display the shape of the DataFrame before unimportant dropping columns
print(youtube_data.shape)

columns_to_drop = ['Title', 'Abbreviation', 'created_month', 'rank', 'created_date', 'Latitude', 'Longitude', 'Youtuber' ]
youtube_data = youtube_data.drop(columns=columns_to_drop)
#Display the truncated data 
youtube_data

(995, 28)


Unnamed: 0,subscribers,video views,category,uploads,Country,channel_type,video_views_rank,country_rank,channel_type_rank,video_views_for_the_last_30_days,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days,created_year,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population
0,245000000,2.280000e+11,Music,20082,India,Music,1.0,1.0,1.0,2.258000e+09,564600.0,9000000.00,6800000.00,1.084000e+08,2000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
1,170000000,0.000000e+00,Film & Animation,1,United States,Games,4055159.0,7670.0,7423.0,1.200000e+01,0.0,0.05,0.04,5.800000e-01,,2006.0,88.2,3.282395e+08,14.70,270663028.0
2,166000000,2.836884e+10,Entertainment,741,United States,Entertainment,48.0,1.0,1.0,1.348000e+09,337000.0,5400000.00,4000000.00,6.470000e+07,8000000.0,2012.0,88.2,3.282395e+08,14.70,270663028.0
3,162000000,1.640000e+11,Education,966,United States,Education,2.0,2.0,1.0,1.975000e+09,493800.0,7900000.00,5900000.00,9.480000e+07,1000000.0,2006.0,88.2,3.282395e+08,14.70,270663028.0
4,159000000,1.480000e+11,Shows,116536,India,Entertainment,3.0,2.0,2.0,1.824000e+09,455900.0,7300000.00,5500000.00,8.750000e+07,1000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,12300000,9.029610e+09,Sports,1200,Brazil,Entertainment,525.0,55.0,172.0,5.525130e+08,138100.0,2200000.00,1700000.00,2.650000e+07,700000.0,2017.0,51.3,2.125594e+08,12.08,183241641.0
991,12300000,1.674410e+09,People & Blogs,1500,India,Games,6141.0,125.0,69.0,6.473500e+07,16200.0,258900.00,194200.00,3.100000e+06,300000.0,2018.0,28.1,1.366418e+09,5.36,471031528.0
992,12300000,2.214684e+09,,2452,United Kingdom,Games,129005.0,867.0,1202.0,6.703500e+04,17.0,268.00,201.00,3.200000e+03,1000.0,2006.0,60.0,6.683440e+07,3.85,55908316.0
993,12300000,3.741235e+08,Gaming,39,Sweden,Games,35112.0,4.0,69.0,3.871000e+06,968.0,15500.00,11600.00,1.858000e+05,100000.0,2012.0,67.0,1.028545e+07,6.48,9021165.0


In [235]:
#Scale down the subscribers because larger values will be harder to work with and divide by 10,000,000

youtube_data['subscribers_scaled by 10,000,000'] = youtube_data['subscribers'] / 10_000_000
youtube_data.drop('subscribers', axis=1, inplace=True) 
youtube_data.head()


Unnamed: 0,video views,category,uploads,Country,channel_type,video_views_rank,country_rank,channel_type_rank,video_views_for_the_last_30_days,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days,created_year,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,"subscribers_scaled by 10,000,000"
0,228000000000.0,Music,20082,India,Music,1.0,1.0,1.0,2258000000.0,564600.0,9000000.0,6800000.0,108400000.0,2000000.0,2006.0,28.1,1366418000.0,5.36,471031528.0,24.5
1,0.0,Film & Animation,1,United States,Games,4055159.0,7670.0,7423.0,12.0,0.0,0.05,0.04,0.58,,2006.0,88.2,328239500.0,14.7,270663028.0,17.0
2,28368840000.0,Entertainment,741,United States,Entertainment,48.0,1.0,1.0,1348000000.0,337000.0,5400000.0,4000000.0,64700000.0,8000000.0,2012.0,88.2,328239500.0,14.7,270663028.0,16.6
3,164000000000.0,Education,966,United States,Education,2.0,2.0,1.0,1975000000.0,493800.0,7900000.0,5900000.0,94800000.0,1000000.0,2006.0,88.2,328239500.0,14.7,270663028.0,16.2
4,148000000000.0,Shows,116536,India,Entertainment,3.0,2.0,2.0,1824000000.0,455900.0,7300000.0,5500000.0,87500000.0,1000000.0,2006.0,28.1,1366418000.0,5.36,471031528.0,15.9


In [237]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Drop duplicates
print("Data Duplicates before dropping:")
duplicates = youtube_data.duplicated().sum()
print(duplicates)
if duplicates > 0:
    youtube_data.drop_duplicates(inplace=True)

# Check for missing values
missing_values = youtube_data.isna().sum()
print("\nMissing Values Before:")
print(missing_values)


# Split the dataset into number features and categorical features
num_cols = youtube_data.select_dtypes(include='number').columns.to_list()
cat_cols = youtube_data.select_dtypes(exclude='number').columns.to_list()


# Create pipelines for numeric and categorical columns
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse=False))
num_cols.remove("subscribers_scaled by 10,000,000")




# Set the estimators
preprocessing = ColumnTransformer([('num', num_pipeline, num_cols),
                                   ('cat', cat_pipeline, cat_cols)], 
                                    remainder='passthrough')

preprocessing

Data Duplicates before dropping:
0

Missing Values Before:
video views                                  0
category                                    46
uploads                                      0
Country                                    122
channel_type                                30
video_views_rank                             1
country_rank                               116
channel_type_rank                           33
video_views_for_the_last_30_days            56
lowest_monthly_earnings                      0
highest_monthly_earnings                     0
lowest_yearly_earnings                       0
highest_yearly_earnings                      0
subscribers_for_last_30_days               337
created_year                                 5
Gross tertiary education enrollment (%)    123
Population                                 123
Unemployment rate                          123
Urban_population                           123
subscribers_scaled by 10,000,000             0
d

In [238]:
#Apply the pipeline 

youtube_prepped = preprocessing.fit_transform(youtube_data)

# Scikit-learn strips the column headers, so just add them back on afterward.
feature_names=preprocessing.get_feature_names_out()
youtube_prepped = pd.DataFrame(data=youtube_prepped, columns=feature_names)

youtube_prepped




Unnamed: 0,num__video views,num__uploads,num__video_views_rank,num__country_rank,num__channel_type_rank,num__video_views_for_the_last_30_days,num__lowest_monthly_earnings,num__highest_monthly_earnings,num__lowest_yearly_earnings,num__highest_yearly_earnings,...,cat__channel_type_Film,cat__channel_type_Games,cat__channel_type_Howto,cat__channel_type_Music,cat__channel_type_News,cat__channel_type_Nonprofit,cat__channel_type_People,cat__channel_type_Sports,cat__channel_type_Tech,"remainder__subscribers_scaled by 10,000,000"
0,15.383174,0.319178,-0.407113,-0.332651,-0.389726,5.150913,7.347462,7.325663,7.385997,7.347167,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,24.50
1,-0.782738,-0.269118,2.571529,6.292657,3.494354,-0.434382,-0.513573,-0.513750,-0.513785,-0.513543,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.00
2,1.228702,-0.247439,-0.407078,-0.332651,-0.389726,2.899974,4.178543,4.189898,4.133146,4.178227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.60
3,10.845374,-0.240847,-0.407112,-0.331787,-0.389726,4.450896,6.361700,6.367513,6.340438,6.360952,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.20
4,9.710924,3.144908,-0.407111,-0.331787,-0.389203,4.077388,5.834011,5.844885,5.875745,5.831587,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,-0.142510,-0.233992,-0.406728,-0.286000,-0.300239,0.932291,1.409220,1.402551,1.461161,1.408125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.23
991,-0.664017,-0.225203,-0.402603,-0.225526,-0.354141,-0.274256,-0.288017,-0.288236,-0.288176,-0.288744,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.23
992,-0.625710,-0.197313,-0.312355,0.415493,0.238781,-0.434216,-0.513336,-0.513516,-0.513551,-0.513311,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.23
993,-0.756211,-0.268005,-0.381322,-0.330059,-0.354141,-0.424807,-0.500095,-0.500248,-0.500309,-0.500070,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.23


In [240]:
#Splitting our clean data for testing and training simple linear regression
from sklearn.model_selection import train_test_split

X = youtube_prepped.drop(columns=["remainder__subscribers_scaled by 10,000,000"])
y = youtube_prepped["remainder__subscribers_scaled by 10,000,000"]
#Assigning the split data for testing and training 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(796, 97) (796,) (199, 97) (199,)


In [241]:
from sklearn.linear_model import Ridge, Lasso

RidgeRegression = Ridge(alpha=1)
ridge_model = RidgeRegression.fit(X_train, y_train)


LassoRegression = Lasso(alpha=1)
lasso_model = LassoRegression.fit(X_train, y_train)


In [244]:
#Testing my Ridge and Lasso Regression 

Ridge_y_predict = ridge_model.predict(X_test)
Lasso_y_predict = lasso_model.predict(X_test)

ridge_mse = mse(y_test, Ridge_y_predict)
lasso_mse=mse(y_test, Lasso_y_predict)

print(f'Ridge Regression MSE: {ridge_mse}')
print(f'Lasso Regression MSE: {lasso_mse}')

from sklearn.metrics import mean_absolute_error as mae
lasso_mae = mae(y_test, Ridge_y_predict)
ridge_mae = mae(y_test, Lasso_y_predict)

print(f'Ridge Regression MAE: {ridge_mae}')
print(f'Lasso Regression MAE: {lasso_mae}')

Ridge Regression MSE: 0.7521196489106446
Lasso Regression MSE: 0.935941998750621
Ridge Regression MAE: 0.7135595416950312
Lasso Regression MAE: 0.6287316420342015
