In [1]:
#Importing the correct functions/libraries 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn import set_config
import pandas as pd
import numpy as np


# Try reading the CSV file with different encodings and keep the first one
# that decodes without error.
# NOTE: 'latin1' maps every possible byte to a character, so it never raises
# UnicodeDecodeError; it acts as the catch-all fallback and any encoding
# listed after it is effectively never attempted.
encodings_to_try = ['utf-8', 'latin1', 'utf-16']
for encoding in encodings_to_try:
    try:
        youtube_data = pd.read_csv('Global_YouTube_Statistics.csv', encoding=encoding)
        break
    except UnicodeDecodeError:
        continue
else:
    # Reached only when no encoding succeeded. Failing here gives a clear
    # message instead of a NameError on the next line (fresh kernel) or the
    # silent reuse of a stale youtube_data from an earlier run (live kernel).
    raise ValueError(
        "Could not decode 'Global_YouTube_Statistics.csv' with any of: "
        + ", ".join(encodings_to_try)
    )

print(youtube_data.shape)
# Displaying the data
youtube_data


(995, 28)


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
1,2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
2,3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
4,5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,...,700000.0,2017.0,Feb,12.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280
991,992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,...,300000.0,2018.0,Sep,14.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
992,993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,...,1000.0,2006.0,Sep,11.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973
993,994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,...,100000.0,2012.0,May,9.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501


In [2]:
# Display the shape of the DataFrame before dropping unimportant columns
print(youtube_data.shape)

columns_to_drop = ['Title', 'Abbreviation', 'created_month', 'rank', 'created_date', 'Latitude', 'Longitude', 'Youtuber' ]
# errors='ignore' keeps this cell re-runnable: youtube_data is rebound below,
# so on a second execution these columns are already gone and a plain drop()
# would raise KeyError.
youtube_data = youtube_data.drop(columns=columns_to_drop, errors='ignore')
# Display the truncated data
youtube_data


(995, 28)


Unnamed: 0,subscribers,video views,uploads,channel_type_rank,video_views_for_the_last_30_days,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days,created_year,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population
0,245000000,2.280000e+11,20082,1.0,2.258000e+09,6800000.00,1.084000e+08,2000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
1,170000000,0.000000e+00,1,7423.0,1.200000e+01,0.04,5.800000e-01,,2006.0,88.2,3.282395e+08,14.70,270663028.0
2,166000000,2.836884e+10,741,1.0,1.348000e+09,4000000.00,6.470000e+07,8000000.0,2012.0,88.2,3.282395e+08,14.70,270663028.0
3,162000000,1.640000e+11,966,1.0,1.975000e+09,5900000.00,9.480000e+07,1000000.0,2006.0,88.2,3.282395e+08,14.70,270663028.0
4,159000000,1.480000e+11,116536,2.0,1.824000e+09,5500000.00,8.750000e+07,1000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,12300000,9.029610e+09,1200,172.0,5.525130e+08,1700000.00,2.650000e+07,700000.0,2017.0,51.3,2.125594e+08,12.08,183241641.0
991,12300000,1.674410e+09,1500,69.0,6.473500e+07,194200.00,3.100000e+06,300000.0,2018.0,28.1,1.366418e+09,5.36,471031528.0
992,12300000,2.214684e+09,2452,1202.0,6.703500e+04,201.00,3.200000e+03,1000.0,2006.0,60.0,6.683440e+07,3.85,55908316.0
993,12300000,3.741235e+08,39,69.0,3.871000e+06,11600.00,1.858000e+05,100000.0,2012.0,67.0,1.028545e+07,6.48,9021165.0


In [4]:
# Build the preprocessing step: drop duplicate rows, report missing values,
# then define a ColumnTransformer that imputes + scales numeric features and
# imputes + one-hot encodes categorical features.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# Duplicate entries in the data (counted once and reused below)
print("Data Duplicates before transformation pipelines:")
duplicates = youtube_data.duplicated().sum()  # Expected 0
print(duplicates)
# Just in case we do have duplicates; rebinding the name instead of using
# inplace=True avoids mutating a frame that an earlier cell already displayed.
if duplicates > 0:
    youtube_data = youtube_data.drop_duplicates()

# Find the number of missing entries in each column of the dataset
missing_values = youtube_data.isna().sum()
print("\nMissing Values Before: ")
print(missing_values)

# Split the dataset into numeric features and categorical features
num_cols = youtube_data.select_dtypes(include='number').columns.to_list()
cat_cols = youtube_data.select_dtypes(exclude='number').columns.to_list()
# "subscribers" is the prediction target used later in the notebook, so it is
# kept out of the scaled features; it still reaches the output unchanged via
# remainder='passthrough' below.
num_cols.remove("subscribers")

# Create pipelines for numeric and categorical columns
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Prefer the new spelling and
# fall back only when the installed version does not know it yet.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), one_hot_encoder)

# Set the estimators
preprocessing = ColumnTransformer([('num', num_pipeline, num_cols),
                                   ('cat', cat_pipeline, cat_cols)],
                                  remainder='passthrough')

preprocessing







# # Double Checking if all missing features have been taken care of 
# missing_values = youtube_prepped.isna().sum()
# print("\nMissing Values After: ")
# missing_values

Data Duplicates before transformation pipelines:
0

Missing Values Before: 
subscribers                                  0
video views                                  0
uploads                                      0
channel_type_rank                           33
video_views_for_the_last_30_days            56
lowest_yearly_earnings                       0
highest_yearly_earnings                      0
subscribers_for_last_30_days               337
created_year                                 5
Gross tertiary education enrollment (%)    123
Population                                 123
Unemployment rate                          123
Urban_population                           123
dtype: int64


In [5]:
# Fit the ColumnTransformer on the data and transform it in a single pass.
# NOTE(review): the transformer is fitted on every row before the later
# train/test split, so the imputation means and scaling statistics also see
# the rows that end up in the test set -- consider fitting on the training
# portion only.
prepped_values = preprocessing.fit_transform(youtube_data)

# Column labels produced by the transformer for the transformed output
feature_names = preprocessing.get_feature_names_out()

# Wrap the transformed values in a labelled DataFrame
youtube_prepped = pd.DataFrame(prepped_values, columns=feature_names)

youtube_prepped

Unnamed: 0,num__video views,num__uploads,num__channel_type_rank,num__video_views_for_the_last_30_days,num__lowest_yearly_earnings,num__highest_yearly_earnings,num__subscribers_for_last_30_days,num__created_year,num__Gross tertiary education enrollment (%),num__Population,num__Unemployment rate,num__Urban_population,remainder__subscribers
0,15.383174,0.319178,-0.389726,5.150913,7.385997,7.347167,3.307009e+00,-1.473769,-1.454504,2.116020,-0.856931,1.705383,245000000.0
1,-0.782738,-0.269118,3.494354,-0.434382,-0.513785,-0.513543,-2.331950e-16,-1.473769,1.005987,-0.230918,1.185215,0.320934,170000000.0
2,1.228702,-0.247439,-0.389726,2.899974,4.133146,4.178227,1.532579e+01,-0.140102,1.005987,-0.230918,1.185215,0.320934,166000000.0
3,10.845374,-0.240847,-0.389726,4.450896,6.340438,6.360952,1.303879e+00,-1.473769,1.005987,-0.230918,1.185215,0.320934,162000000.0
4,9.710924,3.144908,-0.389203,4.077388,5.875745,5.831587,1.303879e+00,-1.473769,-1.454504,2.116020,-0.856931,1.705383,159000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,-0.142510,-0.233992,-0.300239,0.932291,1.461161,1.408125,7.029401e-01,0.971286,-0.504698,-0.492429,0.612364,-0.283106,12300000.0
991,-0.664017,-0.225203,-0.354141,-0.274256,-0.288176,-0.288744,-9.831188e-02,1.193564,-1.454504,2.116020,-0.856931,1.705383,12300000.0
992,-0.625710,-0.197313,0.238781,-0.434216,-0.513551,-0.513311,-6.972478e-01,-1.473769,-0.148520,-0.821859,-1.187085,-1.162918,12300000.0
993,-0.756211,-0.268005,-0.354141,-0.424807,-0.500309,-0.500070,-4.989379e-01,-0.140102,0.138060,-0.949696,-0.612048,-1.486885,12300000.0


In [8]:
# Split the dataset into 80% for training and 20% for testing
from sklearn.model_selection import train_test_split

# 'remainder__subscribers' is the passthrough target column; everything else
# is used as a feature.
X = youtube_prepped.drop(columns=['remainder__subscribers'])
y = youtube_prepped['remainder__subscribers']

# A fixed random_state keeps the split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: the split must account for every row exactly once
assert len(X_train) + len(X_test) == len(X)
# Show the resulting shapes so the recorded cell output is reproduced on re-run
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)



(796, 12) (796,) (199, 12) (199,)
