In [19]:
#Importing the correct functions/libraries 
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# Load the dataset, trying several candidate encodings in priority order and
# keeping the first one that decodes without error.
#
# NOTE: 'latin1' maps every possible byte value to a character, so it can never
# raise UnicodeDecodeError. In practice this means the 'utf-16' entry below is
# never reached, and any file that is not valid UTF-8 is read as latin1.
# NOTE(review): the garbled channel name shown in the displayed rows
# ("Natan por Aï¿") suggests the latin1 fallback is what actually loaded this
# file -- confirm the file's real encoding if exact channel names matter.
csv_path = 'Global_YouTube_Statistics.csv'
encodings_to_try = ['utf-8', 'latin1', 'utf-16']

last_decode_error = None
for encoding in encodings_to_try:
    try:
        youtube_data = pd.read_csv(csv_path, encoding=encoding)
        break
    except UnicodeDecodeError as exc:
        # Remember the failure so it can be chained below if nothing works.
        last_decode_error = exc
else:
    # No encoding succeeded. Fail loudly here instead of letting the next line
    # hit a NameError (fresh kernel) or silently reuse a stale `youtube_data`
    # left over from an earlier run.
    raise ValueError(
        f"Could not decode {csv_path!r} with any of these encodings: {encodings_to_try}"
    ) from last_decode_error

print(youtube_data.shape)
#Displaying the data 
youtube_data


(995, 28)


Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
1,2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
2,3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
4,5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,...,700000.0,2017.0,Feb,12.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280
991,992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,...,300000.0,2018.0,Sep,14.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
992,993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,...,1000.0,2006.0,Sep,11.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973
993,994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,...,100000.0,2012.0,May,9.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501


In [20]:
# Display the shape of the DataFrame before dropping the unimportant columns
print(youtube_data.shape)

# Columns removed from the dataset before preprocessing.
columns_to_drop = ['Title', 'Abbreviation', 'created_month', 'created_date', 'Latitude', 'Longitude']

# This cell reassigns `youtube_data`, so on a second run the columns are already
# gone. errors='ignore' makes the cell safe to re-run (a plain drop would raise
# KeyError). Trade-off: a misspelled column name in the list above is skipped
# silently, so double-check the list when editing it.
youtube_data = youtube_data.drop(columns=columns_to_drop, errors='ignore')

#Display the truncated data 
youtube_data


(995, 28)


Unnamed: 0,rank,Youtuber,subscribers,video views,category,uploads,Country,channel_type,video_views_rank,country_rank,...,lowest_monthly_earnings,highest_monthly_earnings,lowest_yearly_earnings,highest_yearly_earnings,subscribers_for_last_30_days,created_year,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population
0,1,T-Series,245000000,2.280000e+11,Music,20082,India,Music,1.0,1.0,...,564600.0,9000000.00,6800000.00,1.084000e+08,2000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
1,2,YouTube Movies,170000000,0.000000e+00,Film & Animation,1,United States,Games,4055159.0,7670.0,...,0.0,0.05,0.04,5.800000e-01,,2006.0,88.2,3.282395e+08,14.70,270663028.0
2,3,MrBeast,166000000,2.836884e+10,Entertainment,741,United States,Entertainment,48.0,1.0,...,337000.0,5400000.00,4000000.00,6.470000e+07,8000000.0,2012.0,88.2,3.282395e+08,14.70,270663028.0
3,4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,966,United States,Education,2.0,2.0,...,493800.0,7900000.00,5900000.00,9.480000e+07,1000000.0,2006.0,88.2,3.282395e+08,14.70,270663028.0
4,5,SET India,159000000,1.480000e+11,Shows,116536,India,Entertainment,3.0,2.0,...,455900.0,7300000.00,5500000.00,8.750000e+07,1000000.0,2006.0,28.1,1.366418e+09,5.36,471031528.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,991,Natan por Aï¿,12300000,9.029610e+09,Sports,1200,Brazil,Entertainment,525.0,55.0,...,138100.0,2200000.00,1700000.00,2.650000e+07,700000.0,2017.0,51.3,2.125594e+08,12.08,183241641.0
991,992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,1500,India,Games,6141.0,125.0,...,16200.0,258900.00,194200.00,3.100000e+06,300000.0,2018.0,28.1,1.366418e+09,5.36,471031528.0
992,993,Panda,12300000,2.214684e+09,,2452,United Kingdom,Games,129005.0,867.0,...,17.0,268.00,201.00,3.200000e+03,1000.0,2006.0,60.0,6.683440e+07,3.85,55908316.0
993,994,RobTopGames,12300000,3.741235e+08,Gaming,39,Sweden,Games,35112.0,4.0,...,968.0,15500.00,11600.00,1.858000e+05,100000.0,2012.0,67.0,1.028545e+07,6.48,9021165.0


In [22]:
# Preprocess the data: remove duplicate rows, then impute missing values, scale
# the numeric features and one-hot encode the categorical features.
# Everything used here (ColumnTransformer, make_pipeline, SimpleImputer,
# StandardScaler, OneHotEncoder, pd) is imported in the notebook's top import cell.

# --- Duplicate rows --------------------------------------------------------
print("Data Duplicates before transformation pipelines:")
duplicates = youtube_data.duplicated().sum() #Expected 0 
print(duplicates)
# Just in case we do have duplicates: reuse the count computed above and
# reassign rather than mutating the frame in place.
if duplicates > 0:
    youtube_data = youtube_data.drop_duplicates()

# --- Missing values before preprocessing -----------------------------------
missing_values = youtube_data.isna().sum()
print("\nMissing Values Before: ")
print(missing_values)

# --- Column groups ---------------------------------------------------------
# Split the dataset into numeric features and categorical (non-numeric) features.
# NOTE(review): `cat_cols` picks up every non-numeric column, including
# 'Youtuber', which looks like a per-channel name in the displayed rows. One-hot
# encoding it likely adds roughly one column per channel -- confirm whether it
# should be dropped before encoding.
num_cols = youtube_data.select_dtypes(include='number').columns.to_list()
cat_cols = youtube_data.select_dtypes(exclude='number').columns.to_list()

# --- Pipelines -------------------------------------------------------------
# Numeric: fill gaps with the column mean, then standardize.
num_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# Categorical: fill gaps with the most frequent value, then one-hot encode.
# The encoder is left at its default (sparse) output on purpose: the old
# `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
# Dense output is requested on the ColumnTransformer instead (see below), which
# works the same way on old and new scikit-learn versions.
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

# Set the estimators. sparse_threshold=0 makes the transformer always return a
# dense array, giving the same result the original `sparse=False` encoder did.
preprocessing = ColumnTransformer(
    [('num', num_pipeline, num_cols),
     ('cat', cat_pipeline, cat_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)

# --- Fit and transform -----------------------------------------------------
# Running our data through the pipeline
youtube_prepped = preprocessing.fit_transform(youtube_data)

# Attach the generated feature names, and keep the source index so rows stay
# aligned with `youtube_data` even if duplicate rows were dropped above.
feature_names = preprocessing.get_feature_names_out()
youtube_prepped = pd.DataFrame(
    data=youtube_prepped,
    columns=feature_names,
    index=youtube_data.index,
)

# Double-check that all missing values have been taken care of.
assert not youtube_prepped.isna().any().any(), "Missing values remain after preprocessing"

youtube_prepped

Data Duplicates before transformation pipelines:
0

Missing Values Before: 
rank                                         0
Youtuber                                     0
subscribers                                  0
video views                                  0
category                                    46
uploads                                      0
Country                                    122
channel_type                                30
video_views_rank                             1
country_rank                               116
channel_type_rank                           33
video_views_for_the_last_30_days            56
lowest_monthly_earnings                      0
highest_monthly_earnings                     0
lowest_yearly_earnings                       0
highest_yearly_earnings                      0
subscribers_for_last_30_days               337
created_year                                 5
Gross tertiary education enrollment (%)    123
Population                     



Unnamed: 0,num__rank,num__subscribers,num__video views,num__uploads,num__video_views_rank,num__country_rank,num__channel_type_rank,num__video_views_for_the_last_30_days,num__lowest_monthly_earnings,num__highest_monthly_earnings,...,cat__channel_type_Entertainment,cat__channel_type_Film,cat__channel_type_Games,cat__channel_type_Howto,cat__channel_type_Music,cat__channel_type_News,cat__channel_type_Nonprofit,cat__channel_type_People,cat__channel_type_Sports,cat__channel_type_Tech
0,-1.730311,12.674193,15.383174,0.319178,-0.407113,-0.332651,-0.389726,5.150913,7.347462,7.325663,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-1.726829,8.392710,-0.782738,-0.269118,2.571529,6.292657,3.494354,-0.434382,-0.513573,-0.513750,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.723348,8.164364,1.228702,-0.247439,-0.407078,-0.332651,-0.389726,2.899974,4.178543,4.189898,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.719866,7.936019,10.845374,-0.240847,-0.407112,-0.331787,-0.389726,4.450896,6.361700,6.367513,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.716385,7.764759,9.710924,3.144908,-0.407111,-0.331787,-0.389203,4.077388,5.834011,5.844885,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,1.716385,-0.609821,-0.142510,-0.233992,-0.406728,-0.286000,-0.300239,0.932291,1.409220,1.402551,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
991,1.719866,-0.609821,-0.664017,-0.225203,-0.402603,-0.225526,-0.354141,-0.274256,-0.288017,-0.288236,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
992,1.723348,-0.609821,-0.625710,-0.197313,-0.312355,0.415493,0.238781,-0.434216,-0.513336,-0.513516,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
993,1.726829,-0.609821,-0.756211,-0.268005,-0.381322,-0.330059,-0.354141,-0.424807,-0.500095,-0.500248,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
