In [1]:
# import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [2]:
# load the four CSV files into DataFrames (read in the order listed)
random_channel_df, comment_df, all_channels, video_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        'random_channels.csv',
        'comment_df.csv',
        'all_channels_df.csv',
        'video_df_cleaned.csv',
    )
)

In [3]:
# show the column labels of random_channel_df (used to pick columns_to_drop below)
random_channel_df.columns

Index(['id', 'title', 'description', 'publishedAt', 'thumbnails.default.url',
       'customUrl', 'defaultLanguage', 'viewCount', 'subscriberCount',
       'videoCount', 'topicIds', 'topicCategories', 'relatedPlaylists.uploads',
       'category_title'],
      dtype='object')

In [4]:
# columns removed from both channel frames before modelling
columns_to_drop = [
    'id',
    'title',
    'description',
    'publishedAt',
    'thumbnails.default.url',
    'customUrl',
    'defaultLanguage',
    'topicIds',
    'topicCategories',
    'relatedPlaylists.uploads',
]

# random channels: strip the unused columns, then the category label as well
random_channel_ml = (
    random_channel_df
    .drop(columns=columns_to_drop)
    .drop(columns='category_title')
)

# all channels: strip the unused columns but keep category_title
all_channels_ml = all_channels.drop(columns=columns_to_drop)

In [5]:
# Build the label encoding from the untouched all_channels frame rather than
# from all_channels_ml. This cell overwrites all_channels_ml['category_title'],
# so deriving the mapping from that same column would, on a re-run, rebuild
# category_dict from already-encoded integers and lose the category names.
raw_categories = all_channels['category_title']
categories = raw_categories.unique()

# create a dictionary to map the categories to numbers
# (codes are assigned in order of first appearance in the data)
category_dict = {category: code for code, category in enumerate(categories)}

# map the categories to numbers; all_channels_ml was produced by dropping
# columns from all_channels, so the two frames share the same index
all_channels_ml['category_title'] = raw_categories.map(category_dict)

all_channels_ml



Unnamed: 0,viewCount,subscriberCount,videoCount,category_title
0,142468175305,146000000,811,0
1,85183017773,102000000,729,0
2,83311249514,102000000,1030,0
3,72222883101,91600000,64797,0
4,69985080921,89200000,498,0
...,...,...,...,...
105,163749185,1460000,91,9
106,151941207,1020000,1266,9
107,126569583,689000,427,9
108,88299661,542000,257,9


In [6]:
# split the data into training and testing
# Features are listed explicitly instead of "every column except the label".
# all_channels_ml is mutated by other cells (the k-means cell adds a 'cluster'
# column that was fitted with category_title among its inputs), so taking all
# remaining columns would silently change the feature set depending on which
# cells have already been run.
# NOTE(review): the displayed frame shows an 'Unnamed: 0' header; if that is a
# real column (e.g. a saved index) it is excluded here too -- confirm.
feature_columns = ['viewCount', 'subscriberCount', 'videoCount']
X = all_channels_ml[feature_columns]
y = all_channels_ml['category_title']

# hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# fit the model
rf.fit(X_train, y_train)

# predict the category
y_pred = rf.predict(X_test)

# check the accuracy (fraction of test rows whose category was predicted exactly)
print(accuracy_score(y_test, y_pred))

0.6363636363636364


In [8]:
# use kmeans to cluster the channels
from sklearn.cluster import KMeans

# This cell writes a 'cluster' column into all_channels_ml. Strip any such
# column left over from a previous run before fitting, so re-running the cell
# does not cluster on its own earlier output. On a fresh kernel the column
# does not exist and the input is the same frame as before.
# NOTE(review): the input still includes the integer-encoded category_title
# and the features are unscaled -- confirm that this is intended.
cluster_features = all_channels_ml.drop(columns='cluster', errors='ignore')

# create the model (10 clusters, fixed seed for reproducibility)
kmeans = KMeans(n_clusters=10, random_state=42)

# fit the model
kmeans.fit(cluster_features)

# predict the cluster for every row and store it alongside the features
all_channels_ml['cluster'] = kmeans.predict(cluster_features)

all_channels_ml

Unnamed: 0,viewCount,subscriberCount,videoCount,category_title,cluster
0,142468175305,146000000,811,0,3
1,85183017773,102000000,729,0,1
2,83311249514,102000000,1030,0,1
3,72222883101,91600000,64797,0,7
4,69985080921,89200000,498,0,7
...,...,...,...,...,...
105,163749185,1460000,91,9,8
106,151941207,1020000,1266,9,8
107,126569583,689000,427,9,8
108,88299661,542000,257,9,8


In [14]:
# 3D scatter of the channels in all_channels: views vs. subscribers vs. videos.
# Points are colored by category_title (the k-means labels are stored
# separately in all_channels_ml['cluster'] and are not used here).
import plotly.express as px

fig = px.scatter_3d(
    data_frame=all_channels,
    x='viewCount',
    y='subscriberCount',
    z='videoCount',
    color='category_title',
)
fig.show()