In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
df = pd.read_csv('youtubers_df_cleaned.csv')
df = df.reset_index()
df.head()

Unnamed: 0,index,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links
0,0,tseries,Music and Dance,249500000.0,India,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...
1,1,MrBeast,"Video Games, Humor",183500000.0,United States,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...
2,2,CoComelon,Education,165500000.0,Unknown,7000000.0,24700.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...
3,3,SETIndia,Unknown,162600000.0,India,15600.0,166.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...
4,4,KidsDianaShow,"Animation, Toys",113500000.0,Unknown,3900000.0,12400.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...


In [23]:
# Combine important columns
def combine_features(data):
    features = []
    for i in range(0, data.shape[0]):
        features.append(data["Categories"][i] + " " + data["Country"][i])
        print(features[i])
    return features
    

In [24]:
df["combined_features"] = combine_features(df)
df

Music and Dance India
Video Games, Humor United States
Education Unknown
Unknown India
Animation, Toys Unknown
Movies, Video Games United States
Toys Unknown
Animation, Toys Unknown
Music and Dance India
Video Games United States
Music and Dance United States
Unknown India
Movies, Animation India
Movies India
Music and Dance India
Music and Dance United States
Unknown India
Animation, Toys Unknown
Toys Unknown
Music and Dance Brazil
Unknown India
Music and Dance India
Music and Dance India
Music and Dance India
Movies, Animation United States
Music and Dance Unknown
Video Games United States
News and Politics India
Music and Dance Unknown
Music and Dance India
Music and Dance United States
Music and Dance India
Education Unknown
Music and Dance United States
Music and Dance United States
Music and Dance United States
Toys Unknown
Music and Dance United States
Unknown India
Movies, Animation Mexico
Music and Dance United States
Unknown Unknown
Music and Dance India
Animation, Humor Russ

Unnamed: 0,index,Username,Categories,Suscribers,Country,Visits,Likes,Comments,Links,combined_features
0,0,tseries,Music and Dance,249500000.0,India,86200.0,2700.0,78.0,http://youtube.com/channel/UCq-Fj5jknLsUf-MWSy...,Music and Dance India
1,1,MrBeast,"Video Games, Humor",183500000.0,United States,117400000.0,5300000.0,18500.0,http://youtube.com/channel/UCX6OQ3DkcsbYNE6H8u...,"Video Games, Humor United States"
2,2,CoComelon,Education,165500000.0,Unknown,7000000.0,24700.0,0.0,http://youtube.com/channel/UCbCmjCuTUZos6Inko4...,Education Unknown
3,3,SETIndia,Unknown,162600000.0,India,15600.0,166.0,9.0,http://youtube.com/channel/UCpEhnqL0y41EpW2TvW...,Unknown India
4,4,KidsDianaShow,"Animation, Toys",113500000.0,Unknown,3900000.0,12400.0,0.0,http://youtube.com/channel/UCk8GzjMOrta8yxDcKf...,"Animation, Toys Unknown"
...,...,...,...,...,...,...,...,...,...,...
992,992,hamzymukbang,Unknown,11700000.0,United States,397400.0,14000.0,124.0,http://youtube.com/channel/UCPKNKldggioffXPkSm...,Unknown United States
993,993,Adaahqueen,Unknown,11700000.0,India,1100000.0,92500.0,164.0,http://youtube.com/channel/UCk3fFpqI5kDMf__mUP...,Unknown India
994,994,LittleAngelIndonesia,Music and Dance,11700000.0,Unknown,211400.0,745.0,0.0,http://youtube.com/channel/UCdrHrQf0o0TO8YDntX...,Music and Dance Unknown
995,995,PenMultiplex,Unknown,11700000.0,India,14000.0,81.0,1.0,http://youtube.com/channel/UCObyBrdrtQ20BU9PxH...,Unknown India


In [25]:
# Convert text from the new column to a matrix of word counts
cm = CountVectorizer().fit_transform(df["combined_features"])

In [26]:
# Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
print(cs)

[[1.         0.         0.         ... 0.75       0.35355339 0.5       ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.35355339 0.5        0.        ]
 ...
 [0.75       0.         0.35355339 ... 1.         0.35355339 0.25      ]
 [0.35355339 0.         0.5        ... 0.35355339 1.         0.35355339]
 [0.5        0.         0.         ... 0.25       0.35355339 1.        ]]


In [27]:
# Get a genre a user likes of the book the reader likes
youtuber = df["Username"][81]
youtuber

'LikeNastya_ESP'

In [28]:
# Find the index of the YouTuber the user likes
ytuber_id = df[df.Username == youtuber]["index"].values[0]
ytuber_id

81

In [29]:
# Create a list of tuples in the form (username, similarity score)
scores = list(enumerate(cs[ytuber_id]))
print(scores)

[(0, 0.0), (1, 0.0), (2, 0.7071067811865475), (3, 0.7071067811865475), (4, 0.5773502691896258), (5, 0.0), (6, 0.7071067811865475), (7, 0.5773502691896258), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.7071067811865475), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.7071067811865475), (17, 0.5773502691896258), (18, 0.7071067811865475), (19, 0.0), (20, 0.7071067811865475), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.5), (26, 0.0), (27, 0.0), (28, 0.5), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.7071067811865475), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.7071067811865475), (37, 0.0), (38, 0.7071067811865475), (39, 0.0), (40, 0.0), (41, 1.0), (42, 0.0), (43, 0.0), (44, 0.7071067811865475), (45, 0.0), (46, 1.0), (47, 0.7071067811865475), (48, 0.7071067811865475), (49, 0.7071067811865475), (50, 0.7071067811865475), (51, 0.5773502691896258), (52, 0.5), (53, 0.0), (54, 0.0), (55, 1.0), (56, 0.0), (57, 0.5773502691896258), (58, 0.0), (59, 0.5773502691896258), (60, 0.0), (61, 0.5773502691896258

In [30]:
# Sort the list of similar books in descending order

sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
sorted_scores = sorted_scores[1:]
sorted_scores

[(46, 1.0),
 (55, 1.0),
 (81, 1.0),
 (141, 1.0),
 (161, 1.0),
 (223, 1.0),
 (256, 1.0),
 (264, 1.0),
 (268, 1.0),
 (279, 1.0),
 (348, 1.0),
 (373, 1.0),
 (377, 1.0),
 (385, 1.0),
 (440, 1.0),
 (495, 1.0),
 (550, 1.0),
 (554, 1.0),
 (593, 1.0),
 (648, 1.0),
 (659, 1.0),
 (718, 1.0),
 (725, 1.0),
 (795, 1.0),
 (829, 1.0),
 (831, 1.0),
 (850, 1.0),
 (877, 1.0),
 (879, 1.0),
 (891, 1.0),
 (895, 1.0),
 (967, 1.0),
 (968, 1.0),
 (969, 1.0),
 (2, 0.7071067811865475),
 (3, 0.7071067811865475),
 (6, 0.7071067811865475),
 (11, 0.7071067811865475),
 (16, 0.7071067811865475),
 (18, 0.7071067811865475),
 (20, 0.7071067811865475),
 (32, 0.7071067811865475),
 (36, 0.7071067811865475),
 (38, 0.7071067811865475),
 (44, 0.7071067811865475),
 (47, 0.7071067811865475),
 (48, 0.7071067811865475),
 (49, 0.7071067811865475),
 (50, 0.7071067811865475),
 (64, 0.7071067811865475),
 (72, 0.7071067811865475),
 (73, 0.7071067811865475),
 (82, 0.7071067811865475),
 (83, 0.7071067811865475),
 (87, 0.7071067811865475

In [31]:
def recommend(account):
    recommendations = []
    # Find the index of the YouTuber the user likes
    ytuber_id = df[df.Username == account]["index"].values[0]

    # Create a list of tuples in the form (username, similarity score)
    scores = list(enumerate(cs[ytuber_id]))

    # Sort the list of similar books in descending order
    sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
    sorted_scores = sorted_scores[1:]

    # Exclude the account the user picked from the list of similar accounts
    for i in sorted_scores:
        if i[0] == ytuber_id:
            print(i)
            sorted_scores.remove(i)
    
    # Create a loop to print the first 5 most similar accounts
    print(f"The 5 most similar accounts to '{account}' are:\n ")
    
    j = 0
    for item in sorted_scores:
        accounts = df[df['index'] == item[0]]["Username"].values[0]
        recommendations.append(accounts)
        j = j+1
        if j >= 5:
            break

    return recommendations

In [32]:
for count, account in enumerate(recommend(df.Username[1])):
    print(str(count+1) + '. ' + account)

The 5 most similar accounts to 'MrBeast' are:
 
1. brentrivera
2. PrestonYT
3. rug
4. StokesTwins
5. BenAzelart
