In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/sample_data/youtube_channels_1M_clean.csv", on_bad_lines="skip", engine="python")


In [None]:
df.head(5)

Unnamed: 0,channel_id,channel_link,channel_name,subscriber_count,banner_link,description,keywords,avatar,country,total_views,total_videos,join_date,mean_views_last_30_videos,median_views_last_30_videos,std_views_last_30_videos,videos_per_week
0,UCUMHFa347GD8EIRaVVuxR8Q,/@TonettaClay,Tonetta Clay,781,https://yt3.googleusercontent.com/bj2C0MOj3SMi...,My name is Tonetta or Toni for short and I'm d...,,https://yt3.googleusercontent.com/cUmKB4Zet1SX...,,227567.0,1984.0,2015-12-08,10.866667,6.5,13.922963,1.5
1,UC28mqg7IlYWEhrZwHb72IQA,/@FoodnHappinessVeena,Food 'n' Happiness,0,,"Hello viewers.\n I am Veena from Mangalore, Ka...","food n happiness, food and happiness, food, Fo...",https://yt3.googleusercontent.com/lY_u-8bLWNgA...,India,592961.0,158.0,2020-05-08,1448.066667,1510.0,869.364401,0.0
2,UCoLwWY9zQ7Jp8aDtYUszmYg,/@TimShieff,Tim Shieff,166000,https://yt3.googleusercontent.com/2KC8Lj8RF3uF...,The journey of rediscovery.\n\nhttps://rdscvr.com,"Tim, shieff, timothy, health, human, spiritual...",https://yt3.googleusercontent.com/-aNZLP23AnkX...,United Kingdom,27250763.0,372.0,2006-03-19,5456.933333,3430.0,4322.803149,0.0
3,UCAQOeJwsgBMC74-OjjcQcJA,/@JerryAndJulieMusic,Jerry & Julie Music,1090,https://yt3.googleusercontent.com/dI6Oq0iOhx-c...,Welcome to Jerry & Julie Music. We hope you w...,"jerryandjuliemusic, jerryspianobar, juliesguit...",https://yt3.googleusercontent.com/c41pe3_aA75h...,United States,339906.0,913.0,2010-03-27,75.966667,47.5,113.374448,4.0
4,UCOqwGhI1AmpWwxMYw9D_fqw,/@KichuandYugiMagizhakam,Kichu and Yugi Magizhakam,1160,https://yt3.googleusercontent.com/CXrATV8NKPRg...,Hi friends welcome to my channel...I'm Devi mo...,,https://yt3.googleusercontent.com/FDA5pc1ZQ5jT...,,288328.0,476.0,2023-12-21,85.7,66.0,72.032007,1.25


In [None]:
df.columns

Index(['channel_id', 'channel_link', 'channel_name', 'subscriber_count',
       'banner_link', 'description', 'keywords', 'avatar', 'country',
       'total_views', 'total_videos', 'join_date', 'mean_views_last_30_videos',
       'median_views_last_30_videos', 'std_views_last_30_videos',
       'videos_per_week'],
      dtype='object')

In [None]:
df.isnull().sum()

Unnamed: 0,0
channel_id,0
channel_link,0
channel_name,2
subscriber_count,0
banner_link,23508
description,16236
keywords,39469
avatar,0
country,32574
total_views,83


In [None]:
import re

In [None]:
useful_cols = [
    "channel_id",
    "channel_name",
    "description",
    "keywords",
    "subscriber_count",
    "total_views",
    "total_videos",
    "channel_link"
]

In [None]:

df_useful = df[useful_cols].copy()

In [None]:
df_useful["channel_name"] = df_useful["channel_name"].fillna("")
df_useful["description"] = df_useful["description"].fillna("")
df_useful["keywords"] = df_useful["keywords"].fillna("")


In [None]:
df_useful["text"] = (
    df_useful["channel_name"] + " " +
    df_useful["description"] + " " +
    df_useful["keywords"]
)

In [None]:
def clean_text(text):
  text=text.lower()
  text = re.sub(r"http\S+|www\S+|https\S+", "", text)   # remove links
  text = re.sub(r"[^a-z0-9\s]", " ", text)             # keep only letters/numbers
  return text

In [None]:
df_useful["clean_text"] = df_useful["text"].apply(clean_text)

In [None]:
print(df_useful[["channel_name", "clean_text"]].head(10))
print("Shape after cleaning:", df_useful.shape)

                channel_name  \
0               Tonetta Clay   
1         Food 'n' Happiness   
2                 Tim Shieff   
3        Jerry & Julie Music   
4  Kichu and Yugi Magizhakam   
5             Burhan & Zohan   
6               Jenny taylor   
7           light tajiri bey   
8                   Heal Hub   
9      Yours Health & Beauty   

                                          clean_text  
0  tonetta clay my name is tonetta or toni for sh...  
1  food  n  happiness hello viewers \n i am veena...  
2  tim shieff the journey of rediscovery \n\n tim...  
3  jerry   julie music welcome to jerry   julie m...  
4  kichu and yugi magizhakam hi friends welcome t...  
5  burhan   zohan welcome to my channel        hi...  
6  jenny taylor let s get healthy \n\nwelcome to ...  
7  light tajiri bey my sovereigne bourne appellat...  
8            heal hub  health  heal hub  good health  
9  yours health   beauty hey beautiful and specia...  
Shape after cleaning: (81801, 10)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df_useful["clean_text"])


In [None]:
def recommend_channels(query,top_k=10):
   query_clean = clean_text(query)
   query_vec = vectorizer.transform([query_clean])
   similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
   top_indices = similarity.argsort()[::-1][:top_k]
   results = df_useful.iloc[top_indices][[
        "channel_name",
        "subscriber_count",
        "total_views",
        "channel_link"
    ]].copy()
   results["similarity"] = similarity[top_indices]

   return results
print(recommend_channels("", top_k=5))


        channel_name  subscriber_count  total_views        channel_link  \
51507       Cookbook                 0      13500.0       /@CookBook224   
81252  MOBULA GAMING               227      46020.0      /@MOBULAGAMING   
73164  GAMEING POINT               130      34825.0  /@gameingpoint8410   
70109     hepla meta              1210      18806.0   /@snapsgaming3378   
26136   PUBG Esports            222000   99541154.0       /@PUBGEsports   

       similarity  
51507    0.735156  
81252    0.726500  
73164    0.640161  
70109    0.596283  
26136    0.525596  


In [None]:
# Block 4: Show more metadata with recommendations

def recommend_channels(query, df, top_k=10):
    # Clean and vectorize the query
    query_clean = clean_text(query)
    query_vec = vectorizer.transform([query_clean])

    # Compute similarity
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = similarity.argsort()[::-1][:top_k]

    # Collect recommendations with metadata
    results = []
    for idx in top_indices:
        channel_info = {
            "Channel": df.iloc[idx]["channel_name"],
            "Description": df.iloc[idx]["description"],
            "Subscribers": df.iloc[idx]["Subscribers"] if "Subscribers" in df.columns else "N/A",
            "Views": df.iloc[idx]["total_views"] if "Views" in df.columns else "N/A",
            "Videos": df.iloc[idx]["total_videos"] if "Videos" in df.columns else "N/A",
            "Score": round(similarity[idx], 3)
        }
        results.append(channel_info)

    return results


# ✅ Example run
recommendations = recommend_channels("python programming", df, top_k=5)
for rec in recommendations:
    print(rec)


{'Channel': 'Proz Core', 'Description': 'Python Tutorials for Beginners\n\n\n', 'Subscribers': 'N/A', 'Views': 'N/A', 'Videos': 'N/A', 'Score': np.float64(0.805)}
{'Channel': 'TechChaitu Programmer', 'Description': 'watch my python tutorials on\n\nhttps://www.udemy.com/course/learn-python-from-scratch-t/\n\nhttps://www.udemy.com/course/complete-html-css-tutorial-with-project-interview-qa/\n\nhttps://www.udemy.com/course/mysql-jc/\n\nhttps://www.youtube.com/channel/UChFUVlYlwBps2gUXvxkOX_w\n\nhttps://twitter.com/home\n\n', 'Subscribers': 'N/A', 'Views': 'N/A', 'Videos': 'N/A', 'Score': np.float64(0.767)}
{'Channel': 'Python Bites', 'Description': 'Welcome to our channel "Python Bites" , a channel for mastering the art of writing concise and most readable python code. Here you can get one-liner solutions for basic as well as advanced programming problems, tips and tricks to streamline your code, and insights about how to write more efficient and effective programs. \n\nJoin our community

In [None]:

def show_recommendations(query, df, top_k=10):
    results = recommend_channels(query, df, top_k=top_k)

    # Convert results list of dicts into DataFrame
    results_df = pd.DataFrame(results)

    print("\n🔎 Top Recommendations for:", query)
    print(results_df.to_string(index=False))  # Pretty print

    return results_df


# ✅ Example run
show_recommendations("python programming", df, top_k=5)


🔎 Top Recommendations for: python programming
               Channel                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Description Subscribers Views Videos  Score
             Proz Core                                                                                                                                                                                                                                                 

Unnamed: 0,Channel,Description,Subscribers,Views,Videos,Score
0,Proz Core,Python Tutorials for Beginners\n\n\n,,,,0.805
1,TechChaitu Programmer,watch my python tutorials on\n\nhttps://www.ud...,,,,0.767
2,Python Bites,"Welcome to our channel ""Python Bites"" , a chan...",,,,0.752
3,Python基本情報&アプリ開発,このチャンネルでは、Pythonの基本コードやシステム開発を中心に動画をアップさせていきます！,,,,0.751
4,Python Developer - 0.1,Welcome To My YouTube Channel\n\nThis channel ...,,,,0.743


In [None]:


def show_sorted_recommendations(query, df, sort_by="score", top_k=10, ascending=False):
    results = recommend_channels(query, df, top_k=top_k)
    results_df = pd.DataFrame(results)

    if sort_by.lower() in results_df.columns.str.lower():
        # Match column case-insensitive
        col_name = [col for col in results_df.columns if col.lower() == sort_by.lower()][0]
        results_df = results_df.sort_values(by=col_name, ascending=ascending)
    else:
        print(f"⚠️ Column '{sort_by}' not found. Using default 'Score'.")
        results_df = results_df.sort_values(by="Score", ascending=False)

    print(f"\n📊 Top Recommendations for '{query}' (sorted by {sort_by}):")
    print(results_df.to_string(index=False))
    return results_df


# ✅ Example runs
show_sorted_recommendations("python programming", df, sort_by="Subscribers", top_k=5)
show_sorted_recommendations("python programming", df, sort_by="Views", top_k=5)
show_sorted_recommendations("python programming", df, sort_by="Score", top_k=5)



📊 Top Recommendations for 'python programming' (sorted by Subscribers):
               Channel                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Description Subscribers Views Videos  Score
             Proz Core                                                                                                                                                                                                                       

Unnamed: 0,Channel,Description,Subscribers,Views,Videos,Score
0,Proz Core,Python Tutorials for Beginners\n\n\n,,,,0.805
1,TechChaitu Programmer,watch my python tutorials on\n\nhttps://www.ud...,,,,0.767
2,Python Bites,"Welcome to our channel ""Python Bites"" , a chan...",,,,0.752
3,Python基本情報&アプリ開発,このチャンネルでは、Pythonの基本コードやシステム開発を中心に動画をアップさせていきます！,,,,0.751
4,Python Developer - 0.1,Welcome To My YouTube Channel\n\nThis channel ...,,,,0.743


In [None]:
import pickle

# Save the trained model components
with open("youtube_recommendation_model.pkl", "wb") as f:
    pickle.dump({
        "vectorizer": vectorizer,
        "tfidf_matrix": tfidf_matrix,
        "dataframe": df  # also store dataset for later use
    }, f)

print("✅ Model saved as youtube_recommendation_model.pkl")


✅ Model saved as youtube_recommendation_model.pkl


# New Section

In [None]:
df.columns

Index(['channel_id', 'channel_link', 'channel_name', 'subscriber_count',
       'banner_link', 'description', 'keywords', 'avatar', 'country',
       'total_views', 'total_videos', 'join_date', 'mean_views_last_30_videos',
       'median_views_last_30_videos', 'std_views_last_30_videos',
       'videos_per_week'],
      dtype='object')