In [15]:
import numpy as np
import requests
import json
import pandas as pd
import openai
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Import the top YouTuber dataset as the basis of recommendation
YouTube_statistics = pd.read_csv('https://raw.githubusercontent.com/Richahaha/Youtube-Chatbot/main/Global%20YouTube%20Statistics.csv', encoding='ISO-8859-1')

# Display the dataset
YouTube_statistics

Unnamed: 0,rank,Youtuber,subscribers,video views,category,Title,uploads,Country,Abbreviation,channel_type,...,subscribers_for_last_30_days,created_year,created_month,created_date,Gross tertiary education enrollment (%),Population,Unemployment rate,Urban_population,Latitude,Longitude
0,1,T-Series,245000000,2.280000e+11,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
1,2,YouTube Movies,170000000,0.000000e+00,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
2,3,MrBeast,166000000,2.836884e+10,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,1.640000e+11,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,3.282395e+08,14.70,270663028.0,37.090240,-95.712891
4,5,SET India,159000000,1.480000e+11,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
990,991,Natan por Aï¿,12300000,9.029610e+09,Sports,Natan por Aï¿,1200,Brazil,BR,Entertainment,...,700000.0,2017.0,Feb,12.0,51.3,2.125594e+08,12.08,183241641.0,-14.235004,-51.925280
991,992,Free Fire India Official,12300000,1.674410e+09,People & Blogs,Free Fire India Official,1500,India,IN,Games,...,300000.0,2018.0,Sep,14.0,28.1,1.366418e+09,5.36,471031528.0,20.593684,78.962880
992,993,Panda,12300000,2.214684e+09,,HybridPanda,2452,United Kingdom,GB,Games,...,1000.0,2006.0,Sep,11.0,60.0,6.683440e+07,3.85,55908316.0,55.378051,-3.435973
993,994,RobTopGames,12300000,3.741235e+08,Gaming,RobTopGames,39,Sweden,SE,Games,...,100000.0,2012.0,May,9.0,67.0,1.028545e+07,6.48,9021165.0,60.128161,18.643501


In [16]:
# A function to acquire a list of YouTubers with top numbers of subscribers
def get_top_YouTubers():

    try:
        # Ask the users to type the number of recommended YouTubers
        top_YouTubers_input = int(input("\nHow many top YouTubers would you like to see? "))
        
        # Acquire the top YouTuber information
        top_YouTubers = YouTube_statistics.sort_values(by='subscribers', ascending=False).head(top_YouTubers_input)
        
        # Output
        print(f"Here are the top {top_YouTubers_input} YouTubers:")
        print(top_YouTubers[['Youtuber', 'subscribers']].to_string(index=False))
    
    except ValueError:
        
        print("Please enter a valid number.")

    return


In [17]:
# A function to access GPT to search for information about a YouTuber
def fetch_YouTuber_info():

    # Ask the user to enter the name of this YouTube
    YouTuber_name = input("Please enter the name of this YouTuber (paste one from above): \n")

    # Access OpenAI
    openai.api_key ="my_key
    # Use Chatgpt to search for this YouTube
    if YouTuber_name not in YouTube_statistics['Youtuber'].values:

        print("Please enter a valid name")
       
        return
        
    else:     
        completion = openai.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                "role": "user",
                "content": f"What is on {YouTuber_name} YouTube channel about?",
                },
            ],
         )
        print(completion.choices[0].message.content)

    return


In [18]:
# A function to display all the categories in the dataset by subscriber counts
def display_category():

    # Count the occurrences of each category and sort them by frequency
    category_counts = YouTube_statistics['category'].value_counts()

    # Extract the sorted categories based on their frequency
    sorted_categories = category_counts.index.tolist()

    # Display the categories in a table format
    print("\nCategories and their YouTuber counts:")
    
    # Use loop to display all the results
    for index, category in enumerate(sorted_categories, start=1):
        print(f"Top{index}. {category} (Top YouTubers in this category: {category_counts[category]})")


In [19]:
# A function to display all the countries in the dataset by subscriber counts
def display_country():

    # Extract all the countries with their abbreviation
    country_counts = YouTube_statistics.groupby(['Country', 'Abbreviation']).size().reset_index(name='Counts')

    # Sort the results by the counts
    country_counts_sorted = country_counts.sort_values(by='Counts', ascending=False)

    # Output the results
    print("\nCountries and their Youtuber counts:")

    # Display the sorted results
    for index, row in country_counts_sorted.iterrows():
        print(f"{row['Country']} ({row['Abbreviation']}): Top YouTubers in this country: {row['Counts']}")


In [20]:
# A function to let users input requirements to make initial recommendations
def initial_input():

# Ask the users how to start (the options will be added)
  user_start = user_start = str(input("Greetings! I will take you on an amazing journey on YouTube! How would you love to find? Enter the number to start: \n 1. I wanna see the top YouTubers \n 2. Start with hot categories \n 3. Show me the trending content in my country\n  \n ... \n\n "))

  if user_start == '1':

    # Ask the users how many top Youtubers they would like to see
    get_top_YouTubers()  

    # Show the users this number of top Youtubers, make it a loop a ask users until they choose to quit
    while True:
      
      # Ask if the users would like the chatbot to introduce a YouTuber
      YouTube_info_YN = input("Would you like to know more about a YouTuber? (y/n) \n").lower()
              
      if YouTube_info_YN == 'y':
        
        # If YouTuber would like to know, use this function to integrate Chatgpt to introduce the YouTuber
        fetch_YouTuber_info()
              
      elif YouTube_info_YN == 'n':
          
        # Ask the user 
        user_input = input("Which YouTuber would you like to see (paste one from above)? ")

        # Notify users that they have successfully selected
        print(f"\nPerfect! You've choosen {user_input}!")
                  
        break  # Exits the loop if user does not want more info
              
      else:
        
        # Tell users how to do it if they do it wrong
        print("\nPlease enter 'y' for yes or 'n' for no.")

  elif user_start == '2':

    display_category()

    while True:

      # Ask users to select the category
      user_input = input(f'Which category would you like to see? (Paste one from above)')

      if user_input in YouTube_statistics['category'].values:
        
        # Notify users that they have successfully selected
        print(f"\nPerfect! You've choosen {user_input}!")

        break

      else: 
        
        # Tell users how to do it if they do it wrong
        print("\nPlease choose the category from the list.")

  elif user_start == "3":

    # Display all the countries
    display_country()

    # Ask how many videos would be recommended
    while True:

      # Ask users to select their countries
      user_input = input("Which country would you like to find a trend? (just enter the Abbreviation) ")

      if user_input in YouTube_statistics['Abbreviation'].values:

        # Notify users that they have successfully selected
        print((f"\nPerfect! You've choosen {user_input}!"))
        user_input = user_input.lower()

        break

      else: 

        # Tell users how to do it if they do it wrong
        print("\nPlease choose the Country Abbreviation from the list.")

  else:

    # Tell users how to do it if they do it wrong
    print("Invalid input. Please restart and choose a valid option.")

  return user_start, user_input


In [21]:
# A function to make initial recommendations based on the initial input if users choose 1 or 2
def initial_recommendation(user_input):

    while True: 
        
        # Ask users to enter the number of videos they would like to find
        userinput_number = int(input("How many videos would you love to find? (maximum 5 videos once) "))
        
        if userinput_number < 1 or userinput_number > 5:
            
            print("Please enter a valid number of videos.")

        else:

            # Access Youtube V2 Youtube Search API
            url = "https://youtube-v2.p.rapidapi.com/search/"

            querystring = {"query":user_input,"lang":"en","order_by":"this_month","country":"us"}

            headers = {
                "X-RapidAPI-Key": "13a968a89emshe74aff4aa08ce67p11c5d3jsn77d0607b386f",
                "X-RapidAPI-Host": "youtube-v2.p.rapidapi.com"
            }

            response = requests.get(url, headers=headers, params=querystring)

            # Output the recommended videos
            videos = response.json().get('videos', [])[:userinput_number]
			
			# Prepare the list of dictionaries
            video_list = [{"video_id": video['video_id'], "title": video['title']} for video in videos]

			# Display the limited number of videos
            for video in videos:

                print(f"\n Title: {video['title']}, URL: https://www.youtube.com/watch?v={video['video_id']}")
                
            break
            
    return video_list
    

In [22]:
# A function to make initial trending recommendations based on the initial input if users choose 3
def trending_recommendation(user_input):

    while True: 

        # Ask users to enter the number of videos they would like to find
        userinput_number = int(input("How many videos would you love to find? (maximum 5 videos) "))
        
        if userinput_number < 1 or userinput_number > 5:
            
            print("Please enter a valid number of videos.")

        else:

            # Access Youtube V2 Trending Videos API
            url = "https://youtube-v2.p.rapidapi.com/trending/"

            querystring = {"lang":"en","country": user_input,"section":"Now"}

            headers = {
                "X-RapidAPI-Key": "13a968a89emshe74aff4aa08ce67p11c5d3jsn77d0607b386f",
                "X-RapidAPI-Host": "youtube-v2.p.rapidapi.com"
            }

            response = requests.get(url, headers=headers, params=querystring)
            
            # Output the results
            videos = response.json().get('videos', [])[:userinput_number]
			
			# Prepare the list of dictionaries
            video_list = [{"video_id": video['video_id'], "title": video['title']} for video in videos]

			# Display the limited number of videos
            for video in videos:

                print(f"\n Title: {video['title']}, URL: https://www.youtube.com/watch?v={video['video_id']}")
                
            break
            
    return video_list


In [23]:
# A function to allow user to enter flexible cuztomized conditions to rank the initial recommendations
def get_user_conditions():
    
    # Prepare a disctionary
    user_conditions = {}

    # Collect more flexible conditions
    user_condition_input = input("Would you like add other special conditions on the videos (e.g., I want to see more about...), please tell me or enter 'n' to skip.")

    if user_condition_input == 'n':

        # Let the users know they have not added special conditions
        print("\nNo special condition! Roger that!")

    else:

        # Let the users know what special conditions they have added
        print(f"\nYour special conditions are: {user_condition_input}.")

        # Reserve the user condition in the dictionary
        user_conditions['user_condition'] = user_condition_input

        print("\nPerfectly noted them down! Thank you for letting me know!")                

    return user_conditions


In [24]:
# A function to extract detailed information (mainly descriptions) of videos with the video IDs of initial recommendations
def get_videos_detailed_info(video_list):
    
    # Prepare a list
    detailed_video_info = []
    
    for video in video_list:

        # Access video IDs of the videos in the initial recommendations
        video_id = video['video_id']
        
        # Access Youtube V2 Video Details API
        url = "https://youtube-v2.p.rapidapi.com/video/details"
        querystring = {"video_id": video_id}
        headers = {
            "X-RapidAPI-Key": "13a968a89emshe74aff4aa08ce67p11c5d3jsn77d0607b386f",  # Ensure to replace with your actual RapidAPI Key
            "X-RapidAPI-Host": "youtube-v2.p.rapidapi.com"
        }

        # Make the request to the YouTube API
        try:
            response = requests.get(url, headers=headers, params=querystring)
            
            # Use the Youtube V2 Video Details API to access the detailed information of each video in the initial recommendations
            if response.status_code == 200:
                video_details = response.json()
                # Extract 'description' individually from the API response
                description = video_details.get('description', '') 
                detailed_video_info.append({
                    "video_id": video_id,
                    "title": video.get('title', ''),  # Fallback to empty string if not present
                    "description": description,  # Include keywords extracted from the API response
                })
            else:
                print(f"Failed to fetch details for video ID: {video_id}")
        
        except Exception as e:
            print(f"An error occurred while fetching details for video ID: {video_id}. Error: {e}")

    return detailed_video_info


In [25]:
# A function to use chatgpt to introduce the video content by summarizing video description
def get_summary(description):

    # Access to OpenAI
    openai.api_key = "my_key    completion = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": f"Provide summary about this video in 25 words: {description}"},
        ],
    )# Use Chatgpt to individually summarize the description of videos

    return completion.choices[0].message.content 
    

In [26]:
# The final function to rank videos by cosine similarity based on user conditions against video descriptions
def recommendation(user_conditions, detailed_video_info):

    # Prepare the text for comparison: Concatenate keywords and description for each video
    video_texts = [video['description'] for video in detailed_video_info]

    # Vectorize the video texts using TF-IDF
    vectorizer = TfidfVectorizer()
    video_tfidf_matrix = vectorizer.fit_transform(video_texts)

    # Vectorize the user condition using the same vectorizer
    user_condition_vector = vectorizer.transform([user_conditions['user_condition']])
    
    # Calculate cosine similarity between user condition and video texts
    cosine_similarities = cosine_similarity(user_condition_vector, video_tfidf_matrix).flatten()

    # Rank videos based on cosine similarity scores
    sorted_video_indices = cosine_similarities.argsort()[::-1]
    
    # Create a DataFrame for easy handling and adding similarity scores
    video_df = pd.DataFrame(detailed_video_info)
    video_df['cosine_similarity'] = cosine_similarities
    ranked_videos = video_df.iloc[sorted_video_indices]
    
    # Select the relevant columns to return
    recommendations = ranked_videos[['video_id', 'title', 'cosine_similarity']]

    # Adding summaries
    recommendations['summary'] = [get_summary(desc) for desc in video_texts]
    
    # Reset the index to start from 1 and rename it to reflect ranking
    recommendations = recommendations.reset_index(drop=True)  # Drop the old index
    recommendations.index = [f"Top {i+1}" for i in recommendations.index]  # Create a new index starting from "Top 1"

    return recommendations


In [27]:
"""
The main function integrating all functions of this Chatbot
First, Chatbot will let users to initial input what they would to start with: 1. Top YouTubers 2. Categories 3. Trend in their countries
In this process, users will be informed by the ranking by subscriber counts or top YouTuber counts
Then, according to this initial input, Chatbot runs intial_recommendation or trending_recommendation to give a maximum 5 initial recommendations
Then, Chatbot asks if users would like to add special condition of preference in videos, which is a flexible textual message sent to Chatbot
Finally, Chatbot conducts the consine-similarity text analysis to rank the initially recommendeded videos by the similarity of their descirptions with users' special conditions
"""

def main():
    # Display initial options to the user
    user_start, user_input = initial_input()
    
    # Validate initial input and determine the recommendation path
    video_list = []

    if user_start in ['1', '2']:
        video_list = initial_recommendation(user_input)

    elif user_start == '3':
        video_list = trending_recommendation(user_input)

    else:
        print("Invalid option selected. Please restart the process.")
        return  # Exit the function early if an invalid option is provided
    
    # Get additional user conditions for recommendations
    user_conditions = get_user_conditions()
    
    # Fetch detailed video information based on user conditions
    try:
        detailed_video_info = get_videos_detailed_info(video_list)
    except Exception as e:
        print(f"Failed to fetch detailed video information: {e}")
        return  # Exit the function early in case of failure
    
    # Generate recommendations based on the detailed video information and user conditions
    try:
        recommendations = recommendation(user_conditions, detailed_video_info)
    except Exception as e:
        print(f"Failed to generate recommendations: {e}")
        return  # Exit the function early in case of failure
    
    # Display the recommendations to the user
    if not recommendations.empty:
        print("\nConsidering your special conditions, we ranked the recommended videos as follows:\n")
        
        pd.set_option('display.max_colwidth', None)  # Ensure full visibility for video links or descriptions
       
        display(recommendations)
        
        print("\nPlease choose a video and enjoy it!")
        
    else:
        print("\nPlease choose a video and enjoy it!")

# Define or ensure the implementation of required functions mentioned here is correct and complete.

# Remember to replace placeholders with actual implementations.


In [28]:
# Run the main function
if __name__ == "__main__":
    main()

Here are the top 5 YouTubers:
                  Youtuber  subscribers
                  T-Series    245000000
            YouTube Movies    170000000
                   MrBeast    166000000
Cocomelon - Nursery Rhymes    162000000
                 SET India    159000000

Please enter 'y' for yes or 'n' for no.
MrBeast, whose real name is Jimmy Donaldson, is a YouTuber known for his elaborate and often large-scale stunt videos and philanthropy. His content typically involves feats that take a significant amount of time and effort, such as watching a particular video for 24 hours straight, counting to large numbers, or giving away sizable amounts of money or expensive items (like cars) to strangers, friends, or people in need. Many of his videos also feature various challenges and games involving large cash prizes. MrBeast is also the founder of the environmental campaign Team Trees and has used his platform to raise millions for various charitable causes.

Perfect! You've choosen MrBeast

Unnamed: 0,video_id,title,cosine_similarity,summary
Top 1,mKdjycj-7eE,"Stop This Train, Win a Lamborghini",0.091176,"This video summary can't be generated as the content of the video isn't specified. The text includes Western Union promotion, Fortnite creative map link, merch link, and social media handles."
Top 2,VGvj6bj4Sog,Changing the Lives of 600 Strangers,0.07906,"The video is likely promotional or interactive, featuring new merchandise by MrBeast and encouraging viewers to check out Viewstats, subscribe, and follow MrBeast on social media."
Top 3,YupC1dnMuFw,2024 Solar Eclipse Healed my Blind Son...,0.048344,"The video features the Galaxy AI on the Samsung Galaxy S24 Ultra, a Fortnite Creative Map, and a plug for MrBeast merchandise, with thanks to Daniel Schiffer."
Top 4,YrVVXFMgXrw,I Paid A Lie Detector To Investigate My Friends,0.021461,"Lectric has donated $600,000 worth of eBikes and sponsored a video from Beast Philanthropy that encourages viewers to support the Interfaith Refugee Ministry and their philanthropic efforts worldwide."
Top 5,erLbbextvlY,7 Days Stranded On An Island,0.013027,"The creators express gratitude for support over 7 years, and note an unexpected event at the video's end. They promote the FGTEEV graphical novel, guide book, and associated games."
