In [1]:
import glob
import os
import string
import time
from collections import defaultdict

import numpy as np
import pandas as pd
import pyyoutube
from deep_translator import GoogleTranslator
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
from tqdm import notebook
from tqdm.auto import tqdm
from youtube_transcript_api import YouTubeTranscriptApi

## Setup

In [2]:
def load_api_keys(env_var="YOUTUBE_API_KEYS", environ=None):
    """Read the YouTube Data API keys from an environment variable.

    Parameters
    ----------
    env_var : str
        Name of the variable holding a comma-separated list of keys.
    environ : mapping, optional
        Mapping to read from; defaults to ``os.environ``.

    Returns
    -------
    list of str
        The keys in the order given, with surrounding whitespace and empty
        entries removed. Empty when the variable is unset.
    """
    if environ is None:
        environ = os.environ
    return [key.strip() for key in environ.get(env_var, "").split(",") if key.strip()]


# SECURITY: credentials must not live in the notebook. The keys that used to be
# hard-coded in this cell have been exposed and should be revoked and replaced.
keys = load_api_keys()

if not keys:
    print("No API keys found: set the YOUTUBE_API_KEYS environment variable "
          "to a comma-separated list of keys.")

In [3]:
# Build the API client with the first key in the list.
api = pyyoutube.Api(api_key=keys[0])

In [4]:
def update_key(api, key_list, probe_channel_id="UC0aanx5rpr7D1M7KCFYzrLQ"):
    """Return an API client built from the first key in `key_list` that still works.

    The search starts at the key the current client uses and moves forward
    through the list; each candidate is checked with one channel lookup.

    Parameters
    ----------
    api : pyyoutube.Api or None
        The client whose key stopped working.
    key_list : list of str
        All available API keys.
    probe_channel_id : str
        Channel requested to check whether a key is functional.

    Returns
    -------
    pyyoutube.Api or None
        A working client, or None when no remaining key is functional.
    """
    # Start from the beginning when the client is None or its key is not in
    # the list, instead of failing with AttributeError / ValueError.
    try:
        current = key_list.index(getattr(api, "_api_key", None))
    except ValueError:
        current = 0
    print("Updating API key...")

    for key in key_list[current:]:
        candidate = pyyoutube.Api(api_key=key)
        try:  # see if this key is functional
            candidate.get_channel_info(channel_id=probe_channel_id)
        except Exception:
            # Not a bare `except:` -- a kernel interrupt must still get through.
            continue  # if it's not, try the next one
        return candidate

    # if no key was functional, exit
    print("No keys remaining...")
    return None

## Load data

In [5]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = r'C:/Users/Roan/Documents/bachelor_scriptie/Results/strat_4' # use your path

# Sort the matches so the row order of `frame` does not depend on the order in
# which the filesystem happens to list the files.
all_files = sorted(glob.glob(path + "/rec*.csv"))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not all_files:
    raise FileNotFoundError(f"No 'rec*.csv' files found in {path!r}")

# Read every file (first column is the index) and stack them into one frame
# with a fresh 0..n-1 index.
li = [pd.read_csv(filename, index_col=0, header=0) for filename in all_files]

frame = pd.concat(li, axis=0, ignore_index=True)

In [6]:
# Show the last rows of the combined frame.
frame.tail()

Unnamed: 0,user,vids_watched,video,channel
1355,scriptiebot20@gmail.com,15,https://www.youtube.com/watch?v=JT0wx27J9xs,https://www.youtube.com/c/smartereveryday
1356,scriptiebot20@gmail.com,15,https://www.youtube.com/watch?v=H5UUr9RXfTY,https://www.youtube.com/c/MentourPilotaviation
1357,scriptiebot20@gmail.com,15,https://www.youtube.com/watch?v=nG33yGMJaiY,https://www.youtube.com/c/MatthewPosa
1358,scriptiebot20@gmail.com,15,https://www.youtube.com/watch?v=EQAVZpbtCLo,https://www.youtube.com/channel/UCueOZEvMffxjJ...
1359,scriptiebot20@gmail.com,15,https://www.youtube.com/watch?v=sMowoWNX8iY,https://www.youtube.com/c/physicsgirl


In [7]:
# Keep the part of each watch URL that follows "v="
watched_ids = frame["video"].map(lambda url: url.split("v=")[1])

In [8]:
# Cut at the first "&" so that any trailing query parameter (&list=..., &t=...,
# &index=...) is removed, not only "&list".
frame["video_id"] = watched_ids.apply(lambda x: x.split("&")[0])

## Download titles, descriptions, etc.

In [9]:
# Transcript languages to request, in order of preference. Defined once,
# outside the loop, because the tuple is the same for every video.
langs = ("en", "nl", "af", "sq", "de", "am", "ar","hy","az","eu","be","bn","my","bs","bg",
         "ca","ceb","zh-Hant","zh-Hans","co","da","en","eo","et","fil","fi","fr","fy","gl",
         "ka","el","gu","ht","ha","haw","iw","hi","hmn","hu","ga","ig","is","id","it","ja",
         "jv","yi","kn","kk","km","rw","ky","ku","ko","hr","lo","la","lv","lt","lb","mk","mg",
         "ml","ms","mt","mi","mr","mn","ne","no","ny","or","ug","uk","uz","ps","fa","pl","pt",
         "pa","ro","ru","sm","gd","sr","sn","sd","si","sl","sk","su","so","es","sw","tg","ta",
         "tt","te","th","cs","tk","tr","ur","vi","cy","xh","yo","zu","st","sv")


def _dig(mapping, *path):
    """Follow `path` through nested dicts; return None as soon as a level is missing."""
    for key in path:
        if not isinstance(mapping, dict):
            return None
        mapping = mapping.get(key)
    return mapping


def fetch_first_item(api, key_list, method_name, **kwargs):
    """Call ``api.<method_name>(**kwargs)`` and return its first item as a dict.

    Parameters
    ----------
    api : the current API client.
    key_list : API keys handed to `update_key` when a quota error occurs.
    method_name : name of the client method to call, e.g. "get_video_by_id".
    **kwargs : forwarded to that method.

    Returns
    -------
    (api, item)
        The (possibly refreshed) client and the item dict. `item` is None when
        the lookup failed for a reason other than quota (for example an empty
        result list), so the caller never reuses data from a previous video.

    Raises
    ------
    RuntimeError
        When no key with remaining quota is left.
    """
    # Bounded number of attempts so a persistent "quota" error cannot loop forever.
    for _ in range(len(key_list) + 1):
        try:
            return api, getattr(api, method_name)(**kwargs).items[0].to_dict()
        except Exception as e:
            if "quota" not in str(e):
                print(f"{method_name}({kwargs}) failed: {e!r}")
                return api, None
            # the error was caused by the quota limit: refresh the key and retry
            api = update_key(api, key_list)
            if api is None:
                raise RuntimeError("All API keys have run out of quota") from e
    raise RuntimeError("Still hitting the quota limit after trying every API key")


matrix = defaultdict(list)

for vid in notebook.tqdm(frame.itertuples(), total=len(frame)):
    # Video information
    api, dct = fetch_first_item(api, keys, "get_video_by_id", video_id=vid.video_id)

    # Channel information (only possible when the video lookup returned a channel id)
    channel_id = _dig(dct, "snippet", "channelId")
    chnl = None
    if channel_id is not None:
        api, chnl = fetch_first_item(api, keys, "get_channel_info", channel_id=channel_id)

    # Every column gets exactly one value per row (None when unavailable) so
    # the lists in `matrix` stay aligned.
    matrix["user"].append(vid.user)
    matrix["vids_watched"].append(vid.vids_watched)
    matrix["vid"].append(vid.video_id)

    # Text
    matrix["title"].append(_dig(dct, "snippet", "title"))
    matrix["description"].append(_dig(dct, "snippet", "description"))

    # Videos without a retrievable transcript get an empty string
    try:
        segments = YouTubeTranscriptApi.get_transcript(vid.video_id, languages=langs)
        transcript = " ".join(segment["text"] for segment in segments)
    except Exception:
        transcript = ""

    matrix["transcript"].append(transcript)

    # Statistics
    matrix["views"].append(_dig(dct, "statistics", "viewCount"))
    matrix["likes"].append(_dig(dct, "statistics", "likeCount"))
    matrix["dislikes"].append(_dig(dct, "statistics", "dislikeCount"))
    matrix["duration"].append(_dig(dct, "contentDetails", "duration"))

    # Channel information
    matrix["channel_description"].append(_dig(chnl, "snippet", "description"))
    matrix["channel_keywords"].append(_dig(chnl, "brandingSettings", "channel", "keywords"))
    # Fall back to the id the channel was looked up with if that lookup failed
    matrix["channel"].append(_dig(chnl, "id") or channel_id)

pd.DataFrame(matrix)

  0%|          | 0/1360 [00:00<?, ?it/s]



Unnamed: 0,user,vids_watched,vid,title,description,transcript,views,likes,dislikes,duration,channel_description,channel_keywords,channel
0,scriptiebot16a@gmail.com,1,N8JLfT0r058,Wreckingball from helicopter used to remove un...,The film shows how the Norwegian Public Road A...,,1062896,3452,213,PT4M,,,UCiP3zWFVsqM9zXl61JZ_tCw
1,scriptiebot16a@gmail.com,1,Iha2SxMZvWY,NEDERLANDERS STELEN DE SHOW! 🤩🇳🇱 | Brugge vs A...,Goals en highlights van Club Brugge vs Anderle...,van harte welkom vanuit het jan breydelstadion...,87928,1769,27,PT8M10S,"Je blijft juichen! \n\nDe Premier League, La L...","""Ziggo Sport"" Ziggo ""Formule 1"" ""Max Verstappe...",UCueOZEvMffxjJyVO1fGiJNQ
2,scriptiebot16a@gmail.com,1,VbAEO6La7i4,Find Schism206 Content - Old & New,Find Older Schism206 Videos/Playlists:\nWebsit...,greetings regardless if you are new to this ch...,2081,,,PT5M51S,Updates (Subscribe for ALL Notifications): htt...,"Occult Esoteric Conspiracy ""New World Order""",UCLi_ZKbpFFBT5Q1_FzUqh4g
3,scriptiebot16a@gmail.com,1,hpXCachboOw,The Worlds Longest Indoor Mountain Bike Trail,In the suburbs of Cleveland Ohio you can find ...,this past weekend april and i got the chance t...,1970742,31981,744,PT5M55S,This channel is here to help people get stoked...,"""mountain biking"" ""how to"" ""mountain bikes"" ""k...",UC4eegkSVzV56kTrSpvL6BKQ
4,scriptiebot16a@gmail.com,1,h4T_LlK1VE4,Glitterbomb 3.0 vs. Porch Pirates,Merry Christmas ya filthy animals. If you wan...,this guy stole my package from a porch and he'...,36909540,1712505,20173,PT22M11S,Former NASA and Apple engineer. Current YouTu...,DIY Creative Design Hack Hacker Halloween cost...,UCY1kMZp36IQSyNx_9h4mpCg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355,scriptiebot20@gmail.com,15,JT0wx27J9xs,"How Hard Can You Hit a Golf Ball? (at 100,000 ...",Make an awesome website today at: https://wix....,"- Three, two, one. Oh, wow, that was loud! - T...",19808013,349309,11814,PT16M13S,I explore the world using science. That's pre...,"Smarter Every Day Science Education ""How To""",UC6107grRI4m0o2-emgoDnAA
1356,scriptiebot20@gmail.com,15,H5UUr9RXfTY,One of the most AMAZING aviation stories ever ...,Go to https://curiositystream.thld.co/mentourp...,this video is brought to you together with cur...,236152,13077,134,PT31M13S,Hi! \nOn my channel I will give you my point o...,"Pilot aviation ""aviation facts"" ""Boeing 737"" ""...",UCwpHKudUkP5tNgmMdexB3ow
1357,scriptiebot20@gmail.com,15,nG33yGMJaiY,Hot Tenting in the Rain With My Dogs,"I head out with my two dogs, Monty and Rueger,...",so [Music] meat [Music] look at a big old regi...,98012,6328,83,PT1H23M25S,"I love to camp, fish, hike, hunt, adventure an...",camping wilderness outdoors outdoorsman canoei...,UCF8HpP-lEx8W9OlMSOW6kGA
1358,scriptiebot20@gmail.com,15,EQAVZpbtCLo,Formule 1 Café 30/04/2021 | Met Robert Doornbo...,Tijdens het Formule 1-seizoen hebben we het ov...,een hele goede avond voor mij is half 11 tijd ...,46088,549,57,PT1H2M52S,"Je blijft juichen! \n\nDe Premier League, La L...","""Ziggo Sport"" Ziggo ""Formule 1"" ""Max Verstappe...",UCueOZEvMffxjJyVO1fGiJNQ


In [10]:
# Turn the collected columns into the working DataFrame used by the cleaning steps.
recs = pd.DataFrame(matrix)

## Clean data

In [11]:
# Fill empty cells and lowercase every text column.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `recs[col]` is chained assignment, which does not update `recs` when pandas
# Copy-on-Write is active.
text_columns = ["transcript", "title", "description", "channel_keywords", "channel_description"]

for column in text_columns:
    recs[column] = recs[column].fillna("").str.lower()

In [12]:
# Show the first rows after filling and lowercasing the text columns.
recs.head()

Unnamed: 0,user,vids_watched,vid,title,description,transcript,views,likes,dislikes,duration,channel_description,channel_keywords,channel
0,scriptiebot16a@gmail.com,1,N8JLfT0r058,wreckingball from helicopter used to remove un...,the film shows how the norwegian public road a...,,1062896,3452.0,213.0,PT4M,,,UCiP3zWFVsqM9zXl61JZ_tCw
1,scriptiebot16a@gmail.com,1,Iha2SxMZvWY,nederlanders stelen de show! 🤩🇳🇱 | brugge vs a...,goals en highlights van club brugge vs anderle...,van harte welkom vanuit het jan breydelstadion...,87928,1769.0,27.0,PT8M10S,"je blijft juichen! \n\nde premier league, la l...","""ziggo sport"" ziggo ""formule 1"" ""max verstappe...",UCueOZEvMffxjJyVO1fGiJNQ
2,scriptiebot16a@gmail.com,1,VbAEO6La7i4,find schism206 content - old & new,find older schism206 videos/playlists:\nwebsit...,greetings regardless if you are new to this ch...,2081,,,PT5M51S,updates (subscribe for all notifications): htt...,"occult esoteric conspiracy ""new world order""",UCLi_ZKbpFFBT5Q1_FzUqh4g
3,scriptiebot16a@gmail.com,1,hpXCachboOw,the worlds longest indoor mountain bike trail,in the suburbs of cleveland ohio you can find ...,this past weekend april and i got the chance t...,1970742,31981.0,744.0,PT5M55S,this channel is here to help people get stoked...,"""mountain biking"" ""how to"" ""mountain bikes"" ""k...",UC4eegkSVzV56kTrSpvL6BKQ
4,scriptiebot16a@gmail.com,1,h4T_LlK1VE4,glitterbomb 3.0 vs. porch pirates,merry christmas ya filthy animals. if you wan...,this guy stole my package from a porch and he'...,36909540,1712505.0,20173.0,PT22M11S,former nasa and apple engineer. current youtu...,diy creative design hack hacker halloween cost...,UCY1kMZp36IQSyNx_9h4mpCg


In [13]:
# Merge title, description, transcript and the channel texts into one
# space-separated column, then drop the individual text columns.
merged_columns = ["title", "description", "transcript", "channel_description", "channel_keywords"]

combined = recs[merged_columns[0]]
for column in merged_columns[1:]:
    combined = combined + " " + recs[column]
recs["full_text"] = combined

recs = recs.drop(columns=merged_columns)

In [14]:
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and does not update `recs` when pandas Copy-on-Write is active.
recs["full_text"] = recs["full_text"].fillna("")

In [15]:
def translate_text(text, max_chars=4999, retry_delay=3):
    """Translate `text` to English unless it is already detected as English.

    Parameters
    ----------
    text : str
        The text to translate.
    max_chars : int
        Largest piece sent to the translator in one request. The translator
        only handles texts of less than 5000 characters, so longer texts are
        cut into pieces of this size.
    retry_delay : int or float
        Seconds to wait before the single retry after a failed attempt.

    Returns
    -------
    str
        The translation, or `text` unchanged when it is English, when its
        language cannot be detected, or when both attempts fail.
    """
    # Detect language; if detection fails, treat the text as English
    try:
        lang = detect(text)
    except Exception:
        lang = "en"

    if lang == "en":
        return text

    # A text shorter than 5000 characters yields exactly one piece.
    # NOTE(review): the cut is purely positional, so a word can be split
    # across two pieces.
    chunks = [text[start:start + max_chars] for start in range(0, len(text), max_chars)]

    for attempt in range(2):
        try:
            # source="auto" for every text: the detected code is only used to
            # decide whether to translate, because langdetect's codes (e.g.
            # "zh-cn") are not guaranteed to be accepted by the translator.
            translator = GoogleTranslator(source="auto", target="en")
            return "".join(translator.translate(chunk) for chunk in chunks)
        except Exception as e:
            # Deal with connection stutters: report the first failure, wait
            # for the connection to stabilize, then try once more.
            if attempt == 0:
                print(e)
                time.sleep(retry_delay)

    # Still failing after the retry: return the plain text
    return text

In [16]:
# Register progress_apply on pandas objects, then translate all text to English
tqdm.pandas()
recs['full_text'] = recs['full_text'].progress_apply(translate_text)

  0%|          | 0/1360 [00:00<?, ?it/s]

Request exception can happen due to an api connection error. Please check your connection and try again
Request exception can happen due to an api connection error. Please check your connection and try again
Request exception can happen due to an api connection error. Please check your connection and try again


In [17]:
# Drop emoji and every other non-ASCII character
recs["full_text"] = recs["full_text"].progress_apply(
    lambda raw: raw.encode("ascii", errors="ignore").decode()
)

  0%|          | 0/1360 [00:00<?, ?it/s]

In [18]:
# English stop words as a set, plus the stemmer used by stem_text()
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [19]:
def stem_text(text):
    """Tokenize `text`, drop punctuation and stop words, and stem what remains.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    stems = []
    for token in word_tokenize(text):
        word = token.strip(string.punctuation)
        # Test the cleaned, lowercased word rather than the raw token, so that
        # capitalised stop words ("The") and tokens that only become a stop
        # word once punctuation is stripped ("'s" -> "s") are removed too.
        # Assumes the entries of `stop_words` are lowercase -- TODO confirm.
        if not word or word.lower() in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return " ".join(stems)

In [20]:
# Stem the text and remove stop words
recs['full_text'] = recs['full_text'].progress_apply(stem_text)

  0%|          | 0/1360 [00:00<?, ?it/s]

In [21]:
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and does not update `recs` when pandas Copy-on-Write is active.
recs["full_text"] = recs["full_text"].fillna("")

In [22]:
# Show the first rows of the cleaned frame.
recs.head()

Unnamed: 0,user,vids_watched,vid,views,likes,dislikes,duration,channel,full_text
0,scriptiebot16a@gmail.com,1,N8JLfT0r058,1062896,3452.0,213.0,PT4M,UCiP3zWFVsqM9zXl61JZ_tCw,wreckingbal helicopt use remov unstabl rock no...
1,scriptiebot16a@gmail.com,1,Iha2SxMZvWY,87928,1769.0,27.0,PT8M10S,UCueOZEvMffxjJyVO1fGiJNQ,bodi overflow auto import display block import...
2,scriptiebot16a@gmail.com,1,VbAEO6La7i4,2081,,,PT5M51S,UCLi_ZKbpFFBT5Q1_FzUqh4g,find schism206 content old new find older schi...
3,scriptiebot16a@gmail.com,1,hpXCachboOw,1970742,31981.0,744.0,PT5M55S,UC4eegkSVzV56kTrSpvL6BKQ,world longest indoor mountain bike trail subur...
4,scriptiebot16a@gmail.com,1,h4T_LlK1VE4,36909540,1712505.0,20173.0,PT22M11S,UCY1kMZp36IQSyNx_9h4mpCg,glitterbomb 3.0 vs porch pirat merri christma ...


In [23]:
# Write the cleaned frame to a CSV file in the current working directory.
recs.to_csv("recommendations_strat_4_clean_temp.csv")