<a href="https://colab.research.google.com/github/Sagaust/BD_Projects/blob/main/AfricanPedia_Youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
class YouTubeOperations:
    """Collect YouTube video metadata, comments and replies into CSV files.

    The constructor reads a spreadsheet whose first column holds YouTube URLs,
    converts them to video IDs, and (re)creates the metadata CSV with a header
    row. ``round_up`` then drives the whole collection run.

    Output files (all relative to the current working directory):
      * ``metadata.csv``                           - one row per video
      * ``./comments/<video_id>_comments.csv``     - top-level comments
      * ``./replies/<video_id>_replies.csv``       - replies to those comments
      * ``./other_videos_from_channels/<channel_id>.csv`` - channel listings
    """

    # Default API key; set ``YouTubeOperations.API_KEY`` (or pass ``api_key``
    # explicitly) before making requests.
    API_KEY = ''
    # File that accumulates one metadata row per successfully fetched video.
    METADATA_FILE = "metadata.csv"

    def __init__(self, path):
        """Load the video IDs from ``path`` and start a fresh metadata CSV.

        Args:
            path: Location of the spreadsheet passed to ``pandas.read_excel``.
        """
        self.path = path
        self.channels_ids = []
        # API clients keyed by developer key, filled lazily by _get_client.
        self._clients = {}
        self.youtube_urls = self.urls_to_id()
        # mode='w' truncates any metadata file left over from an earlier run.
        with open(self.METADATA_FILE, mode='w', newline='', encoding='utf-8') as meta:
            csv.writer(meta).writerow([
                "Video_ID",
                "Channel_ID",
                "Video_Title",
                "Description",
                "Tags",
                "Likes",
                "Views",
                "Comments",
                "Published_At",
            ])

    def urls_to_id(self):
        """Return a DataFrame with one ``URL`` column of unique video IDs.

        Only the first spreadsheet column is read; it is named ``URL``
        regardless of its header. Cells from which no video ID can be
        extracted are dropped so they are never sent to the API.
        """
        df = pd.read_excel(self.path, usecols=[0])
        df.columns = ["URL"]
        df["URL"] = df["URL"].apply(self.extract_youtube_video_id)
        df = df.dropna(subset=["URL"]).drop_duplicates().reset_index(drop=True)
        return df

    def extract_youtube_video_id(self, url):
        """Extract the 11-character video ID from a YouTube URL.

        Recognises ``watch?v=``, ``youtu.be/``, ``embed/`` and ``shorts/``
        forms.

        Returns:
            The video ID, or ``None`` when ``url`` is not a string (e.g. an
            empty spreadsheet cell read as NaN) or contains no recognisable ID.
        """
        if not isinstance(url, str):
            return None
        match = re.search(r"(v=|youtu\.be/|embed/|shorts/)([a-zA-Z0-9_-]{11})", url)
        return match.group(2) if match else None

    def _get_client(self, api_key=None):
        """Return a YouTube Data API client for ``api_key``, building it once.

        ``None`` means "use the class-level ``API_KEY`` as it is right now",
        so assigning ``API_KEY`` after the class was defined takes effect.
        """
        key = self.API_KEY if api_key is None else api_key
        clients = getattr(self, "_clients", None)
        if clients is None:
            # Instance was created without running __init__.
            clients = self._clients = {}
        if key not in clients:
            clients[key] = build('youtube', 'v3', developerKey=key)
        return clients[key]

    def fetch_video_metadata(self, video_id, api_key=None):
        """Fetch one video's metadata and append it to the metadata CSV.

        The video's channel ID is also recorded in ``self.channels_ids``.

        Args:
            video_id: ID of the video to look up.
            api_key: Developer key; defaults to the class-level ``API_KEY``.

        Returns:
            The channel ID of the video, or ``None`` when the request fails
            or the API returns no item for ``video_id``.
        """
        try:
            youtube = self._get_client(api_key)
            response = youtube.videos().list(
                part='snippet,contentDetails,statistics',
                id=video_id,
            ).execute()
            item = response["items"][0]
        except HttpError as e:
            print(f"An error occurred fetching metadata for video ID {video_id}: {e}")
            return None
        except IndexError as ie:
            print(f"Could not generate items: {ie}")
            return None

        video_data = item["snippet"]
        statistics = item.get("statistics", {})

        self.channels_ids.append(video_data['channelId'])

        print(video_id)

        # Insertion order matches the header row written in __init__.
        values = {
            'video_id': video_id,
            'channelId': video_data['channelId'],
            'title': video_data['title'],
            'description': video_data.get("description"),
            'tags': " ".join(video_data["tags"]) if "tags" in video_data else None,
            'likeCount': statistics.get("likeCount"),
            'viewCount': statistics.get("viewCount"),
            'commentCount': statistics.get("commentCount"),
            'publishedAt': video_data.get("publishedAt"),
        }

        with open(self.METADATA_FILE, mode='a+', newline='', encoding='utf-8') as meta:
            csv.writer(meta).writerow(list(values.values()))

        return values["channelId"]

    @staticmethod
    def _parse_comment_thread(item, video_id):
        """Split one ``commentThreads`` item into (comment dict, reply dicts).

        Key order in both dict shapes matters: ``save_data_to_csv`` renames
        the resulting DataFrame columns positionally.
        """
        top_level = item['snippet']['topLevelComment']
        comment = top_level['snippet']
        comment_info = {
            'commentID': top_level['id'],
            'videoID': video_id,
            'author': comment['authorDisplayName'],
            'authorChannelId': (comment.get('authorChannelId') or {}).get('value'),
            'text': comment['textDisplay'],
            'likeCount': comment['likeCount'],
            'publishedAt': comment['publishedAt'],
        }

        reply_infos = []
        for reply in item.get('replies', {}).get('comments', []):
            # Reply IDs have the form "<parent comment id>.<reply id>".
            parent_id, _, reply_id = reply["id"].partition(".")
            snippet = reply['snippet']
            reply_infos.append({
                'replyID': reply_id,
                'CommentID': parent_id,
                'author': snippet['authorDisplayName'],
                'authorChannelId': (snippet.get('authorChannelId') or {}).get('value'),
                'text': snippet['textDisplay'],
                'likeCount': snippet['likeCount'],
                'publishedAt': snippet['publishedAt'],
            })
        return comment_info, reply_infos

    def fetch_youtube_comments(self, video_id, api_key=None):
        """Fetch top-level comments, replies, and commenter information.

        Args:
            video_id: ID of the video whose comment threads are requested.
            api_key: Developer key; defaults to the class-level ``API_KEY``.

        Returns:
            A ``(comments, replies)`` tuple of lists of dicts, or ``None``
            when the API raises ``HttpError``.
        """
        comments = []
        replies = []
        try:
            youtube = self._get_client(api_key)
            request = youtube.commentThreads().list(
                part='snippet,replies',
                videoId=video_id,
                maxResults=100,
                textFormat='plainText',
            )

            # list_next() returns None once there is no further page.
            while request is not None:
                response = request.execute()
                # NOTE(review): the 'replies' part of a thread may hold only a
                # subset of its replies - confirm against the API docs if
                # complete reply coverage is required.
                for item in response.get('items', []):
                    comment_info, reply_infos = self._parse_comment_thread(item, video_id)
                    comments.append(comment_info)
                    replies.extend(reply_infos)
                request = youtube.commentThreads().list_next(request, response)

        except HttpError as e:
            print(f"An error occurred fetching comments for video ID {video_id}: {e}")
            return None

        return comments, replies

    def save_data_to_csv(self, video_id):
        """Save video metadata, comments, and replies to CSV files.

        The metadata row is appended by ``fetch_video_metadata``. Comments and
        replies are written to per-video files, but only when at least one
        comment was fetched; a video with comments and no replies gets a
        header-only replies file.

        Returns:
            ``None`` in every case.
        """
        comments_path = Path(f"./comments/{video_id}_comments.csv")
        replies_path = Path(f"./replies/{video_id}_replies.csv")
        comments_column_names = [
            "CommentID", "Video_ID", "Commenter_Name", "Commenter_Channel_ID", "Comment_Text", "Likes", "Published_At"
        ]
        replies_column_names = [
            "ReplyID", "CommentID", "Commenter_Name", "Commenter_Channel_ID", "Comment_Text", "Likes", "Published_At"
        ]

        self.fetch_video_metadata(video_id)

        fetched = self.fetch_youtube_comments(video_id)
        if fetched is None:
            # The comment request failed; the error was already reported.
            return None
        comments, replies = fetched
        if not comments:
            return None

        comments_path.parent.mkdir(parents=True, exist_ok=True)
        replies_path.parent.mkdir(parents=True, exist_ok=True)

        comments_df = pd.DataFrame(comments)
        comments_df.columns = comments_column_names
        comments_df.to_csv(comments_path, index=False)

        if replies:
            replies_df = pd.DataFrame(replies)
            replies_df.columns = replies_column_names
        else:
            # An empty list yields a frame with zero columns, which cannot be
            # renamed positionally; build the empty frame with its header.
            replies_df = pd.DataFrame(columns=replies_column_names)
        replies_df.to_csv(replies_path, index=False)
        return None

    def get_channel_videos(self, channel_id):
        """Write the ID and title of every video listed for ``channel_id``.

        The listing comes from ``scrapetube.get_channel`` and is saved to
        ``./other_videos_from_channels/<channel_id>.csv``.
        """
        rows = [
            {
                "channelId": channel_id,
                "videoId": video['videoId'],
                "videoTitle": video['title']['runs'][0]['text'],
            }
            for video in scrapetube.get_channel(channel_id)
        ]
        channel_df = pd.DataFrame(rows, columns=["channelId", "videoId", "videoTitle"])
        out_path = Path(f"./other_videos_from_channels/{channel_id}.csv")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        channel_df.to_csv(out_path, index=False)

    def round_up(self):
        """Run the full collection: every video first, then every channel.

        A failure on a single video or channel is reported and skipped so the
        rest of the batch still runs.
        """
        for video_id in self.youtube_urls["URL"]:
            try:
                self.save_data_to_csv(video_id)
            except Exception as e:  # best-effort batch: report, then carry on
                print(f"Skipping video ID {video_id}: {e!r}")
        print("Done with Videos, Comments, and Replies")

        # Unique channel IDs in first-seen order.
        channel_ids = dict.fromkeys(c for c in self.channels_ids if c is not None)
        for channel_id in channel_ids:
            try:
                self.get_channel_videos(channel_id)
            except Exception as e:  # best-effort batch: report, then carry on
                print(f"Skipping channel ID {channel_id}: {e!r}")
        print("Ces't finis")