In [1]:
# 📦 STEP 1: Install required libraries
!pip install google-api-python-client textblob nltk

# 📥 STEP 2: Import packages
import pandas as pd
import time
import re
import nltk
import pickle
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from googleapiclient.discovery import build
import warnings
warnings.filterwarnings("ignore")


# 🧠 STEP 3: Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# 🔐 STEP 4: YouTube API Setup
api_key = "AIzaSyCXwVJPuDmFBhnZR1jp2JR8hlgjUPCoH0o"
youtube = build('youtube', 'v3', developerKey=api_key)

# 💬 STEP 5: Comment Fetching Function
def get_comments(video_id, limit=5000, pause=1):
    comments_data = []
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=100,
        textFormat="plainText"
    )
    response = request.execute()

    while response and len(comments_data) < limit:
        for item in response['items']:
            snippet = item['snippet']['topLevelComment']['snippet']
            comments_data.append({
                "video_id": video_id,
                "comment_id": item['id'],
                "author": snippet.get("authorDisplayName", "Unknown"),
                "comment": snippet.get("textDisplay", ""),
                "like_count": snippet.get("likeCount", 0),
                "published_at": snippet.get("publishedAt", ""),
                "updated_at": snippet.get("updatedAt", ""),
                "reply_count": item['snippet'].get("totalReplyCount", 0)
            })
            if len(comments_data) >= limit:
                break

        if 'nextPageToken' in response and len(comments_data) < limit:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                pageToken=response['nextPageToken'],
                maxResults=100,
                textFormat="plainText"
            )
            response = request.execute()
            time.sleep(pause)
        else:
            break

    return pd.DataFrame(comments_data)

# 🧹 STEP 6: Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

# ❤️ STEP 7: Sentiment Labeling
def get_sentiment(text):
    if pd.isnull(text) or text.strip() == "":
        return "neutral"
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return "positive"
    elif polarity < 0:
        return "negative"
    else:
        return "neutral"

# 🚀 STEP 8: Full Pipeline Execution
video_id = "Z4hVGCWH1Kc"  # MrBeast video
print("Fetching comments...")
df = get_comments(video_id, limit=5000)
print("Total comments fetched:", len(df))

print("Preprocessing text and analyzing sentiment...")
df["clean_comment"] = df["comment"].apply(preprocess_text)
df["sentiment"] = df["clean_comment"].apply(get_sentiment)

# Select final 10 columns
df_final = df[[
    "video_id", "comment_id", "author", "comment", "like_count",
    "published_at", "updated_at", "reply_count", "clean_comment", "sentiment"
]]

print("Final shape:", df_final.shape)

# 💾 STEP 9: Save using Pickle
with open("youtube_comments_sentiment.pkl", "wb") as f:
    pickle.dump(df_final, f)

print("Saved as youtube_comments_sentiment.pkl ✅")

# ✅ STEP 10: Load and verify
with open("youtube_comments_sentiment.pkl", "rb") as f:
    loaded_df = pickle.load(f)

loaded_df.head()



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAMAKRISHNA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAMAKRISHNA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RAMAKRISHNA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fetching comments...
Total comments fetched: 5000
Preprocessing text and analyzing sentiment...
Final shape: (5000, 10)
Saved as youtube_comments_sentiment.pkl ✅


Unnamed: 0,video_id,comment_id,author,comment,like_count,published_at,updated_at,reply_count,clean_comment,sentiment
0,Z4hVGCWH1Kc,Ugw_Rsdge_gVn71T1uZ4AaABAg,@MrBeast,Go to https://www.teamwater.org to donate!,29400,2025-08-01T16:02:00Z,2025-08-01T16:02:00Z,560,go httpswwwteamwaterorg donate,neutral
1,Z4hVGCWH1Kc,UgzpYUhNfLQywSicX-h4AaABAg,@marvleman,Wee ar always with u,0,2025-08-04T12:37:27Z,2025-08-04T12:37:27Z,0,wee ar always u,neutral
2,Z4hVGCWH1Kc,UgwRpyR7deCiusgHpnZ4AaABAg,@Svsilva34,🫶🔥,0,2025-08-04T12:36:36Z,2025-08-04T12:36:36Z,0,,neutral
3,Z4hVGCWH1Kc,UgwRe_f0CQeuLuznIeF4AaABAg,@EducationEntertainment-vm3ho,Namaste sir can I ask a question ❓ I from Nepal,1,2025-08-04T12:34:58Z,2025-08-04T12:34:58Z,0,namaste sir ask question nepal,neutral
4,Z4hVGCWH1Kc,UgxMs45zcpDGEBq19Yh4AaABAg,@JohnKombi-x6d,Cam to DRC 🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩🇨🇩,0,2025-08-04T12:34:52Z,2025-08-04T12:34:52Z,0,cam drc,neutral
