<a href="https://colab.research.google.com/github/RishabhJha395/ytsentiment_analysis/blob/main/Copy_of_youtubeCommentSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from googleapiclient.discovery import build
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt
import nltk, re, os

# Fetch the NLTK resources this notebook asks for, one after the other.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)


In [None]:
from google.colab import userdata

# Read the key from Colab's user secrets (stored under the name
# "YOUTUBE_API_KEY") so the key itself never appears in the notebook.
# The API helper functions below pass it to build() as developerKey.
API_KEY = userdata.get("YOUTUBE_API_KEY")

In [None]:
# Tried in order; the first pattern is the original `v=` query form, so any
# URL that was accepted before still resolves to exactly the same id.
_VIDEO_ID_PATTERNS = (
    re.compile(r"v=([a-zA-Z0-9_-]{11})"),                         # watch?v=<id>
    re.compile(r"youtu\.be/([a-zA-Z0-9_-]{11})"),                 # short share links
    re.compile(r"/(?:shorts|embed|live|v)/([a-zA-Z0-9_-]{11})"),  # path-style links
)


def extract_video_id(url):
    """Extract the 11-character video id from a YouTube URL.

    Supported forms: ``watch?v=<id>`` (anywhere in the query string),
    ``youtu.be/<id>``, and the path styles ``/shorts/<id>``, ``/embed/<id>``,
    ``/live/<id>`` and ``/v/<id>``.

    Args:
        url: The URL (or any string containing one) to inspect.

    Returns:
        The video id as a string.

    Raises:
        ValueError: If none of the supported forms is found in ``url``.
    """
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    raise ValueError("Invalid YouTube video URL")


In [None]:
def fetch_video_details(video_id):
    """Fetch title, channel, category and statistics for one video.

    Makes two YouTube Data API calls: ``videos().list`` for the snippet and
    statistics, then ``videoCategories().list`` to turn the category id into
    a readable name.

    Args:
        video_id: The id of the video to look up.

    Returns:
        A dict with the keys ``title``, ``channel``, ``published_date``,
        ``category``, ``views``, ``likes`` and ``comments``, or ``None`` when
        the API returns no item for ``video_id``. Counts that are missing
        from the response are reported as 0; a category that cannot be
        resolved is reported as ``"Unknown"``.
    """
    youtube = build('youtube', 'v3', developerKey=API_KEY)

    response = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    ).execute()

    items = response.get("items") or []
    if not items:
        return None

    data = items[0]
    snippet = data["snippet"]
    stats = data.get("statistics", {})

    # Fetch category name (requires extra API call). The lookup can come back
    # empty, so fall back to a placeholder instead of indexing blindly.
    category_name = "Unknown"
    category_id = snippet.get("categoryId")
    if category_id:
        cat_response = youtube.videoCategories().list(
            part="snippet", id=category_id
        ).execute()
        cat_items = cat_response.get("items") or []
        if cat_items:
            category_name = cat_items[0]["snippet"]["title"]

    return {
        "title": snippet["title"],
        "channel": snippet["channelTitle"],
        "published_date": snippet["publishedAt"],
        "category": category_name,
        # Each count defaults to 0 when the API omits it from the statistics.
        "views": int(stats.get("viewCount", 0)),
        "likes": int(stats.get("likeCount", 0)),
        "comments": int(stats.get("commentCount", 0)),
    }


In [None]:
def process_youtube_video(video_url):
    """Run the whole comment-sentiment pipeline for one YouTube video.

    Resolves the video id, prints the video's metadata, downloads its
    comments, cleans and labels each one with TextBlob, prints the
    percentage of each label, writes the labelled comments to
    ``youtube_comments_with_sentiment.csv``, shows a pie chart and prints the
    like-to-view ratio.

    Args:
        video_url: URL of the video to analyse.

    Returns:
        A tuple ``(df, summary, video_info)``: the DataFrame with the columns
        ``Comment``, ``Cleaned`` and ``Sentiment``; a dict mapping each
        sentiment label to its share in percent; and the metadata dict from
        ``fetch_video_details``.

    Raises:
        ValueError: If no video id can be extracted from ``video_url`` or the
            API returns no video for that id.
    """
    # Only the NLTK helpers are imported locally; re, TextBlob, pd and plt
    # come from the notebook's import cell.
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer

    # --- Extract video info ---
    video_id = extract_video_id(video_url)
    video_info = fetch_video_details(video_id)
    if video_info is None:
        # fetch_video_details returns None when the API has no such video.
        raise ValueError(f"No video found for id {video_id!r}")

    print("🎥 Video Details:")
    for k, v in video_info.items():
        print(f"{k.title()}: {v}")
    print()

    # --- Fetch comments ---
    comments = fetch_comments(video_id)
    df = pd.DataFrame(comments, columns=["Comment"])

    # --- Clean and Analyze ---
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        """Strip URLs and non-letters, lowercase, drop stop words, stem."""
        text = re.sub(r"http\S+", "", text)         # remove URLs
        text = re.sub(r"[^a-zA-Z]", " ", text)      # keep only letters
        words = text.lower().split()
        return " ".join(ps.stem(w) for w in words if w not in stop_words)

    def analyze_sentiment(text):
        """Label text Positive / Negative / Neutral from TextBlob polarity."""
        # NOTE(review): the text scored here is already stemmed and has its
        # stop words removed, which may hide words TextBlob would otherwise
        # recognise. Consider scoring the raw comment instead - confirm
        # before changing, since the histogram cell scores the same column.
        if not text.strip():
            return "Neutral"
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.1:
            return "Positive"
        if polarity < -0.1:
            return "Negative"
        return "Neutral"

    df["Cleaned"] = df["Comment"].apply(clean_text)
    df["Sentiment"] = df["Cleaned"].apply(analyze_sentiment)

    # --- Summary ---
    counts = df["Sentiment"].value_counts().to_dict()
    total = len(df)
    # `counts` is empty when there are no comments, so nothing divides by 0.
    summary = {label: round(n / total * 100, 2) for label, n in counts.items()}

    print("\n📊 Sentiment Distribution (in %):")
    for k, v in summary.items():
        print(f"{k}: {v}%")

    # --- Save results ---
    df.to_csv("youtube_comments_with_sentiment.csv", index=False)
    print("\n💾 Saved comments with sentiments to CSV")

    # --- Visualization ---
    if summary:
        plt.figure(figsize=(5, 5))
        plt.pie(summary.values(), labels=summary.keys(), autopct='%1.1f%%', startangle=90)
        plt.title("YouTube Comment Sentiment Pie Chart")
        plt.show()
    else:
        print("\nNo comments were returned, so there is no sentiment chart to draw.")

    # --- Like Ratio ---
    like_ratio = 0
    if video_info["views"] > 0:
        like_ratio = round(video_info["likes"] / video_info["views"] * 100, 2)

    print(f"\n👍 Like Ratio: {like_ratio}% of viewers liked this video")

    # Return data for further use
    return df, summary, video_info


In [None]:
from googleapiclient.discovery import build
import pandas as pd




def fetch_comments(video_id, max_results=100):
    """Download up to ``max_results`` top-level comments of a video.

    Pages through ``commentThreads().list`` until enough comments have been
    collected or the API reports no further page.

    Args:
        video_id: The id of the video whose comments are fetched.
        max_results: Upper bound on the number of comments returned.
            Values of 0 or less yield an empty list without any API call.

    Returns:
        A list of at most ``max_results`` comment texts in plain-text format.
    """
    if max_results <= 0:
        return []

    youtube = build("youtube", "v3", developerKey=API_KEY)
    comments = []

    # YouTube comments API
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=min(max_results, 100),  # max 100 per page
        textFormat="plainText"
    )

    while request is not None and len(comments) < max_results:
        response = request.execute()

        for item in response.get("items", []):
            comments.append(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            )

        # Handle pagination: list_next yields None once the last page is read.
        request = youtube.commentThreads().list_next(request, response)

    # A page is appended whole, so the last one can overshoot the limit.
    return comments[:max_results]

## Sentiment Analysis with Summary

In [None]:
# Ask for a video URL and run the whole pipeline on it. The three results
# (comment DataFrame, sentiment percentages, video metadata) are kept as
# notebook globals; the cells below read `df`.
video_url = input("Enter YouTube video URL: ")
df, summary, video_info = process_youtube_video(video_url)


## Most Frequent Words

In [None]:

from wordcloud import WordCloud

def _render_wordcloud(text, colormap, title):
    """Draw a word cloud of ``text`` and return it.

    Returns ``None`` (after printing a short notice) when ``text`` contains
    no usable words, because WordCloud.generate raises ValueError then.
    """
    cloud = WordCloud(width=800, height=400, background_color='white', colormap=colormap)
    try:
        cloud.generate(text)
    except ValueError:
        print(f"{title}: no words to plot")
        return None

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(title)
    plt.show()
    return cloud


positive_comments = " ".join(df[df['Sentiment']=='Positive']['Cleaned'])
negative_comments = " ".join(df[df['Sentiment']=='Negative']['Cleaned'])

# 'Greens' is the matplotlib colormap name; 'Green' is rejected as unknown.
wc_pos = _render_wordcloud(positive_comments, 'Greens', "Word Cloud - Positive Comments")
wc_neg = _render_wordcloud(negative_comments, 'Reds', "Word Cloud - Negative Comments")


## Sentiment Polarity Histogram

In [None]:
def _polarity(text):
    """Return TextBlob's polarity score for one piece of text."""
    return TextBlob(text).sentiment.polarity


# Score every cleaned comment, then show how the scores are distributed.
df["Polarity"] = df["Cleaned"].apply(_polarity)

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df["Polarity"], bins=20, color='skyblue', edgecolor='black')
ax.set_title("Sentiment Polarity Distribution")
ax.set_xlabel("Polarity (-1 = Negative, +1 = Positive)")
ax.set_ylabel("Comment Count")
plt.show()


## Emotion Classification

In [None]:
from transformers import pipeline

# Load the emotion classification model
# Build the text-classification pipeline around the emotion model.
EMOTION_MODEL = "j-hartmann/emotion-english-distilroberta-base"
emotion_classifier = pipeline("text-classification", model=EMOTION_MODEL)

# Classify one comment typed in by the user and show its label in capitals.
sample = input("Enter the Comment for which you want the emotion     :")
predictions = emotion_classifier(sample)
result = predictions[0]   # a single input gives a single prediction
print(result['label'].upper())



### Keyword Analysis

In [None]:
from collections import Counter

# Count every word of the cleaned comments and keep the ten most frequent.
all_words = " ".join(df["Cleaned"]).split()
common_words = Counter(all_words).most_common(10)
print(common_words)

if common_words:
    words, counts = zip(*common_words)
    plt.bar(words, counts)
    plt.title("Top 10 Common Words in Comments")
    plt.xticks(rotation=45)
    plt.show()
else:
    # zip(*[]) yields nothing to unpack, so skip the chart when no words exist.
    words, counts = (), ()
    print("No words available to plot.")


In [None]:
from transformers import pipeline

# Name the checkpoint explicitly (it is the library's default for this task)
# so the result does not depend on whichever default a given version picks.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Summarise the first 50 raw comments as one text.
comments_text = " ".join(df["Comment"][:50])

if comments_text.strip():
    summary_text = summarizer(
        comments_text,
        max_length=80,
        min_length=25,
        do_sample=False,
        truncation=True,  # 50 joined comments can exceed the model's input limit
    )
    print(summary_text[0]['summary_text'])
else:
    summary_text = []
    print("No comments available to summarize.")
