### Getting data

In [None]:
import os
import re
import csv
import openai
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set your YouTube API key here
YOUTUBE_API_KEY = "youtube"

# Download VADER lexicon if not already available
nltk.download('vader_lexicon', quiet=True)

#########################
# Step 1: Data Retrieval
#########################

def extract_video_id(url):
    """
    Extract the video ID from a YouTube URL.
    Supports standard and shortened URLs.
    """
    regex = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
    match = re.search(regex, url)
    if match:
        return match.group(1)
    else:
        raise ValueError("Invalid YouTube URL provided.")

def get_youtube_service():
    """
    Create and return a YouTube service object using the API key.
    """
    return build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

def fetch_video_title(video_id):
    """
    Retrieve the video title using the YouTube Data API.
    """
    youtube = get_youtube_service()
    request = youtube.videos().list(part="snippet", id=video_id)
    response = request.execute()
    if response.get("items"):
        return response["items"][0]["snippet"]["title"]
    return None

def fetch_video_comments(video_id, max_results=100):
    """
    Fetch up to 'max_results' comments for the given video.
    """
    youtube = get_youtube_service()
    comments = []
    request = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        maxResults=max_results,
        textFormat="plainText"
    )
    response = request.execute()

    while True:
        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
        if "nextPageToken" in response:
            request = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=max_results,
                pageToken=response["nextPageToken"],
                textFormat="plainText"
            )
            response = request.execute()
        else:
            break
    return comments

def fetch_video_transcript(video_id):
    """
    Attempt to retrieve the video transcript.
    If unavailable via the YouTube transcript API, this is where you could integrate a Whisper ASR.
    """
    try:
        transcript_segments = YouTubeTranscriptApi.get_transcript(video_id)
        transcript = " ".join(segment["text"] for segment in transcript_segments)
        return transcript
    except Exception as e:
        print(f"Transcript not available via API: {e}")
        # Placeholder: Integrate Whisper ASR for audio transcription if needed.
        return None

def analyze_video(video_url):
    """
    Given a YouTube video URL, fetch and return the video title, transcript, and comments.
    """
    video_id = extract_video_id(video_url)
    title = fetch_video_title(video_id)
    transcript = fetch_video_transcript(video_id)
    comments = fetch_video_comments(video_id)
    return {
        "video_id": video_id,
        "title": title,
        "transcript": transcript,
        "comments": comments
    }

def save_comments_to_csv(comments, filename="video_comments.csv"):
    """
    Save a list of comment strings to a CSV file with a single column 'comment'.
    """
    with open(filename, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["comment"])
        for comment in comments:
            writer.writerow([comment])
    print(f"Saved {len(comments)} comments to {filename}")

def load_comments_from_csv(filename="video_comments.csv"):
    """
    Load comments from a CSV file and return them as a list of strings.
    Assumes the CSV file has a header with the column 'comment'.
    """
    comments = []
    with open(filename, "r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            comments.append(row["comment"])
    return comments


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Retrieve video data
video_url = input("Enter YouTube video URL: ")
video_data = analyze_video(video_url)
print("\n--- Video Data ---")
print("Video Title:", video_data["title"])
if video_data["transcript"]:
    print("Transcript (first 500 chars):", video_data["transcript"][:500] + "...")
else:
    print("Transcript: Not available.")
print("Number of comments fetched:", len(video_data["comments"]))

# Save comments to CSV
save_comments_to_csv(video_data["comments"], "video_comments.csv")

# Load comments from CSV for sentiment analysis
comments_from_csv = load_comments_from_csv("video_comments.csv")


--- Video Data ---
Video Title: Re-Grading The TERRIBLE 2024 NBA Draft...
Transcript (first 500 chars): the 2024 NBA draft was this unbelievable class we all knew that at the time there was so much talent in this okay that obviously that wasn't true this class was terrible everybody knew it was terrible at the time but now that we've seen these guys play for a couple months I wanted to go back and regrade all these picks because there's some really interesting guys at the top of this class also I'd be super down to do this for a bunch of other draft classes in the past as well but only if you guys...
Number of comments fetched: 283
Saved 283 comments to video_comments.csv


### Scoring system

In [3]:
tokenizer = AutoTokenizer.from_pretrained("rinapch/distilbert-media-bias")
model = AutoModelForSequenceClassification.from_pretrained("rinapch/distilbert-media-bias")

def calculate_bias_score(transcript):
    """
    Calculate a bias score for a transcript using a media bias classifier model.

    Assumptions:
    - The model outputs probabilities for multiple bias classes.
    - Index 1 corresponds to the "Center" (neutral) class.

    The bias score is defined as the neutral probability.
    If the transcript is empty, a neutral score of 0.5 is returned.
    """
    if not transcript:
        return 0.5  # Assume neutral if transcript is unavailable

    # Tokenize the transcript with truncation to manage long inputs.
    inputs = tokenizer(transcript, return_tensors="pt", truncation=True)

    # Obtain the model's logits.
    outputs = model(**inputs)
    logits = outputs.logits

    # Convert logits to probabilities using softmax.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)

    # Assume index 1 corresponds to the "Center" (neutral) class.
    neutral_probability = probabilities[0][1].item()

    # The bias score is the neutral probability, ensuring it is within [0, 1].
    bias_score = max(0, min(1, neutral_probability))
    return bias_score

def calculate_misinformation_score(transcript):
    """
    Calculate a misinformation score based on the transcript.
    A robust implementation would integrate a fact-checking API (e.g., Google Fact-Check Tools API)
    to verify claims made in the video.

    Here we use a dummy heuristic by checking for sentences that include numerical claims,
    assuming that more concrete claims might be more verifiable.
    """
    if not transcript:
        return 0.5  # Neutral if transcript is unavailable

    sentences = transcript.split(".")
    claim_sentences = [s for s in sentences if re.search(r"\d+", s)]
    claim_count = len(claim_sentences)
    # Normalize by total sentences: more claims (up to a point) can lower misinformation risk.
    score = claim_count / (len(sentences) + 1)
    score = min(1, score)
    return score

def calculate_toxic_score(comments):
   """
    Calculate the average sentiment score from the video comments using VADER.
    The compound sentiment score from VADER ranges from -1 (negative) to 1 (positive).
    This function normalizes it to a 0-to-1 scale.
    """

   model_path = "martin-ha/toxic-comment-model"
   tokenizer = AutoTokenizer.from_pretrained(model_path)
   model = AutoModelForSequenceClassification.from_pretrained(model_path)
   pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

   if not comments:
      return 0.5  # Default to neutral if no comments are provided

   total_score = 0.0
   for comment in comments:
      results = pipeline(comment)
      # Handle potential nesting: if results is a list of lists, flatten it
      if results and isinstance(results[0], list):
          results = results[0]

      non_toxic_score = None
      # Look for a label indicating non-toxicity
      for res in results:
          label = res['label'].lower()
          if label in ['non_toxic', 'non-toxic', 'clean']:
              non_toxic_score = res['score']
              break
      # Fallback: if non-toxic isn't explicitly provided, invert the toxic score if available
      if non_toxic_score is None:
          for res in results:
              if res['label'].lower() == 'toxic':
                  non_toxic_score = 1 - res['score']
                  break
      # If neither label is found, default to a neutral score
      if non_toxic_score is None:
          non_toxic_score = 0.5

      total_score += non_toxic_score

   average_score = total_score / len(comments)
   return average_score

def calculate_integrity_score(bias_score, misinformation_score, sentiment_score):
    """
    Combine the sub-scores into an overall integrity score (scale 1-10) using a weighted approach.
    Weights:
      - Bias: 40%
      - Misinformation: 40%
      - Sentiment: 20%
    """
    weighted = (bias_score * 0.4) + (misinformation_score * 0.4) + (toxic_score * 0.2)
    integrity_score = 1 + (weighted * 9)  # Scale from [0,1] to [1,10]
    explanation = (f"Weighted calculation: bias={bias_score:.2f} (40%), "
                   f"misinformation={misinformation_score:.2f} (40%), "
                   f"toxic={toxic_score:.2f} (20%). "
                   f"Final integrity score: {integrity_score:.2f}/10.")
    return integrity_score, explanation

In [4]:
bias_score = calculate_bias_score(video_data["transcript"])
misinformation_score = calculate_misinformation_score(video_data["transcript"])
toxic_score = calculate_toxic_score(video_data["comments"])

print("\n--- Sub-Scores ---")
print(f"Bias Score: {bias_score:.2f}")
print(f"Misinformation Score: {misinformation_score:.2f}")
print(f"Sentiment Score: {toxic_score:.2f}")

# Combine sub-scores into an overall integrity score
integrity_score, explanation = calculate_integrity_score(bias_score, misinformation_score, toxic_score)
print("\n--- Integrity Report ---")
print("Overall Integrity Score:", integrity_score)
print("Explanation:", explanation)

Device set to use mps:0



--- Sub-Scores ---
Bias Score: 0.05
Misinformation Score: 0.67
Sentiment Score: 0.92

--- Integrity Report ---
Overall Integrity Score: 5.242932601149322
Explanation: Weighted calculation: bias=0.05 (40%), misinformation=0.67 (40%), toxic=0.92 (20%). Final integrity score: 5.24/10.


### Sentiment analysis and separate comment in to good and bad file

In [None]:
# sentiment analysis done and save to 2 file bad_review.csv and good_review.csv



# Load the sentiment analysis model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

def analyze_sentiment(text):
    """Get sentiment score from text using BERT-based model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    scores = outputs.logits.softmax(dim=1)  # Convert to probabilities
    sentiment_score = torch.argmax(scores).item() + 1  # Convert index to sentiment score (1-5)
    return sentiment_score

def categorize_and_save_reviews(input_csv, good_reviews_csv, bad_reviews_csv):
    """Categorizes comments into good and bad reviews based on sentiment score."""
    with open(input_csv, "r", encoding="utf-8") as infile, \
         open(good_reviews_csv, "w", encoding="utf-8", newline="") as good_outfile, \
         open(bad_reviews_csv, "w", encoding="utf-8", newline="") as bad_outfile:

        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames + ["sentiment_score"]

        # Prepare CSV writers
        good_writer = csv.DictWriter(good_outfile, fieldnames=fieldnames)
        bad_writer = csv.DictWriter(bad_outfile, fieldnames=fieldnames)

        # Write headers
        good_writer.writeheader()
        bad_writer.writeheader()

        for row in reader:
            comment_text = row["comment"]
            sentiment = analyze_sentiment(comment_text)
            row["sentiment_score"] = sentiment

            # Categorize based on sentiment score
            if sentiment >= 3:
                good_writer.writerow(row)
            else:
                bad_writer.writerow(row)

    print(f"Sentiment analysis completed.")
    print(f"Good reviews saved to: {good_reviews_csv}")
    print(f"Bad reviews saved to: {bad_reviews_csv}")

# Run sentiment analysis and categorize comments
categorize_and_save_reviews(
    "video_comments.csv",
    "good_reviews.csv",
    "bad_reviews.csv"
)

Sentiment analysis completed.
Good reviews saved to: good_reviews.csv
Bad reviews saved to: bad_reviews.csv


In [7]:
# Prepare the percentage of good review for streamlit dashboard in future

def count_reviews(good_reviews_csv, bad_reviews_csv):
    """Count the number of good and bad reviews from CSV files."""

    def count_rows(csv_file):
        """Helper function to count rows in a CSV file (excluding the header)."""
        with open(csv_file, "r", encoding="utf-8") as file:
            reader = csv.reader(file)
            next(reader)  # Skip header
            return sum(1 for _ in reader)

    good_count = count_rows(good_reviews_csv)
    bad_count = count_rows(bad_reviews_csv)

    print(f"Total Good Reviews: {good_count}")
    print(f"Total Bad Reviews: {bad_count}")

    return good_count, bad_count

# Run the function to count reviews
good_count, bad_count = count_reviews("good_reviews.csv", "bad_reviews.csv")

good_percentage = good_count/(good_count + bad_count)
print(good_percentage)

Total Good Reviews: 169
Total Bad Reviews: 114
0.5971731448763251


### LLM for summary and generation

In [None]:


# OpenAI API Key (Replace with your own key)
OPENAI_API_KEY = "openai"

# Set up OpenAI client
openai.api_key = OPENAI_API_KEY

def read_reviews(csv_file):
    """Read comments from CSV file and return as a single text block."""
    reviews = []
    with open(csv_file, "r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        
        if reader.fieldnames:
            print(f"Detected column headers: {reader.fieldnames}")  # Debugging output
        
        for row in reader:
            reviews.append(row.get("text", "").strip())  # Use .get() to avoid KeyError
    
    return " ".join(reviews)


def summarize_good(text, review_type):
    """Use GPT-4o mini API to summarize reviews."""
    prompt = f"""
    You are a professional YouTube content strategist. Your task is to summarize the following {review_type} reviews.
    Identify recurring themes and patterns in the feedback. Highlight what resonates most with the audience. Summarize positive aspects that should be further developed.

    **Reviews:**
    {text}

    **Provide a clear, structured summary of the main points in 3-4 sentences.**
    """

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You are an expert in content analysis."},
                  {"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

def summarize_bad(text, bias_score, misinformation_score, sentiment_score, review_type):
    """Use GPT-4o mini API to analyze reviews and identify areas for improvement."""

    prompt = f"""
    You are a professional YouTube content strategist specializing in media integrity analysis. Your task is to summarize the following {review_type} reviews and integrity score while identifying key areas for improvement.

    **Reviews:**
    {text}

    **Analysis Parameters:**
    - **Bias Score:** {bias_score} (Higher indicates stronger bias)
    - **Misinformation Score:** {misinformation_score} (Higher indicates greater misinformation risk)
    - **Toxic speech Score:** {toxic_score} (Higher indicates greater toxic)

    **Your task:**
    - Summarize key themes and audience request concisely.
    - Identify patterns related to bias, misinformation, and sentiment.
    - Highlight the top one area where the content can be improved to enhance credibility.
    - Highlight the top one area where the content can be improved to enhance engagement.

    **Response Format:**
    - **Summary:** (3-4 sentences capturing the key insights)
    - **Areas of Improvement:** (Bullet points outlining specific issues, and viewer requests)
    - **Recommendations:** (2-3 actionable suggestions for enhancing content quality)
    """

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You are an expert in YouTube content strategy and media integrity analysis."},
                  {"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

def generate_video_ideas(good_summary, bad_summary):
    pass

# Read and summarize reviews
good_reviews_text = read_reviews("good_reviews.csv")
bad_reviews_text = read_reviews("bad_reviews.csv")

print("Summarizing Good Reviews...")
good_summary = summarize_good(good_reviews_text, "good")
print("\nSummarizing Bad Reviews...")
bad_summary = summarize_bad(bad_reviews_text, bias_score, misinformation_score, toxic_score, "bad")

# Generate improved video ideas
print("\nGenerating Better Video Ideas...")
video_ideas = generate_video_ideas(good_summary, bad_summary)

# Print results
print("\n### Summary of Good Reviews ###")
print(good_summary)

print("\n### Summary of Bad Reviews ###")
print(bad_summary)

print("\n### Suggested Video Ideas ###")
print(video_ideas)

Detected column headers: ['comment', 'sentiment_score']
Detected column headers: ['comment', 'sentiment_score']
Summarizing Good Reviews...

Summarizing Bad Reviews...

Generating Better Video Ideas...

### Summary of Good Reviews ###
The reviews consistently praise the channel for its engaging and informative content, highlighting the creator's ability to explain complex topics in a simple and relatable manner. Viewers appreciate the high production quality and the visually appealing presentation, which enhances their learning experience. The friendly, approachable style of the host is frequently mentioned as a key factor that keeps audiences coming back. To further capitalize on this positive reception, the channel could expand on interactive content, such as Q&A sessions or community challenges, to foster deeper viewer engagement.

### Summary of Bad Reviews ###
**Summary:** The reviews highlight significant concerns regarding the high levels of misinformation and toxic speech withi

### Rag LLM