In [None]:
# Install and import required libraries
!pip install praw transformers langdetect textblob  # Add textblob for polarity
import calendar  # For building UTC-based Unix timestamps
import os  # For reading credentials from environment variables
import time  # For timestamp filtering
from getpass import getpass  # For prompting for credentials without echoing them

import pandas as pd
import praw
from google.colab import drive
from langdetect import DetectorFactory, LangDetectException  # For reproducible detection and its error type
from langdetect import detect  # For language detection
from textblob import TextBlob  # For polarity calculation
from transformers import pipeline

In [None]:
# Attach Google Drive to the Colab filesystem so that result files can be written to it
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Set up Reddit API credentials
# SECURITY: real credentials must never be stored in the notebook itself. The client ID and
# secret that used to be hardcoded in this cell should be treated as compromised and
# regenerated at https://www.reddit.com/prefs/apps.
# The values are read from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment variables;
# if one is not set, you are prompted for it (the typed value is not echoed).
reddit_client_id = os.environ.get("REDDIT_CLIENT_ID") or getpass("Reddit client ID: ")
reddit_client_secret = os.environ.get("REDDIT_CLIENT_SECRET") or getpass("Reddit client secret: ")

reddit = praw.Reddit(
    client_id=reddit_client_id,          # Your Reddit app's client ID
    client_secret=reddit_client_secret,  # Your Reddit app's client secret
    user_agent="VAR_Sentiment_Analysis_v1"       # Replace with a descriptive user agent, e.g., "VAR_Sentiment_Analysis_v1"
)
# How to get credentials: Go to https://www.reddit.com/prefs/apps, create an app, and copy the details.

In [None]:
# League display name -> subreddit name (as it appears after "r/") to collect comments from
subreddits = dict(
    [
        ("Premier League", "PremierLeague"),
        ("La Liga", "LaLiga"),
        ("Serie A", "seriea"),
        ("Bundesliga", "Bundesliga"),
    ]
)

# Keyword used when searching each subreddit for VAR-related posts
search_query = "VAR"

In [None]:
# Extract comments from Reddit posts (limited to 2024 onward)
data = {"League": [], "Post Title": [], "Comment": []}  # Column name -> values, one entry per comment

# Unix timestamp for 2024-01-01 00:00:00 UTC. post.created_utc is a UTC epoch value, so the
# cutoff is built with calendar.timegm (UTC) instead of time.mktime (local timezone), which
# would shift the boundary by the machine's UTC offset.
start_2024 = calendar.timegm(time.strptime("2024-01-01", "%Y-%m-%d"))

SEARCH_LIMIT = 10           # Posts requested per subreddit; higher than the cap below to allow for skipped posts
MAX_POSTS_PER_LEAGUE = 8    # Stop after this many qualifying posts per league (edit to 6–10 as needed)
MAX_COMMENTS_PER_POST = 10  # Comments kept per post (edit as needed)

total_post_count = 0  # Qualifying posts summed over every league

for league, subreddit_name in subreddits.items():
    print(f"Processing {league} (r/{subreddit_name})...")
    subreddit = reddit.subreddit(subreddit_name)
    # Search for posts about VAR within the past year
    posts = subreddit.search(search_query, time_filter="year", limit=SEARCH_LIMIT)

    post_count = 0  # Posts from this league that met the 2024 criteria
    for post in posts:
        if post.created_utc < start_2024:  # Post is older than the cutoff
            print(f"Skipped {post.title} (Created: {time.ctime(post.created_utc)}) - Before 2024")
            continue

        # limit=0 discards the "load more comments" placeholders instead of expanding them,
        # so only the comments already fetched with the post are used.
        post.comments.replace_more(limit=0)
        for comment in post.comments.list()[:MAX_COMMENTS_PER_POST]:
            data["League"].append(league)
            data["Post Title"].append(post.title)
            data["Comment"].append(comment.body)
        print(f"Extracted comments from: {post.title} (Created: {time.ctime(post.created_utc)})")

        post_count += 1
        if post_count >= MAX_POSTS_PER_LEAGUE:
            break

    # post_count restarts for every league, so accumulate it to report a true total below
    total_post_count += post_count

# Convert to DataFrame
df = pd.DataFrame(data)
print(f"Total posts processed: {total_post_count}")
print("Sample extracted data:")
print(df.head())

In [None]:
# Build the Hugging Face pipelines: one sentiment classifier and two to-English translators

# Twitter-trained RoBERTa classifier with three classes (negative / neutral / positive)
sentiment_model_id = "cardiffnlp/twitter-roberta-base-sentiment"
# Romance languages (Spanish, Italian, etc.) -> English
romance_to_english_model_id = "Helsinki-NLP/opus-mt-ROMANCE-en"
# German -> English; used as the fallback translator (edit if another language is needed)
german_to_english_model_id = "Helsinki-NLP/opus-mt-de-en"

sentiment_analyzer = pipeline("sentiment-analysis", model=sentiment_model_id)
translator_romance = pipeline("translation", model=romance_to_english_model_id)
translator_general = pipeline("translation", model=german_to_english_model_id)

In [None]:
# Detect language, translate non-English comments, analyze sentiment (POSITIVE, NEUTRAL, NEGATIVE), and add polarity

# langdetect is non-deterministic unless seeded; fix the seed so re-runs give the same languages
DetectorFactory.seed = 0

MAX_CHARS = 512  # Characters (not tokens) of each comment handed to the translation/sentiment models
ROMANCE_LANGS = {"es", "it", "fr", "pt", "ro"}  # Languages routed to translator_romance
# Map model labels (LABEL_0 = NEGATIVE, LABEL_1 = NEUTRAL, LABEL_2 = POSITIVE)
LABEL_MAP = {"LABEL_0": "NEGATIVE", "LABEL_1": "NEUTRAL", "LABEL_2": "POSITIVE"}


def detect_language(text):
    """Return the language code langdetect reports for `text`, or "unknown" if detection fails."""
    try:
        return detect(text)
    except LangDetectException:
        # Raised when langdetect cannot detect a language for the text;
        # the comment is then analyzed untranslated.
        return "unknown"


def translate_to_english(text, lang):
    """Return the text to analyze: an English translation when a matching translator exists.

    Romance-language text goes to `translator_romance` and German text to `translator_general`.
    Any other language (including "en" and "unknown") is returned unchanged, because feeding
    it to the German->English model would produce meaningless output.
    """
    if lang in ROMANCE_LANGS:
        translator = translator_romance
    elif lang == "de":
        translator = translator_general
    else:
        return text

    translated = translator(text[:MAX_CHARS])[0]["translation_text"]
    print(f"Translated '{text[:50]}...' ({lang}) to '{translated[:50]}...'")
    return translated


def analyze_comment(text):
    """Return (text_for_analysis, sentiment, score, polarity) for one comment.

    sentiment is POSITIVE / NEUTRAL / NEGATIVE, score is the classifier's value for that label,
    and polarity is TextBlob's polarity (-1 to +1) of the analyzed text.
    """
    lang = detect_language(text)
    text_for_analysis = translate_to_english(text, lang)
    result = sentiment_analyzer(text_for_analysis[:MAX_CHARS])[0]
    polarity = TextBlob(text_for_analysis).sentiment.polarity
    return text_for_analysis, LABEL_MAP[result["label"]], result["score"], polarity


rows = []  # One (text, sentiment, score, polarity) tuple per comment, in df order
for comment in df["Comment"]:
    comment_str = str(comment)  # Bound before the try so the except branch can always use it
    try:
        # Every value is computed before anything is stored, so a failure part-way through
        # can never leave the result columns with different lengths.
        row = analyze_comment(comment_str)
    except Exception as e:  # Best effort: one bad comment must not abort the whole run
        # NaN (not 0.0) marks "no result", so failed rows are not mistaken for neutral values
        row = (comment_str, "ERROR", float("nan"), float("nan"))
        print(f"Error processing comment: {comment_str[:50]}... | {e}")
    rows.append(row)

translated_comments = [row[0] for row in rows]
sentiments = [row[1] for row in rows]
scores = [row[2] for row in rows]
polarities = [row[3] for row in rows]

# Add results to DataFrame
df["Translated Comment"] = translated_comments  # The text actually analyzed (translated or original)
df["Sentiment"] = sentiments
df["Sentiment Score"] = scores
df["Polarity"] = polarities  # Polarity (-1 to +1); NaN where analysis failed

In [None]:
# Install and import visualization libraries
!pip install matplotlib seaborn  # Install Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Apply Seaborn's "whitegrid" style to the figures drawn from here on
sns.set(style="whitegrid")

# Plot 1: grouped bars — number of comments per league, split by sentiment label
counts_fig, counts_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="League", hue="Sentiment", palette="viridis", ax=counts_ax)
counts_ax.set_title("Sentiment Counts by League (2024)", fontsize=14)
counts_ax.set_xlabel("League", fontsize=12)
counts_ax.set_ylabel("Number of Comments", fontsize=12)
counts_ax.legend(title="Sentiment")
plt.setp(counts_ax.get_xticklabels(), rotation=45)  # Tilt league names
counts_fig.tight_layout()
plt.show()

# Plot 2: spread of the classifier's score, one box per sentiment label
score_fig, score_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x="Sentiment", y="Sentiment Score", palette="viridis", ax=score_ax)
score_ax.set_title("Sentiment Score Distribution (2024)", fontsize=14)
score_ax.set_xlabel("Sentiment", fontsize=12)
score_ax.set_ylabel("Sentiment Score", fontsize=12)
score_fig.tight_layout()
plt.show()

# Plot 3: Pie charts for each league (two per row)
leagues = df["League"].unique()
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Negative (blue), Neutral (orange), Positive (green)
labels = ["Negative", "Neutral", "Positive"]

# Size the grid from the leagues actually present instead of assuming exactly four:
# four leagues still give the original 2x2 / (12, 10) layout.
n_cols = 2
n_rows = max(1, -(-len(leagues) // n_cols))  # Ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)  # squeeze=False keeps axes 2-D
fig.suptitle("Sentiment Distribution by League (2024)", fontsize=16)

flat_axes = axes.ravel()
for ax, league in zip(flat_axes, leagues):
    league_data = df.loc[df["League"] == league, "Sentiment"].value_counts()
    # Rows labelled anything else (e.g. "ERROR") are left out of the pie
    sizes = [league_data.get(key, 0) for key in ("NEGATIVE", "NEUTRAL", "POSITIVE")]
    ax.set_title(league, fontsize=12)
    if sum(sizes) == 0:
        # A pie cannot be drawn from all-zero sizes; show a note instead of failing
        ax.text(0.5, 0.5, "No classified comments", ha="center", va="center", transform=ax.transAxes)
        ax.axis("off")
        continue
    ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)

# Hide grid cells that have no league (e.g. when a subreddit returned no comments)
for unused_ax in flat_axes[len(leagues):]:
    unused_ax.axis("off")

plt.tight_layout(rect=[0, 0, 1, 0.95])  # Leave room at the top for the suptitle
plt.show()

# Plot 4: spread of TextBlob polarity, one box per league
polarity_fig, polarity_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="League", y="Polarity", palette="viridis", ax=polarity_ax)
polarity_ax.set_title("Polarity Distribution by League (2024)", fontsize=14)
polarity_ax.set_xlabel("League", fontsize=12)
polarity_ax.set_ylabel("Polarity (-1 to +1)", fontsize=12)
polarity_ax.axhline(0, color='gray', linestyle='--', linewidth=1)  # Reference line at neutral polarity
plt.setp(polarity_ax.get_xticklabels(), rotation=45)  # Tilt league names
polarity_fig.tight_layout()
plt.show()

In [None]:
# Write the annotated comments to a CSV file on the mounted Drive, then preview the first rows
output_csv_path = "/content/drive/My Drive/Colab Notebooks/Project3/var_sentiment_2024.csv"  # Edit the folder if needed
df.to_csv(output_csv_path, index=False)  # index=False: do not write the row index as a column

for line in (
    f"Sentiment analysis results saved to {output_csv_path}",
    "Sample results:",
    df.head(),
):
    print(line)