**Here we combine the data gathered from all sources (audio, web, and Reddit) to determine the overall sentiment.**

In [None]:
# Install required libraries if not already installed
!pip install wordcloud
!pip install googletrans==3.1.0a0  # For translation in word cloud

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from wordcloud import WordCloud
from googletrans import Translator, LANGUAGES
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Mount Google Drive at /content/drive so the project files stored there
# can be read and written from this notebook.
drive.mount('/content/drive')

# Set the path to your files
# Base folder for the input CSVs and the combined output CSV. File names are
# appended with plain string concatenation, so the trailing slash is required.
path = "/content/drive/My Drive/Colab Notebooks/Project3/"

In [None]:
# Source name -> CSV location (edit the file names to match your Drive folder).
# The keys double as the source identifiers attached to each row later on.
file_names = dict(
    audio=path + "audio_data.csv",
    web=path + "web_data.csv",
    reddit=path + "reddit_data.csv",
)

# The shared schema: only these columns are carried over from each source.
columns_to_keep = [
    'league_name',
    'comment',
    'sentiment_label',
    'sentiment_score',
]

# Filled by the loading loop: source name -> filtered dataframe.
dfs = dict()

# Load every source CSV, trim it to the shared columns, and tag each row with
# the source it came from. Load problems are reported and the source skipped,
# so one bad file does not stop the others from loading.
for source, file_path in file_names.items():
    try:
        df = pd.read_csv(file_path)

        # Split the wanted columns into those present in this file and those absent
        available_cols = []
        missing_cols = []
        for col in columns_to_keep:
            if col in df.columns:
                available_cols.append(col)
            else:
                missing_cols.append(col)

        if missing_cols:
            print(f"Warning: {source} data missing columns: {missing_cols}")

        # Column selection returns a new frame; assign() adds the source tag
        # (e.g., "Audio", "Web", "Reddit") without touching the frame read from disk
        df_filtered = df[available_cols].assign(source=source.capitalize())
        dfs[source] = df_filtered
    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
    except Exception as e:
        print(f"Error loading {source} data: {e!s}")

# Combine available dataframes and transform sentiment scores
if dfs:
    combined_df = pd.concat(dfs.values(), ignore_index=True)

    # Ensure sentiment_score is numeric; values that cannot be parsed become NaN
    combined_df['sentiment_score'] = pd.to_numeric(combined_df['sentiment_score'], errors='coerce')

    # The rescaling below is only meaningful for scores on a 0-1 scale, so
    # report (without stopping) any non-missing value outside that range.
    out_of_range = ~combined_df['sentiment_score'].between(0, 1) & combined_df['sentiment_score'].notna()
    if out_of_range.any():
        print(f"Warning: {int(out_of_range.sum())} sentiment scores fall outside the expected 0-1 range")

    # Transform 0-1 range to -1-1 range: (score * 2) - 1
    combined_df['sentiment_score'] = (combined_df['sentiment_score'] * 2) - 1

    # If sentiment_label is "NEGATIVE", make the score negative.
    # astype(str) keeps the comparison working when the label column holds
    # non-string values or is entirely empty (the .str accessor would raise),
    # and strip() lets labels with stray whitespace still match.
    is_negative = combined_df['sentiment_label'].astype(str).str.strip().str.upper() == 'NEGATIVE'
    combined_df.loc[is_negative, 'sentiment_score'] = -combined_df.loc[is_negative, 'sentiment_score'].abs()

    # Reclassify sentiment labels based on transformed scores to ensure consistency
    def classify_sentiment(score):
        """Return 'NEGATIVE', 'NEUTRAL' or 'POSITIVE' for a score on the -1 to 1 scale.

        Scores below -0.25 are NEGATIVE, scores above 0.25 are POSITIVE, and
        everything else -- including missing (NaN) scores -- is NEUTRAL.
        """
        if pd.isna(score):
            return 'NEUTRAL'  # Handle NaN scores
        elif score < -0.25:
            return 'NEGATIVE'
        elif score > 0.25:
            return 'POSITIVE'
        else:
            return 'NEUTRAL'

    # Apply classification to update Sentiment labels.
    # Note: this overwrites the labels that were read from the source files.
    combined_df['sentiment_label'] = combined_df['sentiment_score'].apply(classify_sentiment)

    # Rename columns for clarity
    combined_df = combined_df.rename(columns={
        'league_name': 'League',
        'comment': 'Sentence',
        'sentiment_label': 'Sentiment',
        'sentiment_score': 'Score',
        'source': 'Source'
    })

    # Save combined dataframe with all columns
    combined_output_path = path + "finalproject.csv"
    combined_df.to_csv(combined_output_path, index=False)
    print(f"Combined CSV saved as '{combined_output_path}'")
    print(f"Total rows in combined data: {len(combined_df)}")
    print("Columns in combined CSV:", combined_df.columns.tolist())

    # Display first few rows for verification
    print("\nSample of combined data:")
    print(combined_df.head())
else:
    print("No dataframes were successfully loaded.")

In [None]:
# Per-league summary tables. Everything here needs combined_df from the
# combining cell, so bail out with a message when it has not been created.
if 'combined_df' not in globals():
    print("Error: Combined dataframe not available. Check previous cell for issues.")
else:
    # Note: this reuses the name of the upper-case classifier defined in the
    # combining cell and replaces it for the rest of the session.
    def classify_sentiment(score):
        """Bucket a score: below -0.25 'Negative', above 0.25 'Positive', otherwise 'Neutral'."""
        if score < -0.25:
            return 'Negative'
        if score > 0.25:
            return 'Positive'
        return 'Neutral'

    # Mean score per league, with the matching sentiment bucket alongside
    league_groups = combined_df.groupby('League')
    avg_sentiment = league_groups['Score'].mean().reset_index()
    avg_sentiment['sentiment_class'] = avg_sentiment['Score'].apply(classify_sentiment)
    print("\nAverage Sentiment Score per League (-1 to 1 Scale):")
    print(avg_sentiment)

    # League x sentiment-label table of row counts (absent combinations -> 0)
    label_groups = combined_df.groupby(['League', 'Sentiment'])
    sentiment_counts = label_groups.size().unstack(fill_value=0)
    print("\nSentiment Label Counts per League:")
    print(sentiment_counts)

    # Number of rows contributed by each league
    total_entries_per_league = league_groups.size().reset_index(name='Total Entries')
    print("\nTotal Entries per League:")
    print(total_entries_per_league)

    # Smallest and largest score present in the combined data
    all_scores = combined_df['Score']
    min_score, max_score = all_scores.min(), all_scores.max()
    print(f"\nSentiment Score Range: {min_score} to {max_score}")

In [None]:
# This cell draws from combined_df plus the two summary tables built in the
# summary cell (avg_sentiment, sentiment_counts); check for all three so a
# skipped cell gives a clear message instead of a NameError.
required_names = ('combined_df', 'avg_sentiment', 'sentiment_counts')
missing_names = [name for name in required_names if name not in globals()]

if not missing_names:
    # Set Seaborn palette
    sns.set_palette("husl")

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))

    # Bar plot for average sentiment scores with color coding.
    # The palette is keyed by class name: a plain list of colors is handed out
    # in the order the classes first appear in the data, which would paint
    # e.g. 'Positive' red whenever the first league happens to be positive.
    palette = {
        'Negative': '#ff4d4d',  # red
        'Neutral': '#cccccc',   # gray
        'Positive': '#4dff4d',  # green
    }
    # Only list classes that actually occur, in a fixed legend order
    observed_classes = set(avg_sentiment['sentiment_class'])
    hue_order = [label for label in palette if label in observed_classes]
    sns.barplot(x='League', y='Score', hue='sentiment_class', hue_order=hue_order,
                dodge=False, palette=palette, data=avg_sentiment, ax=ax1)
    ax1.set_title('Average Sentiment Score by League (VAR)', fontsize=14, pad=15)
    ax1.set_xlabel('League', fontsize=12)
    ax1.set_ylabel('Average Sentiment Score (-1 to 1)', fontsize=12)
    ax1.tick_params(axis='x', rotation=45)
    ax1.axhline(0, color='gray', linestyle='--', linewidth=1)  # Neutral line at 0
    ax1.set_ylim(-1, 1)  # Set y-axis limits to match -1 to 1 range
    ax1.legend(title='Sentiment', loc='best')

    # Add value labels at the end of each bar (above positive, below negative).
    # Leagues whose average is NaN have no bar, so they get no label.
    for i, v in enumerate(avg_sentiment['Score']):
        if pd.isna(v):
            continue
        ax1.text(i, v, f'{v:.2f}', ha='center', va='bottom' if v >= 0 else 'top')

    # Heatmap for sentiment label counts
    sns.heatmap(sentiment_counts, annot=True, fmt='d', cmap='YlOrRd', ax=ax2)
    ax2.set_title('Sentiment Label Distribution by League', fontsize=14, pad=15)
    ax2.set_xlabel('Sentiment Label', fontsize=12)
    ax2.set_ylabel('League', fontsize=12)

    # Adjust layout and display
    plt.tight_layout()
    plt.show()
elif 'combined_df' in missing_names:
    print("Error: Cannot visualize data. Combined dataframe not available.")
else:
    print(f"Error: Cannot visualize data. Run the summary cell first (missing: {', '.join(missing_names)}).")

In [None]:
# Initialize the translator
translator = Translator()

# Check if combined_df exists
if 'combined_df' in globals():
    def translate_to_english(text):
        """Return `text` in English when it is confidently detected as another language.

        The original text is returned unchanged when it is blank, detected as
        English, detected with confidence of 0.8 or less (or no confidence at
        all), or when the translation service raises an error.
        """
        # Blank text has nothing to detect; skip the network round-trip
        if not str(text).strip():
            return text
        try:
            # Detect language
            detection = translator.detect(text)
            confidence = detection.confidence
            # NOTE(review): confidence may come back as None on some googletrans
            # releases -- confirm for the pinned version. Treat it as low
            # confidence rather than letting the comparison raise for every row.
            is_confident = confidence is not None and confidence > 0.8
            if detection.lang != 'en' and is_confident:
                translated = translator.translate(text, dest='en')
                return translated.text
            return text  # Return original if English or low confidence
        except Exception as e:
            print(f"Translation error: {e}")
            return text  # Return original on error

    # Sentences (formerly 'comment') as strings, without missing values
    sentences = combined_df['Sentence'].dropna().astype(str)

    # Each translation is a network call, so translate every distinct sentence
    # once and map the results back onto the full column.
    translation_by_sentence = {
        sentence: translate_to_english(sentence)
        for sentence in sentences.drop_duplicates()
    }
    translated_sentences = sentences.map(translation_by_sentence)

    # Combine all translated sentences into a single string
    all_sentences = ' '.join(translated_sentences)

    if all_sentences.strip():
        # Generate the word cloud with translated text
        wordcloud = WordCloud(width=800, height=400,
                              background_color='white',
                              min_font_size=10,
                              max_words=100).generate(all_sentences)

        # Display the word cloud
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of VAR Sentiments', fontsize=14, pad=15)
        plt.show()
    else:
        # WordCloud.generate raises when given no words; report it instead
        print("Error: Cannot generate word cloud. No sentence text available.")
else:
    print("Error: Cannot generate word cloud. Combined dataframe not available.")

In [None]:
# Draw the analysis pipeline as a vertical flow chart: one rounded box per
# stage, top to bottom, with a downward arrow between consecutive boxes.
fig, ax = plt.subplots(figsize=(12, 8))

# The seven pipeline stages, in order
steps = [
    "1. Data Ingestion\n(Audio, Web, Reddit)",
    "2. Data Preprocessing\n(Clean & Split)",
    "3. Language Standardization\n(Translate to English)",
    "4. Sentiment Analysis\n(Predict Labels & Scores)",
    "5. Result Aggregation\n(Combine Sources)",
    "6. Evaluation\n(Metrics & Validation)",
    "7. Output & Visualization\n(CSV, Plots, Word Cloud)"
]

# All boxes share one horizontal centre; each stage has its own height
x_position = 0.5
y_positions = [0.9, 0.75, 0.6, 0.45, 0.3, 0.15, 0.0]

# Stage boxes
box_style = dict(facecolor='lightblue', edgecolor='black', boxstyle='round,pad=0.5')
for step, y in zip(steps, y_positions):
    ax.text(x_position, y, step, ha='center', va='center', fontsize=11, bbox=box_style)

# Connectors: one arrow leaving every stage except the last
for y in y_positions[:-1]:
    ax.arrow(x_position, y - 0.05, 0, -0.08,
             head_width=0.02, head_length=0.02, fc='black', ec='black')

# Title, fixed canvas extents, and no axis decorations
ax.set_title('Sentiment Analysis Pipeline for VAR Opinions', fontsize=16, pad=20)
ax.set_xlim(0, 1)
ax.set_ylim(-0.1, 1)
ax.axis('off')  # Hide axes
plt.show()

In [None]:
def predict_label(score, threshold=0.1):
    """Predict a sentiment label from a sentiment score (-1 to 1).

    Args:
        score: Numeric sentiment score.
        threshold: Half-width of the neutral band around zero. Scores strictly
            below -threshold are 'NEGATIVE', strictly above +threshold are
            'POSITIVE', and everything else is 'NEUTRAL'.

    Returns:
        One of 'NEGATIVE', 'NEUTRAL' or 'POSITIVE'.
    """
    if score < -threshold:
        return 'NEGATIVE'
    elif score > threshold:
        return 'POSITIVE'
    else:
        return 'NEUTRAL'


# Check if combined_df exists
if 'combined_df' in globals():
    # Drop rows with missing labels or scores
    df_eval = combined_df.dropna(subset=['Sentiment', 'Score'])

    if df_eval.empty:
        # The metrics and the split below cannot run on an empty set
        print("Error: No rows with both a sentiment label and a score to evaluate.")
    else:
        class_labels = ['NEGATIVE', 'NEUTRAL', 'POSITIVE']

        # True labels (standardize to uppercase for consistency).
        # NOTE(review): in the combining cell 'Sentiment' is itself computed
        # from 'Score' (cut-offs at +/-0.25), so these metrics compare two
        # thresholdings of the same number rather than predictions against
        # independent ground truth. Confirm whether independently obtained
        # labels should be used here instead.
        y_true = df_eval['Sentiment'].str.upper()

        # Predicted labels from sentiment scores
        y_pred = df_eval['Score'].apply(predict_label)

        # Calculate basic metrics (weighted averages across the classes;
        # zero_division=0 scores a class with no predictions as 0)
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

        # Print metrics
        print("Classification Metrics:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision (weighted): {precision:.4f}")
        print(f"Recall (weighted): {recall:.4f}")
        print(f"F1 Score (weighted): {f1:.4f}")

        # Confusion Matrix (rows: true label, columns: predicted label)
        cm = confusion_matrix(y_true, y_pred, labels=class_labels)
        print("\nConfusion Matrix:")
        print(cm)

        # Display Confusion Matrix
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
        disp.plot(cmap='Blues')
        plt.title('Confusion Matrix for Sentiment Classification')
        plt.show()

        # Validation Score (using train-test split).
        # No model is fitted here: the same threshold rule is simply scored on
        # a held-out 20% sample. The split needs at least two rows.
        if len(df_eval) >= 2:
            X = df_eval[['Score']]  # Features (just the score for simplicity)
            y = y_true
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            # Predict on test set
            y_test_pred = X_test['Score'].apply(predict_label)

            # Validation accuracy
            validation_accuracy = accuracy_score(y_test, y_test_pred)
            print(f"\nValidation Accuracy (20% test split): {validation_accuracy:.4f}")
        else:
            print("\nValidation Accuracy skipped: at least 2 rows are needed for a train/test split.")
else:
    print("Error: Combined dataframe not available. Run previous cells first.")