In [18]:
# we need to save Reddit data to your Google Drive, so connect that first
# the data is preprocessed weather related posts/comments from 2 state
# subreddits for each of Cold, Temperate, and Hot regions

from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/
!git clone --no-checkout https://github.com/SYCqi/CIS-400-Social-Media-Data-Mining-Project.git
%cd /content/drive/MyDrive/CIS-400-Social-Media-Data-Mining-Project
!git sparse-checkout init --cone
!git sparse-checkout set ML and roBERTa Sentiment Analysis/preprocessedData
!git checkout
%cd ML_Manu/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive
fatal: destination path 'CIS-400-Social-Media-Data-Mining-Project' already exists and is not an empty directory.
/content/drive/MyDrive/CIS-400-Social-Media-Data-Mining-Project
Your branch is up to date with 'origin/main'.
/content/drive/MyDrive/CIS-400-Social-Media-Data-Mining-Project/ML and roBERTa Sentiment Analysis


In [None]:
# This script walks through the SQLite .db files in the preprocessedData
# directory, truncates each text to fit the roBERTa model, and performs
# sentiment analysis on it.

import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, pipeline

# Hugging Face checkpoint used for the negative/neutral/positive classification.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# Tokenizer loaded from the same checkpoint as the classifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sentiment pipeline built from that checkpoint; created once at module level
# so the model is not re-loaded for every text that gets scored.
sentiment_analyzer = pipeline("sentiment-analysis", model=model_name)

def truncate_and_analyze_sentiment(text, max_length=512):
    """Classify one text as negative, neutral, or positive.

    roBERTa has a maximum input length of 512 tokens. Ideally a sliding
    window would be used to cover long texts, but for expediency's sake the
    input is simply truncated.

    Args:
        text: The post/comment body to classify.
        max_length: Maximum number of tokens handed to the model.

    Returns:
        A one-entry dict mapping the predicted sentiment name ("negative",
        "neutral", "positive", or "unknown" for an unrecognised label) to the
        score the model assigned to that label.
    """
    # Let the pipeline truncate while it tokenizes. Truncating token ids,
    # decoding them back to a string and tokenizing that string again is not
    # guaranteed to reproduce the same tokens, so the second pass could end up
    # longer than the limit (and the decoded text may differ from the input).
    result = sentiment_analyzer(text, truncation=True, max_length=max_length)

    # One input text -> a list holding a single {"label", "score"} dict.
    sentiment = result[0]

    # Map the checkpoint's generic label ids to readable sentiment names.
    label_mapping = {"LABEL_0": "negative", "LABEL_1": "neutral", "LABEL_2": "positive"}

    return {label_mapping.get(sentiment["label"], "unknown"): sentiment["score"]}

def process_database(file_path):
    """Load the texts from one SQLite file and score each one for sentiment.

    Args:
        file_path: Path to a SQLite database containing a ``RedditPosts``
            table with a ``TEXT`` column.

    Returns:
        A DataFrame with the non-null, non-blank ``TEXT`` rows plus
        ``Positive``, ``Neutral`` and ``Negative`` columns. For each row only
        the predicted label's column holds the model score; the other two
        columns are 0.
    """
    conn = sqlite3.connect(file_path)
    try:
        # our data is in RedditPosts
        data = pd.read_sql_query("SELECT TEXT FROM RedditPosts", conn)
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

    # preprocessedData should already be good, but drop nulls and blank rows
    # anyway. The explicit copy keeps the column assignments below from
    # writing into a slice of the original frame.
    texts = data['TEXT']
    has_content = texts.notnull() & (texts.str.strip() != "")
    data = data[has_content].copy()

    # Analyze sentiment w/ Hugging Face model, one text at a time.
    sentiments = [truncate_and_analyze_sentiment(text) for text in data['TEXT']]

    # Each result dict holds a single label, so the missing labels become 0.
    data['Positive'] = [s.get('positive', 0) for s in sentiments]
    data['Neutral'] = [s.get('neutral', 0) for s in sentiments]
    data['Negative'] = [s.get('negative', 0) for s in sentiments]

    return data

def process_directory(directory_path):
    """Score every ``.db`` file in a directory and plot the results.

    Each file is expected to be named ``<region>_<weather>.db``. A chart pair
    is produced per file and one more for all files combined.

    Args:
        directory_path: Directory containing the SQLite ``.db`` files.

    Returns:
        The combined DataFrame of all processed files (with ``Region`` and
        ``Weather`` columns added), or an empty DataFrame when the directory
        contains no ``.db`` files.
    """
    frames = []

    # Sorted so the processing order does not depend on the file system.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith(".db"):  # Only process .db files
            continue

        db_path = os.path.join(directory_path, file_name)
        print(f"Processing file: {file_name}")

        # Process the database and get sentiment results
        region_data = process_database(db_path)

        # Derive region and weather type from the file name. Only the trailing
        # extension is stripped, and partition() tolerates names with no
        # underscore (weather becomes "") or with several (everything after
        # the first one is the weather) instead of raising ValueError.
        stem = file_name[:-len(".db")]
        region, _, weather = stem.partition("_")
        region_data['Region'] = region
        region_data['Weather'] = weather

        frames.append(region_data)

        # Plot results for the individual file.
        # NOTE(review): the charts are named after the region only, so two
        # files sharing a region prefix write to the same PNG files -- confirm
        # that is intended.
        plot_sentiments(region_data, region)

    if not frames:
        # Nothing to aggregate; plotting an empty frame would fail.
        print(f"No .db files found in {directory_path}")
        return pd.DataFrame()

    # Concatenate once instead of growing a DataFrame inside the loop.
    all_data = pd.concat(frames, ignore_index=True)

    # Plot the aggregated results for all regions
    plot_sentiments(all_data, "All Regions")

    return all_data

def plot_sentiments(data, region_name):
    """Draw and save a bar chart and a pie chart of the mean sentiment scores.

    Both charts are written to the current working directory as
    ``<region_name>_sentiment_bar_chart.png`` and
    ``<region_name>_sentiment_pie_chart.png`` and are also displayed.

    Args:
        data: DataFrame with ``Positive``, ``Neutral`` and ``Negative`` columns.
        region_name: Label used in the chart titles and the output file names.
    """
    if data.empty:
        # The mean of no rows is NaN (and a frame without columns has nothing
        # to select), so there is nothing meaningful to draw.
        print(f"No sentiment data to plot for {region_name}")
        return

    # Aggregate scores by sentiment
    total_sentiments = data[['Positive', 'Neutral', 'Negative']].mean()

    # capitalize() upper-cases the first letter and lower-cases the rest.
    cap_name = region_name.capitalize()
    sentiment_colors = ['green', 'grey', 'red']

    # Bar chart of the average score per sentiment.
    fig, ax = plt.subplots(figsize=(8, 6))
    total_sentiments.plot(kind='bar', ax=ax, color=sentiment_colors, alpha=0.7, rot=0)
    ax.set_title(f"Average Sentiment Strength Scores for {cap_name} Region")
    ax.set_ylabel("Score")
    ax.set_xlabel("Sentiment Type")
    fig.savefig(f"{region_name}_sentiment_bar_chart.png")
    plt.show()
    # This function runs once per file, so release each figure when done.
    plt.close(fig)

    # Pie chart of the same averages as shares of their sum.
    fig, ax = plt.subplots(figsize=(6, 6))
    total_sentiments.plot(kind='pie', ax=ax, autopct='%1.1f%%', colors=sentiment_colors, startangle=90)
    ax.set_title(f"Sentiment Distribution for {cap_name} Region")
    ax.set_ylabel("")  # Hide y-axis label
    fig.savefig(f"{region_name}_sentiment_pie_chart.png")
    plt.show()
    plt.close(fig)

# Entry point: run the whole analysis when this cell/script is executed directly.
if __name__ == "__main__":
    # Relative path, so it is resolved against the current working directory.
    directory_path = "preprocessedData"  # replace if using a different dir
    process_directory(directory_path)

Processing file: cold_cloudy.db
