<a href="https://colab.research.google.com/github/Ranger3560/SentimentAnalysis/blob/main/XLM_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
#Corteva Sentiment Analysis using XLM-RoBERTa Fine-tuned Model
import csv
from transformers import pipeline
from datetime import datetime
import re

# Build the Hugging Face sentiment classifier once at module level so every
# call to predict_sentiment reuses the same pipeline object.
_sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
)

def predict_sentiment(text: str):
    """Classify *text* and map the model confidence onto a 0-1 sentiment scale.

    The predicted label selects a band and the model confidence positions the
    score inside it:

    * POSITIVE: ``0.7 + 0.3 * confidence``  (0.7 up to 1.0)
    * NEUTRAL:  ``0.5 + 0.2 * confidence``  (0.5 up to 0.7)
    * NEGATIVE: ``0.5 - 0.5 * confidence``  (0.5 down to 0.0)

    Args:
        text: Review text. ``None`` and blank strings are treated as an empty
            review; any other non-string value is converted with ``str``.

    Returns:
        A dict with ``"label"`` (``"POSITIVE"``, ``"NEUTRAL"``, ``"NEGATIVE"``
        or ``"ERROR"``) and ``"score"`` (float clamped to [0, 1]). Empty
        reviews and unrecognised labels yield NEUTRAL with score 0.6. Any
        failure during inference is printed and yields ERROR with score 0.0.
    """
    # Normalise the input before entering the try block: the error handler
    # slices `text`, so a non-string value would otherwise raise a second
    # exception from inside the handler and escape the function.
    if text is None:
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    try:
        if not text.strip():
            # Assign neutral sentiment for empty reviews
            return {"label": "NEUTRAL", "score": 0.6}  # Default score in neutral range

        # Truncate to 512 tokens (the XLM-RoBERTa input limit) so that long
        # reviews are scored instead of failing into the ERROR branch.
        result = _sentiment_pipeline(text, truncation=True, max_length=512)[0]
        raw_label = result['label'].upper()
        confidence_score = result['score']  # Confidence of the predicted label

        # Accept both named labels and generic LABEL_n identifiers.
        label_map = {
            "NEGATIVE": "NEGATIVE",
            "NEUTRAL": "NEUTRAL",
            "POSITIVE": "POSITIVE",
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE",
        }
        mapped_label = label_map.get(raw_label, raw_label)

        # Place the confidence inside the band reserved for the label.
        if mapped_label == "POSITIVE":
            normalized_score = 0.7 + (confidence_score * 0.3)
        elif mapped_label == "NEUTRAL":
            normalized_score = 0.5 + (confidence_score * 0.2)
        elif mapped_label == "NEGATIVE":
            normalized_score = 0.5 - (confidence_score * 0.5)
        else:
            # Unknown label: fall back to the same default used for empty text.
            mapped_label = "NEUTRAL"
            normalized_score = 0.6

        # Ensure the score is within the 0 to 1 range
        normalized_score = max(0.0, min(1.0, normalized_score))

        return {"label": mapped_label, "score": normalized_score}
    except Exception as e:
        # Deliberate best-effort: one bad review must not abort a whole batch.
        print(f"Error analyzing sentiment for text '{text[:50]}...': {e}")
        return {"label": "ERROR", "score": 0.0}

def load_reviews(file_path: str) -> list[dict]:
    """Load reviews from a CSV file into a list of dicts with snake_case keys.

    The columns ``Date``, ``Verified``, ``ProductID``, ``Rating``, ``Title``,
    ``ReviewText`` and ``ReviewID`` are read; other columns are ignored. A
    column that is absent from the header, or a field missing from a short
    row, is returned as an empty string.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        One dict per data row with the keys ``date``, ``verified``,
        ``product_id``, ``rating``, ``title``, ``review_text`` and
        ``review_id``. If the file is missing the list is empty; if reading
        fails part-way, the rows read so far are returned. Both cases print a
        message instead of raising.
    """
    # Input column name -> key used in the returned dicts (order preserved).
    column_to_key = {
        "Date": "date",
        "Verified": "verified",
        "ProductID": "product_id",
        "Rating": "rating",
        "Title": "title",
        "ReviewText": "review_text",
        "ReviewID": "review_id",
    }
    reviews = []
    try:
        # utf-8-sig strips a leading BOM (otherwise the first header would be
        # read as "\ufeffDate"); newline="" is what the csv module requires
        # for correct handling of newlines embedded in quoted fields.
        with open(file_path, "r", encoding="utf-8-sig", newline="") as file:
            for row in csv.DictReader(file):
                # `or ""`: DictReader stores None for fields missing from a
                # short row, so a plain .get(column, "") default is not enough.
                reviews.append({
                    key: row.get(column) or ""
                    for column, key in column_to_key.items()
                })
        print(f"Successfully loaded {len(reviews)} reviews from {file_path}")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
    return reviews

def analyze_reviews(input_file: str, output_file: str):
    """Score every review in *input_file* and write the results to *output_file*.

    Reviews are loaded with ``load_reviews``; each review text is
    whitespace-normalised and stripped of double quotes before being passed to
    ``predict_sentiment``. The output CSV repeats the input columns (with the
    original, uncleaned review text) and appends ``SentimentLabel`` and
    ``SentimentScore`` (formatted to four decimals).

    Args:
        input_file: Path of the CSV file containing the reviews.
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        None. If no reviews are loaded, a message is printed and no output
        file is written.
    """
    reviews = load_reviews(input_file)

    if not reviews:
        print(f"No reviews found in {input_file}. Exiting analysis.")
        return

    # Output columns: the input columns plus the two sentiment fields.
    fieldnames = [
        "Date", "Verified", "ProductID", "Rating", "Title",
        "ReviewText", "ReviewID", "SentimentLabel", "SentimentScore"
    ]

    with open(output_file, "w", encoding="utf-8", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n--- Product Review Analysis - Start Time: {current_date_time} ---\n")
        print(f"Processing {len(reviews)} reviews...")

        for review in reviews:
            # A review loaded from a short CSV row can carry None instead of a
            # string; coerce it so the cleaning step below cannot crash and
            # leave a half-written output file.
            text = review.get("review_text") or ""

            # Basic text cleaning (collapse whitespace, drop double quotes)
            clean_text = ' '.join(text.split()).replace('"', '')

            # Predict sentiment and get normalized score
            sentiment_result = predict_sentiment(clean_text)

            # Write the original (uncleaned) text alongside the prediction.
            writer.writerow({
                "Date": review.get("date", ""),
                "ProductID": review.get("product_id", ""),
                "Verified": review.get("verified", ""),
                "Rating": review.get("rating", ""),
                "Title": review.get("title", ""),
                "ReviewText": text,
                "ReviewID": review.get("review_id", ""),
                "SentimentLabel": sentiment_result["label"],
                "SentimentScore": f"{sentiment_result['score']:.4f}"
            })

    print(f"\nAnalysis complete. Results saved to '{output_file}'")

if __name__ == "__main__":
    # Script entry point: score the Corteva dataset and write the result CSV
    # to a path relative to the current working directory.
    input_file, output_file = (
        "/content/Corteva_Extrapolated_Dataset.csv",
        "XLM_Corteva_Output_Sentiment_Analysis.csv",
    )
    analyze_reviews(input_file=input_file, output_file=output_file)

Device set to use cpu


Successfully loaded 1000 reviews from /content/Corteva_Extrapolated_Dataset.csv

--- Product Review Analysis - Start Time: 2025-07-12 12:11:59 ---

Processing 1000 reviews...

Analysis complete. Results saved to 'XLM_Corteva_Output_Sentiment_Analysis.csv'


In [6]:
#Syngenta Sentiment Analysis using XLM-RoBERTa Fine-tuned Model
import csv
from transformers import pipeline
from datetime import datetime
import re

# Build the Hugging Face sentiment classifier once at module level so every
# call to predict_sentiment reuses the same pipeline object.
_sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual",
)

def predict_sentiment(text: str):
    """Classify *text* and map the model confidence onto a 0-1 sentiment scale.

    The predicted label selects a band and the model confidence positions the
    score inside it:

    * POSITIVE: ``0.7 + 0.3 * confidence``  (0.7 up to 1.0)
    * NEUTRAL:  ``0.5 + 0.2 * confidence``  (0.5 up to 0.7)
    * NEGATIVE: ``0.5 - 0.5 * confidence``  (0.5 down to 0.0)

    Args:
        text: Review text. ``None`` and blank strings are treated as an empty
            review; any other non-string value is converted with ``str``.

    Returns:
        A dict with ``"label"`` (``"POSITIVE"``, ``"NEUTRAL"``, ``"NEGATIVE"``
        or ``"ERROR"``) and ``"score"`` (float clamped to [0, 1]). Empty
        reviews and unrecognised labels yield NEUTRAL with score 0.6. Any
        failure during inference is printed and yields ERROR with score 0.0.
    """
    # Normalise the input before entering the try block: the error handler
    # slices `text`, so a non-string value would otherwise raise a second
    # exception from inside the handler and escape the function.
    if text is None:
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    try:
        if not text.strip():
            # Assign neutral sentiment for empty reviews
            return {"label": "NEUTRAL", "score": 0.6}  # Default score in neutral range

        # Truncate to 512 tokens (the XLM-RoBERTa input limit) so that long
        # reviews are scored instead of failing into the ERROR branch.
        result = _sentiment_pipeline(text, truncation=True, max_length=512)[0]
        raw_label = result['label'].upper()
        confidence_score = result['score']  # Confidence of the predicted label

        # Accept both named labels and generic LABEL_n identifiers.
        label_map = {
            "NEGATIVE": "NEGATIVE",
            "NEUTRAL": "NEUTRAL",
            "POSITIVE": "POSITIVE",
            "LABEL_0": "NEGATIVE",
            "LABEL_1": "NEUTRAL",
            "LABEL_2": "POSITIVE",
        }
        mapped_label = label_map.get(raw_label, raw_label)

        # Place the confidence inside the band reserved for the label.
        if mapped_label == "POSITIVE":
            normalized_score = 0.7 + (confidence_score * 0.3)
        elif mapped_label == "NEUTRAL":
            normalized_score = 0.5 + (confidence_score * 0.2)
        elif mapped_label == "NEGATIVE":
            normalized_score = 0.5 - (confidence_score * 0.5)
        else:
            # Unknown label: fall back to the same default used for empty text.
            mapped_label = "NEUTRAL"
            normalized_score = 0.6

        # Ensure the score is within the 0 to 1 range
        normalized_score = max(0.0, min(1.0, normalized_score))

        return {"label": mapped_label, "score": normalized_score}
    except Exception as e:
        # Deliberate best-effort: one bad review must not abort a whole batch.
        print(f"Error analyzing sentiment for text '{text[:50]}...': {e}")
        return {"label": "ERROR", "score": 0.0}

def load_reviews(file_path: str) -> list[dict]:
    """Load reviews from a CSV file into a list of dicts with snake_case keys.

    The columns ``Date``, ``Verified``, ``ProductID``, ``Rating``, ``Title``,
    ``ReviewText`` and ``ReviewID`` are read; other columns are ignored. A
    column that is absent from the header, or a field missing from a short
    row, is returned as an empty string.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        One dict per data row with the keys ``date``, ``verified``,
        ``product_id``, ``rating``, ``title``, ``review_text`` and
        ``review_id``. If the file is missing the list is empty; if reading
        fails part-way, the rows read so far are returned. Both cases print a
        message instead of raising.
    """
    # Input column name -> key used in the returned dicts (order preserved).
    column_to_key = {
        "Date": "date",
        "Verified": "verified",
        "ProductID": "product_id",
        "Rating": "rating",
        "Title": "title",
        "ReviewText": "review_text",
        "ReviewID": "review_id",
    }
    reviews = []
    try:
        # utf-8-sig strips a leading BOM (otherwise the first header would be
        # read as "\ufeffDate"); newline="" is what the csv module requires
        # for correct handling of newlines embedded in quoted fields.
        with open(file_path, "r", encoding="utf-8-sig", newline="") as file:
            for row in csv.DictReader(file):
                # `or ""`: DictReader stores None for fields missing from a
                # short row, so a plain .get(column, "") default is not enough.
                reviews.append({
                    key: row.get(column) or ""
                    for column, key in column_to_key.items()
                })
        print(f"Successfully loaded {len(reviews)} reviews from {file_path}")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
    return reviews

def analyze_reviews(input_file: str, output_file: str):
    """Score every review in *input_file* and write the results to *output_file*.

    Reviews are loaded with ``load_reviews``; each review text is
    whitespace-normalised and stripped of double quotes before being passed to
    ``predict_sentiment``. The output CSV repeats the input columns (with the
    original, uncleaned review text) and appends ``SentimentLabel`` and
    ``SentimentScore`` (formatted to four decimals).

    Args:
        input_file: Path of the CSV file containing the reviews.
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        None. If no reviews are loaded, a message is printed and no output
        file is written.
    """
    reviews = load_reviews(input_file)

    if not reviews:
        print(f"No reviews found in {input_file}. Exiting analysis.")
        return

    # Output columns: the input columns plus the two sentiment fields.
    fieldnames = [
        "Date", "Verified", "ProductID", "Rating", "Title",
        "ReviewText", "ReviewID", "SentimentLabel", "SentimentScore"
    ]

    with open(output_file, "w", encoding="utf-8", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n--- Product Review Analysis - Start Time: {current_date_time} ---\n")
        print(f"Processing {len(reviews)} reviews...")

        for review in reviews:
            # A review loaded from a short CSV row can carry None instead of a
            # string; coerce it so the cleaning step below cannot crash and
            # leave a half-written output file.
            text = review.get("review_text") or ""

            # Basic text cleaning (collapse whitespace, drop double quotes)
            clean_text = ' '.join(text.split()).replace('"', '')

            # Predict sentiment and get normalized score
            sentiment_result = predict_sentiment(clean_text)

            # Write the original (uncleaned) text alongside the prediction.
            writer.writerow({
                "Date": review.get("date", ""),
                "ProductID": review.get("product_id", ""),
                "Verified": review.get("verified", ""),
                "Rating": review.get("rating", ""),
                "Title": review.get("title", ""),
                "ReviewText": text,
                "ReviewID": review.get("review_id", ""),
                "SentimentLabel": sentiment_result["label"],
                "SentimentScore": f"{sentiment_result['score']:.4f}"
            })

    print(f"\nAnalysis complete. Results saved to '{output_file}'")

if __name__ == "__main__":
    # Script entry point: score the Syngenta dataset and write the result CSV
    # to a path relative to the current working directory.
    input_file, output_file = (
        "/content/Syngenta_Extrapolated_Dataset.csv",
        "XLM_Syngenta_Output_Sentiment_Analysis.csv",
    )
    analyze_reviews(input_file=input_file, output_file=output_file)

Device set to use cpu


Successfully loaded 1000 reviews from /content/Syngenta_Extrapolated_Dataset.csv

--- Product Review Analysis - Start Time: 2025-07-12 12:21:41 ---

Processing 1000 reviews...

Analysis complete. Results saved to 'XLM_Syngenta_Output_Sentiment_Analysis.csv'
