<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/llama3_2_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using L4 GPU

In [None]:
from google.colab import auth, drive

# Mount Google Drive at /content/drive; the test CSV and the model checkpoints
# used by the cells below are read from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

# Run the Colab user-authentication flow for this session.
auth.authenticate_user()

In [None]:
# Install required packages
# transformers is uninstalled first and then pinned to 4.44.0
# (presumably to replace whatever version the Colab image ships -- confirm).
!pip uninstall -y transformers
!pip install transformers==4.44.0
# NOTE(review): torch/torchvision/torchaudio are unpinned here, so pip may
# swap out the runtime's preinstalled build -- confirm this is intended.
!pip install datasets scikit-learn matplotlib torch torchvision torchaudio
# accelerate backs the `device_map="auto"` model loading used in the cells below.
# NOTE(review): bitsandbytes is not referenced by the visible evaluation code --
# presumably needed for quantized loading elsewhere; verify before removing.
!pip install accelerate bitsandbytes

# Fine-tuned model evaluation

# 1,050 (evaluates `llama-3.2-3b-fine-tuning/checkpoint-144`)

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            container. Both tensors (e.g. a tokenizer called with
            ``return_tensors="pt"``) and nested lists are supported.
        labels: Sequence of class ids aligned with ``encodings`` by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its ``'labels'`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every feature of every sample; detach().clone() is the
                # recommended way to take the same independent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def read_csv_file(file_path):
    """Load the two-column test CSV as a DataFrame of strings.

    The file's own header row is discarded and the columns are renamed to
    ``body`` and ``inflation``; every value is read with ``dtype='object'``.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the reason is
        printed instead of raised).
    """
    try:
        return pd.read_csv(
            file_path,
            names=['body', 'inflation'],
            header=0,
            dtype='object',
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return None

# Define the IMF economist prompt (same as used in training).
# The template contains a single `{post}` placeholder, which
# format_with_prompt fills in through str.format.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; since this text is stated to match the training prompt,
# confirm against the training notebook before changing any wording here.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return INFLATION_PROMPT with ``post`` substituted for ``{post}``."""
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Class ids defined by INFLATION_PROMPT: 0 = deflation, 1 = neither, 2 = inflation.
CLASS_IDS = [0, 1, 2]

# Smaller batch size for Llama
BATCH_SIZE = 8

# Read data from CSV file (None means loading failed and was already reported)
test_data = read_csv_file(file_path)

if test_data is not None:
    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the tokenizer for Llama
    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B')

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the test data using formatted_body
    test_encodings = tokenizer(
        test_data['formatted_body'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Convert the string labels to integers (the CSV is read with dtype='object')
    test_labels = [int(label) for label in test_data['inflation']]

    # Create the test dataset
    test_dataset = TestDataset(test_encodings, test_labels)

    # Initialize the fine-tuned Llama model
    model = AutoModelForSequenceClassification.from_pretrained(
        '/content/drive/MyDrive/world-inflation/data/model/llama-3.2-3b-fine-tuning/checkpoint-144/',
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # The sequence-classification head locates the last non-padding token via
    # config.pad_token_id and cannot handle padded batches without it, so fall
    # back to the tokenizer's pad token when the checkpoint config lacks one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Set model to evaluation mode
    model.eval()

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Lists to store true and predicted labels
    true_labels = []
    predicted_labels = []

    print("Starting evaluation...")

    # Use the model to predict the labels of the test data
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != 'labels'}
            # Labels are only compared on the CPU, so they are not moved to the GPU.
            labels = batch['labels']

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predictions.cpu().tolist())

            # Print progress using the real sample count (the last batch may be partial)
            if (batch_idx + 1) % 5 == 0:
                print(f"Processed {len(true_labels)} samples...")

    # Calculate accuracy plus per-class recall, precision, and F1 score.
    # labels=CLASS_IDS pins one entry per class, in class-id order, even when a
    # class never occurs in this run; zero_division=0 reports such a class as
    # 0.0 instead of raising undefined-metric warnings.
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)

    # Micro averages pool the counts over all classes, so they are computed with
    # average='micro' rather than as the mean of the per-class scores (that mean
    # is the macro average).
    micro_recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)

    # Display classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=CLASS_IDS, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=CLASS_IDS))

    # Display metrics for each class and macro/micro averages.
    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for row, class_id in enumerate(CLASS_IDS):
        print(f"| Class {class_id}      |    {accuracy:.2f}   |   {recall[row]:.2f}   |   {precision[row]:.2f}   |   {f1[row]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {recall.mean():.2f}   |   {precision.mean():.2f}   |   {f1.mean():.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")
    print("Evaluation completed successfully!")
else:
    print("Failed to load test data. Please check the file path and format.")

# 520 (evaluates `llama-3.2-3b-fine-tuning-520/checkpoint-96`)

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            container. Both tensors (e.g. a tokenizer called with
            ``return_tensors="pt"``) and nested lists are supported.
        labels: Sequence of class ids aligned with ``encodings`` by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its ``'labels'`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every feature of every sample; detach().clone() is the
                # recommended way to take the same independent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def read_csv_file(file_path):
    """Load the two-column test CSV as a DataFrame of strings.

    The file's own header row is discarded and the columns are renamed to
    ``body`` and ``inflation``; every value is read with ``dtype='object'``.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the reason is
        printed instead of raised).
    """
    try:
        return pd.read_csv(
            file_path,
            names=['body', 'inflation'],
            header=0,
            dtype='object',
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return None

# Define the IMF economist prompt (same as used in training).
# The template contains a single `{post}` placeholder, which
# format_with_prompt fills in through str.format.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; since this text is stated to match the training prompt,
# confirm against the training notebook before changing any wording here.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return INFLATION_PROMPT with ``post`` substituted for ``{post}``."""
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Class ids defined by INFLATION_PROMPT: 0 = deflation, 1 = neither, 2 = inflation.
CLASS_IDS = [0, 1, 2]

# Smaller batch size for Llama
BATCH_SIZE = 8

# Read data from CSV file (None means loading failed and was already reported)
test_data = read_csv_file(file_path)

if test_data is not None:
    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the tokenizer for Llama
    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B')

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the test data using formatted_body
    test_encodings = tokenizer(
        test_data['formatted_body'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Convert the string labels to integers (the CSV is read with dtype='object')
    test_labels = [int(label) for label in test_data['inflation']]

    # Create the test dataset
    test_dataset = TestDataset(test_encodings, test_labels)

    # Initialize the fine-tuned Llama model
    model = AutoModelForSequenceClassification.from_pretrained(
        '/content/drive/MyDrive/world-inflation/data/model/llama-3.2-3b-fine-tuning-520/checkpoint-96/',
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # The sequence-classification head locates the last non-padding token via
    # config.pad_token_id and cannot handle padded batches without it, so fall
    # back to the tokenizer's pad token when the checkpoint config lacks one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Set model to evaluation mode
    model.eval()

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Lists to store true and predicted labels
    true_labels = []
    predicted_labels = []

    print("Starting evaluation...")

    # Use the model to predict the labels of the test data
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != 'labels'}
            # Labels are only compared on the CPU, so they are not moved to the GPU.
            labels = batch['labels']

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predictions.cpu().tolist())

            # Print progress using the real sample count (the last batch may be partial)
            if (batch_idx + 1) % 5 == 0:
                print(f"Processed {len(true_labels)} samples...")

    # Calculate accuracy plus per-class recall, precision, and F1 score.
    # labels=CLASS_IDS pins one entry per class, in class-id order, even when a
    # class never occurs in this run; zero_division=0 reports such a class as
    # 0.0 instead of raising undefined-metric warnings.
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)

    # Micro averages pool the counts over all classes, so they are computed with
    # average='micro' rather than as the mean of the per-class scores (that mean
    # is the macro average).
    micro_recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)

    # Display classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=CLASS_IDS, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=CLASS_IDS))

    # Display metrics for each class and macro/micro averages.
    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for row, class_id in enumerate(CLASS_IDS):
        print(f"| Class {class_id}      |    {accuracy:.2f}   |   {recall[row]:.2f}   |   {precision[row]:.2f}   |   {f1[row]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {recall.mean():.2f}   |   {precision.mean():.2f}   |   {f1.mean():.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")
    print("Evaluation completed successfully!")
else:
    print("Failed to load test data. Please check the file path and format.")

# 260 (evaluates `llama-3.2-3b-fine-tuning-260/checkpoint-36`)

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            container. Both tensors (e.g. a tokenizer called with
            ``return_tensors="pt"``) and nested lists are supported.
        labels: Sequence of class ids aligned with ``encodings`` by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its ``'labels'`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every feature of every sample; detach().clone() is the
                # recommended way to take the same independent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def read_csv_file(file_path):
    """Load the two-column test CSV as a DataFrame of strings.

    The file's own header row is discarded and the columns are renamed to
    ``body`` and ``inflation``; every value is read with ``dtype='object'``.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the reason is
        printed instead of raised).
    """
    try:
        return pd.read_csv(
            file_path,
            names=['body', 'inflation'],
            header=0,
            dtype='object',
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return None

# Define the IMF economist prompt (same as used in training).
# The template contains a single `{post}` placeholder, which
# format_with_prompt fills in through str.format.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; since this text is stated to match the training prompt,
# confirm against the training notebook before changing any wording here.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return INFLATION_PROMPT with ``post`` substituted for ``{post}``."""
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Class ids defined by INFLATION_PROMPT: 0 = deflation, 1 = neither, 2 = inflation.
CLASS_IDS = [0, 1, 2]

# Smaller batch size for Llama
BATCH_SIZE = 8

# Read data from CSV file (None means loading failed and was already reported)
test_data = read_csv_file(file_path)

if test_data is not None:
    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the tokenizer for Llama
    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B')

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the test data using formatted_body
    test_encodings = tokenizer(
        test_data['formatted_body'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Convert the string labels to integers (the CSV is read with dtype='object')
    test_labels = [int(label) for label in test_data['inflation']]

    # Create the test dataset
    test_dataset = TestDataset(test_encodings, test_labels)

    # Initialize the fine-tuned Llama model
    model = AutoModelForSequenceClassification.from_pretrained(
        '/content/drive/MyDrive/world-inflation/data/model/llama-3.2-3b-fine-tuning-260/checkpoint-36/',
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # The sequence-classification head locates the last non-padding token via
    # config.pad_token_id and cannot handle padded batches without it, so fall
    # back to the tokenizer's pad token when the checkpoint config lacks one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Set model to evaluation mode
    model.eval()

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Lists to store true and predicted labels
    true_labels = []
    predicted_labels = []

    print("Starting evaluation...")

    # Use the model to predict the labels of the test data
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != 'labels'}
            # Labels are only compared on the CPU, so they are not moved to the GPU.
            labels = batch['labels']

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predictions.cpu().tolist())

            # Print progress using the real sample count (the last batch may be partial)
            if (batch_idx + 1) % 5 == 0:
                print(f"Processed {len(true_labels)} samples...")

    # Calculate accuracy plus per-class recall, precision, and F1 score.
    # labels=CLASS_IDS pins one entry per class, in class-id order, even when a
    # class never occurs in this run; zero_division=0 reports such a class as
    # 0.0 instead of raising undefined-metric warnings.
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)

    # Micro averages pool the counts over all classes, so they are computed with
    # average='micro' rather than as the mean of the per-class scores (that mean
    # is the macro average).
    micro_recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)

    # Display classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=CLASS_IDS, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=CLASS_IDS))

    # Display metrics for each class and macro/micro averages.
    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for row, class_id in enumerate(CLASS_IDS):
        print(f"| Class {class_id}      |    {accuracy:.2f}   |   {recall[row]:.2f}   |   {precision[row]:.2f}   |   {f1[row]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {recall.mean():.2f}   |   {precision.mean():.2f}   |   {f1.mean():.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")
    print("Evaluation completed successfully!")
else:
    print("Failed to load test data. Please check the file path and format.")

# 130 (evaluates `llama-3.2-3b-fine-tuning-130/checkpoint-24`)

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            container. Both tensors (e.g. a tokenizer called with
            ``return_tensors="pt"``) and nested lists are supported.
        labels: Sequence of class ids aligned with ``encodings`` by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its ``'labels'`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every feature of every sample; detach().clone() is the
                # recommended way to take the same independent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def read_csv_file(file_path):
    """Load the two-column test CSV as a DataFrame of strings.

    The file's own header row is discarded and the columns are renamed to
    ``body`` and ``inflation``; every value is read with ``dtype='object'``.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the reason is
        printed instead of raised).
    """
    try:
        return pd.read_csv(
            file_path,
            names=['body', 'inflation'],
            header=0,
            dtype='object',
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return None

# Define the IMF economist prompt (same as used in training).
# The template contains a single `{post}` placeholder, which
# format_with_prompt fills in through str.format.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; since this text is stated to match the training prompt,
# confirm against the training notebook before changing any wording here.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return INFLATION_PROMPT with ``post`` substituted for ``{post}``."""
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Class ids defined by INFLATION_PROMPT: 0 = deflation, 1 = neither, 2 = inflation.
CLASS_IDS = [0, 1, 2]

# Smaller batch size for Llama
BATCH_SIZE = 8

# Read data from CSV file (None means loading failed and was already reported)
test_data = read_csv_file(file_path)

if test_data is not None:
    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the tokenizer for Llama
    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B')

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the test data using formatted_body
    test_encodings = tokenizer(
        test_data['formatted_body'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Convert the string labels to integers (the CSV is read with dtype='object')
    test_labels = [int(label) for label in test_data['inflation']]

    # Create the test dataset
    test_dataset = TestDataset(test_encodings, test_labels)

    # Initialize the fine-tuned Llama model
    model = AutoModelForSequenceClassification.from_pretrained(
        '/content/drive/MyDrive/world-inflation/data/model/llama-3.2-3b-fine-tuning-130/checkpoint-24/',
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # The sequence-classification head locates the last non-padding token via
    # config.pad_token_id and cannot handle padded batches without it, so fall
    # back to the tokenizer's pad token when the checkpoint config lacks one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Set model to evaluation mode
    model.eval()

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Lists to store true and predicted labels
    true_labels = []
    predicted_labels = []

    print("Starting evaluation...")

    # Use the model to predict the labels of the test data
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != 'labels'}
            # Labels are only compared on the CPU, so they are not moved to the GPU.
            labels = batch['labels']

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predictions.cpu().tolist())

            # Print progress using the real sample count (the last batch may be partial)
            if (batch_idx + 1) % 5 == 0:
                print(f"Processed {len(true_labels)} samples...")

    # Calculate accuracy plus per-class recall, precision, and F1 score.
    # labels=CLASS_IDS pins one entry per class, in class-id order, even when a
    # class never occurs in this run; zero_division=0 reports such a class as
    # 0.0 instead of raising undefined-metric warnings.
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)

    # Micro averages pool the counts over all classes, so they are computed with
    # average='micro' rather than as the mean of the per-class scores (that mean
    # is the macro average).
    micro_recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)

    # Display classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=CLASS_IDS, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=CLASS_IDS))

    # Display metrics for each class and macro/micro averages.
    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for row, class_id in enumerate(CLASS_IDS):
        print(f"| Class {class_id}      |    {accuracy:.2f}   |   {recall[row]:.2f}   |   {precision[row]:.2f}   |   {f1[row]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {recall.mean():.2f}   |   {precision.mean():.2f}   |   {f1.mean():.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")
    print("Evaluation completed successfully!")
else:
    print("Failed to load test data. Please check the file path and format.")

# 65 (evaluates `llama-3.2-3b-fine-tuning-65/checkpoint-12`)

In [None]:
import torch
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            container. Both tensors (e.g. a tokenizer called with
            ``return_tensors="pt"``) and nested lists are supported.
        labels: Sequence of class ids aligned with ``encodings`` by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of sample ``idx`` plus its ``'labels'`` tensor."""
        item = {}
        for key, values in self.encodings.items():
            value = values[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every feature of every sample; detach().clone() is the
                # recommended way to take the same independent copy.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

def read_csv_file(file_path):
    """Load the two-column test CSV as a DataFrame of strings.

    The file's own header row is discarded and the columns are renamed to
    ``body`` and ``inflation``; every value is read with ``dtype='object'``.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the reason is
        printed instead of raised).
    """
    try:
        return pd.read_csv(
            file_path,
            names=['body', 'inflation'],
            header=0,
            dtype='object',
        )
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return None

# Define the IMF economist prompt (same as used in training).
# The template contains a single `{post}` placeholder, which
# format_with_prompt fills in through str.format.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; since this text is stated to match the training prompt,
# confirm against the training notebook before changing any wording here.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return INFLATION_PROMPT with ``post`` substituted for ``{post}``."""
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Class ids defined by INFLATION_PROMPT: 0 = deflation, 1 = neither, 2 = inflation.
CLASS_IDS = [0, 1, 2]

# Smaller batch size for Llama
BATCH_SIZE = 8

# Read data from CSV file (None means loading failed and was already reported)
test_data = read_csv_file(file_path)

if test_data is not None:
    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the tokenizer for Llama
    tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B')

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Encode the test data using formatted_body
    test_encodings = tokenizer(
        test_data['formatted_body'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    # Convert the string labels to integers (the CSV is read with dtype='object')
    test_labels = [int(label) for label in test_data['inflation']]

    # Create the test dataset
    test_dataset = TestDataset(test_encodings, test_labels)

    # Initialize the fine-tuned Llama model
    model = AutoModelForSequenceClassification.from_pretrained(
        '/content/drive/MyDrive/world-inflation/data/model/llama-3.2-3b-fine-tuning-65/checkpoint-12/',
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # The sequence-classification head locates the last non-padding token via
    # config.pad_token_id and cannot handle padded batches without it, so fall
    # back to the tokenizer's pad token when the checkpoint config lacks one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Set model to evaluation mode
    model.eval()

    # Create a DataLoader for the test dataset
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Lists to store true and predicted labels
    true_labels = []
    predicted_labels = []

    print("Starting evaluation...")

    # Use the model to predict the labels of the test data
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            inputs = {key: val.to(model.device) for key, val in batch.items() if key != 'labels'}
            # Labels are only compared on the CPU, so they are not moved to the GPU.
            labels = batch['labels']

            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=-1)

            true_labels.extend(labels.tolist())
            predicted_labels.extend(predictions.cpu().tolist())

            # Print progress using the real sample count (the last batch may be partial)
            if (batch_idx + 1) % 5 == 0:
                print(f"Processed {len(true_labels)} samples...")

    # Calculate accuracy plus per-class recall, precision, and F1 score.
    # labels=CLASS_IDS pins one entry per class, in class-id order, even when a
    # class never occurs in this run; zero_division=0 reports such a class as
    # 0.0 instead of raising undefined-metric warnings.
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average=None, zero_division=0)

    # Micro averages pool the counts over all classes, so they are computed with
    # average='micro' rather than as the mean of the per-class scores (that mean
    # is the macro average).
    micro_recall = recall_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, labels=CLASS_IDS, average='micro', zero_division=0)

    # Display classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=CLASS_IDS, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=CLASS_IDS))

    # Display metrics for each class and macro/micro averages.
    # The Accuracy column repeats the overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for row, class_id in enumerate(CLASS_IDS):
        print(f"| Class {class_id}      |    {accuracy:.2f}   |   {recall[row]:.2f}   |   {precision[row]:.2f}   |   {f1[row]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Macro Average|    {accuracy:.2f}   |   {recall.mean():.2f}   |   {precision.mean():.2f}   |   {f1.mean():.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")
    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")
    print("Evaluation completed successfully!")
else:
    print("Failed to load test data. Please check the file path and format.")

# Zero-shot model

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix
import re
from tqdm import tqdm

def read_csv_file(file_path):
    """Load the labelled test CSV into a DataFrame.

    The file's own header row is discarded and the two columns are renamed
    to ``body`` and ``inflation``. Every value is read as ``object`` so the
    labels arrive as strings.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing or any
        other error occurs while reading (a message is printed in both cases).
    """
    column_names = ['body', 'inflation']
    try:
        frame = pd.read_csv(file_path, names=column_names, header=0, dtype='object')
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except Exception as e:
        # Best-effort loader: report the problem and let the caller branch on None.
        print(f"An error occurred: {e}")
    else:
        return frame
    return None

# Define the IMF economist prompt (same as used in training).
# The template describes the three labels (0 = deflation, 2 = inflation,
# 1 = neither) and ends with a "Classification:" cue for the model to complete.
# `{post}` is a str.format placeholder that format_with_prompt() fills in.
# NOTE(review): "shortage of goods of services" looks like a typo for
# "goods or services"; it is left untouched here because the wording is meant
# to stay identical to the training prompt — confirm before changing.
INFLATION_PROMPT = """You are a chief economist at the IMF. I would like you to infer the public perception of inflation from Reddit posts. Please classify each Reddit post into one of the following categories: 0: The post indicates deflation, such as the lower price of goods or services (e.g., "the prices are not bad"), affordable services (e.g., "this champagne is cheap and delicious"), sales information (e.g., "you can get it for only 10 dollars."), or a declining and buyer's market. 2: The post indicates or includes inflation, such as the higher price of goods or services (e.g., "it's not cheap"), the unreasonable cost of goods or services (e.g., "the food is overpriced and cold"), consumers struggling to afford necessities (e.g., "items are too expensive to buy"), shortage of goods of services, or mention about an asset bubble. 1: The post indicates neither deflation (0) nor inflation (2). This category also includes just questions to a community, social statements not personal experience, factual observations, references to originally expensive or cheap goods or services (e.g., "a gorgeous and costly dinner" or "an affordable Civic"), website promotion, authors' wishes, or illogical text. Please choose a stronger stance when the text includes both 0 and 2 stances. If these stances are of the same degree, answer 1.

Reddit Post: {post}

Classification:"""

def format_with_prompt(post):
    """Return the classification prompt with *post* inserted.

    Args:
        post: Text of one Reddit post; substituted for the ``{post}``
            placeholder of ``INFLATION_PROMPT``.

    Returns:
        The complete prompt string, ending with the ``Classification:`` cue.
    """
    filled_prompt = INFLATION_PROMPT.format(post=post)
    return filled_prompt

def extract_classification(response_text):
    """Extract the class label (0, 1 or 2) from a model response.

    The label is located in this order:

    1. If the response itself begins with a label (``"2"``, ``"0."``,
       ``"1: ..."``), that label is the model's answer. Anything after it,
       including further "Reddit Post ... Classification: N" examples the
       model may invent while it keeps generating, is ignored.
    2. Otherwise, if the text contains ``"Classification:"`` (e.g. the prompt
       was decoded together with the answer), only the text after the
       *first* marker is inspected, so labels listed inside the prompt
       instructions are skipped and later invented examples cannot override
       the first answer.
    3. The first standalone ``0``/``1``/``2`` in the inspected text wins.
    4. As a lenient last resort, the first ``0``/``1``/``2`` character is
       used even when it is part of a longer token (e.g. ``"10"``).

    Args:
        response_text: Raw text produced by the model. ``None`` or an empty
            string is treated as "no answer".

    Returns:
        The extracted label as an ``int``; ``1`` (neutral) when nothing
        usable is found.
    """
    marker = "Classification:"
    text = (response_text or "").strip()

    # Only cut at the marker when the response does not already open with a
    # label; the answer that comes first is the one that counts.
    starts_with_label = re.match(r'[012]\b', text) is not None
    if not starts_with_label and marker in text:
        text = text.split(marker, 1)[1]

    # First digit 0/1/2 that stands on its own (word boundaries on both sides).
    standalone = re.search(r'\b([012])\b', text)
    if standalone:
        return int(standalone.group(1))

    # Lenient fallback: any 0/1/2 character, even inside a larger number.
    for char in text:
        if char in '012':
            return int(char)

    # Default to 1 (neutral) if no classification found
    return 1

def generate_prediction(model, tokenizer, prompt, max_new_tokens=50):
    """Run one zero-shot generation and return only the newly generated text.

    Args:
        model: Causal language model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt
            and decode the output.
        prompt: Full prompt string to complete.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, without the prompt and without special
        tokens.
    """
    # Encode the prompt, capped at 512 tokens.
    # NOTE(review): if the tokenizer truncates from the right, a very long
    # post would lose the trailing "Classification:" cue — confirm
    # tokenizer.truncation_side / typical post lengths.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    # Move every input tensor to the device the model lives on.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(model.device)

    with torch.no_grad():
        sequences = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,  # Low temperature for more deterministic outputs
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens; slice them off so
    # only the generated part is decoded.
    prompt_length = model_inputs['input_ids'].shape[1]
    new_tokens = sequences[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

In [None]:
# Zero-shot evaluation of the base model on the labelled test set:
# load data -> build prompts -> generate one answer per post -> report metrics.

# Test data file path
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv'

# Read data from CSV file (read_csv_file returns None when loading fails)
test_data = read_csv_file(file_path)

if test_data is not None:
    print(f"Loaded {len(test_data)} test samples")

    # Format test data with the same prompt used during training
    test_data['formatted_body'] = test_data['body'].apply(format_with_prompt)

    # Initialize the base Llama model and tokenizer
    model_name = 'meta-llama/Llama-3.2-3B'
    print(f"Loading base model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    # Add padding token if it doesn't exist
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Set model to evaluation mode
    model.eval()

    # Convert the string labels to integers
    test_labels = [int(label) for label in test_data['inflation']]

    # Lists to store true and predicted labels
    true_labels = test_labels
    predicted_labels = []

    print("Starting zero-shot evaluation...")

    # Process each sample individually for zero-shot inference.
    # idx is the DataFrame index, which is positional (0..n-1) for a frame
    # freshly loaded by read_csv_file, so it can index test_labels directly.
    for idx, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Processing samples"):
        prompt = row['formatted_body']

        # Generate prediction
        response = generate_prediction(model, tokenizer, prompt)

        # Extract classification from response
        prediction = extract_classification(response)
        predicted_labels.append(prediction)

        # Print first few examples for debugging
        if idx < 5:
            print(f"\n--- Sample {idx + 1} ---")
            print(f"True label: {test_labels[idx]}")
            print(f"Generated response: {response[:100]}...")
            print(f"Extracted prediction: {prediction}")

    # Pin the label set for every per-class metric. Without `labels=`,
    # scikit-learn returns one entry per label that actually occurs in the
    # data, so a missing class would shift the remaining values into the
    # wrong "Class i" row of the table below.
    class_labels = [0, 1, 2]

    # Calculate and display accuracy, recall, precision, and F1 score
    accuracy = accuracy_score(true_labels, predicted_labels)
    recall = recall_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    precision = precision_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, labels=class_labels, average=None, zero_division=0)

    # Display classification report and confusion matrix
    print("\n" + "="*50)
    print("ZERO-SHOT EVALUATION RESULTS")
    print("="*50)
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=class_labels, zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(true_labels, predicted_labels, labels=class_labels))

    # Display metrics for each class and macro/micro averages.
    # The "Accuracy" column repeats the single overall accuracy on every row.
    print("\n+--------------+-----------+----------+----------+----------+")
    print("|   Metric     | Accuracy  |  Recall  | Precision|  F1 Score |")
    print("+--------------+-----------+----------+----------+----------+")
    for position, i in enumerate(class_labels):
        # position indexes the metric arrays; i is the class shown in the row.
        print(f"| Class {i}      |    {accuracy:.2f}   |   {recall[position]:.2f}   |   {precision[position]:.2f}   |   {f1[position]:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    # Macro averages: unweighted mean over the three classes (a class with
    # no samples and no predictions contributes 0 via zero_division=0).
    macro_recall = recall.mean()
    macro_precision = precision.mean()
    macro_f1 = f1.mean()

    print(f"| Macro Average|    {accuracy:.2f}   |   {macro_recall:.2f}   |   {macro_precision:.2f}   |   {macro_f1:.2f}   |")

    # Calculate micro averages
    micro_recall = recall_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_precision = precision_score(true_labels, predicted_labels, average='micro', zero_division=0)
    micro_f1 = f1_score(true_labels, predicted_labels, average='micro', zero_division=0)

    print(f"| Micro Average|    {accuracy:.2f}   |   {micro_recall:.2f}   |   {micro_precision:.2f}   |   {micro_f1:.2f}   |")
    print("+--------------+-----------+----------+----------+----------+")

    print(f"\nTotal test samples: {len(true_labels)}")

    # Print distribution of predictions
    from collections import Counter
    true_dist = Counter(true_labels)
    pred_dist = Counter(predicted_labels)

    print("\nLabel Distribution:")
    print(f"True labels:      {dict(true_dist)}")
    print(f"Predicted labels: {dict(pred_dist)}")

    print("\nZero-shot evaluation completed successfully!")
    print("\nNote: This uses the base Llama-3.2-3B model without fine-tuning.")
    print("Compare these results with your fine-tuned model performance.")

else:
    print("Failed to load test data. Please check the file path and format.")