**Essential libraries**

In [None]:
pip install transformers
pip install torch
pip install pandas
pip install scipy

**Data loading and preprocessing**

*   Optional: run this step only if the data has not already been loaded and preprocessed.




In [None]:
import pandas as pd
import re

def preprocess_text(text):
    """
    Normalize one raw document into lowercase, letters-only text.

    Steps, in order: lowercase, drop URLs (tokens starting with ``http``),
    drop @mentions and #hashtags, drop every character that is not ``a-z``
    or whitespace, then collapse runs of whitespace and trim both ends.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``; missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing.
    """
    # Without this guard str() would turn a missing value into the literal
    # words "none"/"nan", which would then be analyzed as if they were text.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                 # Lowercase
    text = re.sub(r'http\S+', '', text)      # Remove URLs
    text = re.sub(r'@\w+', '', text)         # Remove mentions
    text = re.sub(r'#\w+', '', text)         # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text)     # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# Read the already keyword-filtered documents (tariff/industry terms) from CSV.
df = pd.read_csv("your_filtered_text_data.csv")

# Clean every raw document and keep the result next to the original text.
df['processed_text'] = [preprocess_text(raw_text) for raw_text in df['text']]

print("Preprocessing complete.")

# Show the first rows, raw and cleaned side by side, as a quick sanity check.
preview = df[['text', 'processed_text']].head()
print(preview)

**Load the Pre-trained ProsusAI/FinBERT Model**

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name passed to both from_pretrained() calls below, so the
# tokenizer and the model always come from the same checkpoint.
MODEL_NAME = "ProsusAI/finbert"

# Tokenizer for the checkpoint; used later to turn raw text into model inputs
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model for the checkpoint; its config carries the
# id-to-label mapping that the sentiment function reads
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

print(f"Loaded model: {MODEL_NAME}")

**Creating the Sentiment Analysis Function**

In [None]:
import torch
from scipy.special import softmax

def get_finbert_sentiment(text_list, batch_size=32):
    """
    Score texts with the module-level FinBERT ``model`` and ``tokenizer``.

    The texts are processed in mini-batches so that memory use is bounded by
    ``batch_size`` instead of by the size of the whole corpus.

    Args:
        text_list: An iterable of strings, or a single string (treated as a
            one-item list).
        batch_size: Number of texts sent through the model per forward pass.
            Must be a positive integer.

    Returns:
        list[dict]: One dict per input text, in input order, with the keys
        ``label`` (predicted class name), ``sentiment_score``
        (P(positive) - P(negative), in [-1, 1]), ``positive_prob``,
        ``negative_prob`` and ``neutral_prob``. An empty input yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is not positive, or the model config
            does not define positive/negative/neutral labels.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be sliced character by character below.
    if isinstance(text_list, str):
        text_list = [text_list]
    texts = list(text_list)
    if not texts:
        return []

    # id2label is a dict {class_index: label_name}; invert it to find the
    # column of each class (a dict has no .index() method).
    id2label = {int(idx): str(name) for idx, name in model.config.id2label.items()}
    label_to_idx = {name.lower(): idx for idx, name in id2label.items()}
    missing = {'positive', 'negative', 'neutral'} - set(label_to_idx)
    if missing:
        raise ValueError(f"Model config is missing labels: {sorted(missing)}")
    pos_idx = label_to_idx['positive']
    neg_idx = label_to_idx['negative']
    neu_idx = label_to_idx['neutral']

    # Keep inputs on the same device as the model (None if the model object
    # does not expose one, in which case tensors are left where they are).
    device = getattr(model, 'device', None)

    results = []

    # We run this in "no_grad" mode, as we are not training the model
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]

            # 1. Tokenize the batch (padded to its longest member)
            inputs = tokenizer(batch,
                               padding=True,
                               truncation=True,
                               max_length=128,         # change len if needed
                               return_tensors="pt")
            if device is not None:
                inputs = {key: value.to(device) for key, value in inputs.items()}

            # 2. Get model outputs (logits)
            logits = model(**inputs).logits

            # 3. Convert logits to probabilities (0-1) using softmax;
            #    .cpu() makes the conversion to NumPy work on any device
            scores = softmax(logits.cpu().numpy(), axis=1)

            # 4. Map probabilities to labels
            for score in scores:
                prob_pos = float(score[pos_idx])
                prob_neg = float(score[neg_idx])
                prob_neu = float(score[neu_idx])

                # 5. Continuous score: (+1 * P(pos)) - (1 * P(neg))
                results.append({
                    'label': id2label[int(score.argmax())],
                    'sentiment_score': prob_pos - prob_neg,
                    'positive_prob': prob_pos,
                    'negative_prob': prob_neg,
                    'neutral_prob': prob_neu
                })

    return results

**Run Analysis and Aggregate to Daily Index**

In [None]:
# 1. Collect the cleaned documents into a plain list
# (the whole list goes to the model in one call instead of one call per text)
texts_to_analyze = df['processed_text'].tolist()

# 2. Score every document with FinBERT
print("Running FinBERT analysis... This may take a while.")
sentiment_results = get_finbert_sentiment(texts_to_analyze)
print("Analysis complete.")

# 3. Attach the per-document results as extra columns.
# The index is reset so rows line up positionally with the result rows.
sentiment_df = pd.DataFrame(sentiment_results)
df_positional = df.reset_index(drop=True)
df = pd.concat([df_positional, sentiment_df], axis='columns')

# 4. Build the daily sentiment index
# The date column must be datetime before its calendar date can be extracted
df['doc_date'] = pd.to_datetime(df['doc_date'])

# Mean sentiment score per (industry, calendar day); the day key keeps the
# name 'doc_date', which is renamed together with the score column
calendar_day = df['doc_date'].dt.date
scores_by_industry_day = df.groupby(['industry', calendar_day])['sentiment_score']
daily_sentiment_index = (
    scores_by_industry_day
    .mean()
    .reset_index()
    .rename(columns={'doc_date': 'date', 'sentiment_score': 'S_i_t'})
)

# 5. Persist the index
daily_sentiment_index.to_csv("daily_industry_sentiment_index.csv", index=False)

print("\n--- Daily Industry Sentiment Index (S_i,t) ---")
index_preview = daily_sentiment_index.head()
print(index_preview)

This outputs a CSV file (daily_industry_sentiment_index.csv) with three columns: industry, date, and S_i_t.

This `S_i_t` (sentiment index) is the exact variable needed to proceed with the next step, plotting: "Daily sentiment index... with vertical lines for tariff dates".

