In [None]:
!pip install word2number

Collecting word2number
  Downloading word2number-1.1.zip (9.7 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: word2number
  Building wheel for word2number (setup.py) ... done
  Created wheel for word2number: filename=word2number-1.1-py3-none-any.whl size=5568 sha256=034af0bd929644cd8be142c7e78b1e8abdb6f9aad5837f52d25a82dc713353a0
  Stored in directory: /root/.cache/pip/wheels/84/ff/26/d3cfbd971e96c5aa3737ecfced81628830d7359b55fbb8ca3b
Successfully built word2number
Installing collected packages: word2number
Successfully installed word2number-1.1


In [None]:
import joblib
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from word2number import w2n
import spacy

In [3]:
# Restore the serialized inference bundle.
# NOTE: joblib.load unpickles the file, so 'sentiment_model.pkl' must come from
# a trusted source.
loaded_data = joblib.load('sentiment_model.pkl')

# Pull out the four entries the rest of the notebook relies on: the predictor
# (`model`), the tokenizer / encoder pair used to build embeddings, and the
# fixed feature-vector length (`max_len`).
model, tokenizer, bert_model, max_len = (
    loaded_data[key] for key in ('model', 'tokenizer', 'bert_model', 'max_len')
)

In [4]:
# Shared spaCy English pipeline; preprocess_text and parse_numerical_info both
# run their input through it.
nlp = spacy.load("en_core_web_sm")

In [5]:
def preprocess_text(text):
    """Normalize ``text`` into a single space-joined string of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. Tokens flagged as stop words or punctuation are discarded; the
    lemma of every remaining token is kept in its original order.

    Args:
        text: The raw string to normalize.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    lemmas = []
    for token in nlp(text.lower()):
        # Skip tokens that carry no content for the downstream embedding.
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [6]:
def get_bert_embeddings(sentences, tokenizer, model, max_length=50):
    """Embed each sentence as the encoder's hidden state at sequence position 0.

    Args:
        sentences: List of strings to embed.
        tokenizer: Callable tokenizer; invoked with ``return_tensors='pt'``,
            ``padding=True``, ``truncation=True`` and ``max_length``.
        model: Encoder called as ``model(**inputs)``; its result must expose
            ``last_hidden_state`` indexed as (batch, position, hidden).
        max_length: Token limit passed to the tokenizer. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        A 2-D NumPy array with one row per input sentence, holding the hidden
        state of the first token (presumably the [CLS] token for a BERT
        tokenizer -- confirm for other tokenizers).
    """
    inputs = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # If the model reports a device (Hugging Face models expose `.device`),
    # move the inputs there so a GPU-resident model does not fail on CPU inputs.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    first_token_state = outputs.last_hidden_state[:, 0, :]
    # .numpy() only works on CPU tensors that do not require grad; detach/cpu
    # are no-ops when that is already the case.
    return first_token_state.detach().cpu().numpy()

In [7]:
def parse_numerical_info(sentence):
    """Collect the numbers mentioned in ``sentence`` as a NumPy array.

    The sentence is tokenized with the module-level spaCy pipeline ``nlp``.
    Every token spaCy marks as number-like is passed to
    ``w2n.word_to_num``; tokens it rejects with ``ValueError`` are skipped.

    Args:
        sentence: The string to scan for numbers.

    Returns:
        An array with one entry per successfully converted token, in order of
        appearance. When nothing is converted, ``np.zeros(3)`` is returned
        instead, so the result length is not fixed.

    NOTE(review): number-like tokens that word_to_num cannot convert are
    dropped silently -- confirm this matches how the model's training
    features were built before changing it.
    """
    found = []
    for token in nlp(sentence):
        if not token.like_num:
            continue
        try:
            found.append(w2n.word_to_num(token.text))
        except ValueError:
            # word2number could not interpret this token; ignore it.
            pass

    if not found:
        return np.zeros(3)
    return np.array(found)

In [8]:
def combine_features(bert_embedding, numerical_values, aspect):
    """Build one fixed-length feature vector for a (sentence, aspect) pair.

    The vector is the concatenation of, in order: the sentence embedding, the
    numeric values (zero-padded on the right to at least three entries), and
    an embedding of ``aspect`` computed with the module-level ``tokenizer``
    and ``bert_model``. The result is then cut or zero-padded on the right to
    exactly ``max_len`` entries.

    Args:
        bert_embedding: 1-D array holding the sentence embedding.
        numerical_values: 1-D array of numbers extracted from the sentence.
        aspect: The aspect string to embed.

    Returns:
        A 1-D array of length ``max_len``.

    NOTE(review): when ``max_len`` is smaller than the concatenated length,
    the tail of the vector (the aspect embedding first) is discarded --
    confirm ``max_len`` is large enough for all three parts.
    """
    # Guarantee at least three numeric slots; longer inputs are left as-is.
    shortfall = 3 - len(numerical_values)
    if shortfall > 0:
        numerical_values = np.pad(numerical_values, (0, shortfall), 'constant')

    aspect_embedding = get_bert_embeddings([aspect], tokenizer, bert_model).flatten()

    parts = [bert_embedding, numerical_values, aspect_embedding]
    combined = np.concatenate(parts)[:max_len]

    # Right-pad with zeros when the concatenation is shorter than max_len.
    deficit = max_len - len(combined)
    if deficit > 0:
        combined = np.pad(combined, (0, deficit), 'constant')
    return combined

def predict_sentiment(sentences, aspects):
    """Predict one sentiment score per (sentence, aspect) pair.

    Each sentence is normalized with ``preprocess_text``, embedded with
    ``get_bert_embeddings``, scanned for numbers with
    ``parse_numerical_info``, and merged with its aspect by
    ``combine_features``. The stacked feature matrix is passed to the
    module-level ``model``.

    Args:
        sentences: Iterable of sentence strings.
        aspects: Iterable of aspect strings, aligned one-to-one with
            ``sentences``.

    Returns:
        A 1-D NumPy array of predictions, one per input pair. An empty array
        is returned when no pairs are supplied.

    Raises:
        ValueError: If ``sentences`` and ``aspects`` differ in length.
    """
    sentences = list(sentences)
    aspects = list(aspects)

    # zip() would silently drop the unmatched tail, returning fewer scores
    # than the caller supplied inputs; fail loudly instead.
    if len(sentences) != len(aspects):
        raise ValueError(
            f"sentences and aspects must have the same length "
            f"(got {len(sentences)} and {len(aspects)})"
        )

    # np.vstack rejects an empty list, so short-circuit the no-input case.
    if not sentences:
        return np.array([])

    features = []
    for sentence, aspect in zip(sentences, aspects):
        preprocessed_sentence = preprocess_text(sentence)
        bert_embedding = get_bert_embeddings(
            [preprocessed_sentence], tokenizer, bert_model
        ).flatten()
        # Numbers are extracted from the preprocessed text, not the raw
        # sentence. NOTE(review): confirm this mirrors training.
        numerical_info = parse_numerical_info(preprocessed_sentence)
        features.append(combine_features(bert_embedding, numerical_info, aspect))

    feature_matrix = np.vstack(features)
    return model.predict(feature_matrix).flatten()

In [16]:
# Example headlines to score.
sentences = [
    "Stakes High for AstraZeneca Heart Drug Facing Tough Competition",
    "CBI books Adani Enterprises, 3 ex-NCCF officials for alleged irregularities in coal supply contract",
    "AstraZeneca shares climb 3% as drug maker ups profits forecasts",
]

# Aspect for each headline, aligned by position with `sentences`.
aspects = [
    "AstraZeneca",
    "Adani Enterprises",
    "AstraZeneca",
]

predicted_scores = predict_sentiment(sentences, aspects)

# Print each headline with its aspect and predicted score, separated by a blank line.
for i, score in enumerate(predicted_scores):
    print(f"Sentence: {sentences[i]}")
    print(f"Aspect: {aspects[i]}")
    print(f"Predicted Sentiment Score: {score}")
    print()

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 22ms/step
Sentence: Stakes High for AstraZeneca Heart Drug Facing Tough Competition
Aspect: AstraZeneca
Predicted Sentiment Score: -0.3016662299633026

Sentence: CBI books Adani Enterprises, 3 ex-NCCF officials for alleged irregularities in coal supply contract
Aspect: Adani Enterprises
Predicted Sentiment Score: -0.3093346953392029

Sentence: AstraZeneca shares climb 3% as drug maker ups profits forecasts
Aspect: AstraZeneca
Predicted Sentiment Score: 0.22861944139003754

