In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model
model_name = "raruidol/ArgumentMining-EN-ARI-Fin-Essay"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [20]:
import pandas as pd
df = pd.read_csv("data/gpt_fake_recommendations.csv")

In [3]:
# Load your DataFrame and select the column with the text
text_data = df['Commentary'].tolist()

In [4]:
text_data

["Despite promising growth in cost management, concerns about technological innovation suggest a hold on AAPL. The company's performance in technology will likely depend on how well it navigates these issues. Hold recommendation, with a target price of 121.35.",
 'AAPL is benefiting from investment in research and development and strong demand. This positions the company for further growth, especially with strong tailwinds in technology. Analysts recommend a buy, with a target price of 371.59.',
 "AAPL's performance in technology has been steady, but risks surrounding technological innovation and expansion into international markets make it advisable to hold for now. The target price is set at 202.52.",
 'AAPL faces uncertainties in shift in consumer preferences and challenges from strong demand, leading analysts to recommend holding for now. While long-term prospects in technology remain strong, the near-term outlook warrants caution. Target price: 355.88.',
 "AAPL's challenges in exp

In [5]:
# Tokenize the text data
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

# Tokenize the text column
tokenized_texts = tokenize_texts(text_data)


In [6]:
tokenized_texts

{'input_ids': tensor([[    0,   262, 61518,  ...,     1,     1,     1],
        [    0, 56558, 21130,  ...,     1,     1,     1],
        [    0, 56558, 21130,  ...,     1,     1,     1],
        ...,
        [    0, 29426,   441,  ...,     1,     1,     1],
        [    0, 29426,   441,  ...,     1,     1,     1],
        [    0, 29426,   441,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [7]:
def predict_arguments(tokenized_texts):
    with torch.no_grad():
        input_ids = tokenized_texts['input_ids'].to(device)
        attention_mask = tokenized_texts['attention_mask'].to(device)
        
        # Run the model on the tokenized text
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # Apply softmax to get probabilities
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
        # Get the predicted class (e.g., 0 for premise, 1 for claim)
        predictions = torch.argmax(probs, dim=1)
        
        return predictions.cpu().numpy()

# Run the predictions
predictions = predict_arguments(tokenized_texts)


In [8]:
predictions

array([2, 0, 0, ..., 0, 0, 0])

In [9]:
# Add predictions to the DataFrame
df['argument_type'] = ['Claim' if pred == 1 else 'Premise' for pred in predictions]

In [10]:
df['stance'] = ['Positive' if pred == 2 else 'Negative' for pred in predictions]

In [11]:
df

Unnamed: 0,Company,Short Name,Date,Target Price,Recommendation,Commentary,argument_type,stance
0,Apple Inc.,AAPL,2023-01-15,121.35,Hold,"Despite promising growth in cost management, c...",Premise,Positive
1,Apple Inc.,AAPL,2023-02-15,371.59,Buy,AAPL is benefiting from investment in research...,Premise,Negative
2,Apple Inc.,AAPL,2023-03-15,202.52,Hold,AAPL's performance in technology has been stea...,Premise,Negative
3,Apple Inc.,AAPL,2023-04-15,355.88,Hold,AAPL faces uncertainties in shift in consumer ...,Premise,Positive
4,Apple Inc.,AAPL,2023-05-15,292.45,Sell,AAPL's challenges in expansion into internatio...,Claim,Negative
...,...,...,...,...,...,...,...,...
1231,Exelon Corporation,EXC,2023-08-15,242.37,Sell,With ongoing issues like regulatory pressure a...,Premise,Negative
1232,Exelon Corporation,EXC,2023-09-15,248.48,Buy,EXC's continued growth in environmental regula...,Premise,Negative
1233,Exelon Corporation,EXC,2023-10-15,148.36,Hold,EXC faces uncertainties in shift in consumer p...,Premise,Negative
1234,Exelon Corporation,EXC,2023-11-15,143.41,Buy,EXC is benefiting from labor shortages and str...,Premise,Negative


In [12]:
#df.to_csv("testing.csv")

In [25]:
import pandas as pd
from keybert import KeyBERT

# Initialize KeyBERT model
kw_model = KeyBERT()

# Function to extract keywords or key phrases
def extract_keywords(text, ngram_range=(1, 4), top_n=5):
    keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=ngram_range, stop_words='english', top_n=top_n)
    # Return just the keywords (without similarity scores)
    return [kw for kw in keywords]

In [28]:
# Apply the function to the 'text' column
sentence = """ Q1 Results: Jazz Pharmaceuticals reported strong Q1 results on Thursday, with
revenue of $247 million vs. $196 million in Q1 of FY 13, representing 26% YoY growth.
The GAAP Net Loss was $93 million vs. GAAP Net Income of $43 million in Q1 of FY
13, but $127 million of that was due to an upfront license fee and milestone payment
for JZP-110. Adjusted Net Income was $101 million vs. $84 million in Q1 of FY 13.
These figures were slightly ahead of Goldman Stanley estimates of $240 million in
revenue and $95 million in Adjusted Net Income.
■ FY 14 Guidance: The Company reaffirmed its FY 14 guidance across the board, with
revenue of $1.10 billion – $1.16 billion, Xyrem sales of $755 – $755 million, Erwinaze
at $185 – $200 million, Defitelio at $42 – $52 million, and Adjusted Net Income of
$496 – $520 million. While we believe the market has already priced in these
expectations, we continue to see Jazz as an undervalued, high-growth story going
forward, and we believe that its longer-term revenue, EPS, and EBITDA are likely to
exceed consensus estimates in FY 15, FY 16, and beyond.
■ Catalysts: 1) Possible price increases for Xyrem – Given the historical price increases
and the price ranges for comparable orphan drugs, we believe the company is likely
to announce another round of price increases at the end of FY 14 or early FY 15, and
that there is significant room to grow pricing beyond the current levels. 2) Launch of
new marketing campaigns for Xyrem – Jazz management is in the process of
launching awareness campaigns for narcolepsy patients in key geographies, and has
already reported 11,400 Xyrem patients in Q1, above our FY 14 estimate of ~11,300.
3) Settlement of Roxane lawsuit – We believe this will be decided in Jazz’s favor,
resulting in a delayed entrance for Xyrem generics. Current market expectations
point to generics in FY 19 or FY 20, but we believe FY 21 is more likely (with peak
sales of ~$3.0 billion in FY 20).
■ Our $170.00 target price is based on an FY 14 EV / EBITDA multiple of 20.7x and an FY 15
EV / EBITDA multiple of 15.3x, vs. median peer company multiples of 21.8x and 15.3x,
respectively. Given Jazz’s higher revenue growth, margins, and EBITDA growth, we
believe this is still quite conservative. A DCF analysis with our long-term FCF projections,
a discount rate of 8.07%, and a Terminal FCF growth rate of 0.3% also produces an
implied share price of $168.71."""

key_word = extract_keywords(sentence)

#df['key_phrases'] = df['Commentary'].apply(lambda x: extract_keywords(x))

In [29]:
print(key_word)

[('jazz higher revenue', 0.6734), ('given jazz higher revenue', 0.6675), ('jazz higher revenue growth', 0.6627), ('jazz pharmaceuticals reported strong', 0.6286), ('q1 results jazz pharmaceuticals', 0.6082)]


In [22]:
df

Unnamed: 0,Company,Short Name,Date,Target Price,Recommendation,Commentary,key_phrases
0,Apple Inc.,AAPL,2023-01-15,121.35,Hold,"Despite promising growth in cost management, c...","[(hold aapl, 0.6588), (aapl company, 0.6231), ..."
1,Apple Inc.,AAPL,2023-02-15,371.59,Buy,AAPL is benefiting from investment in research...,"[(aapl benefiting, 0.7087), (aapl, 0.6653), (t..."
2,Apple Inc.,AAPL,2023-03-15,202.52,Hold,AAPL's performance in technology has been stea...,"[(aapl performance, 0.7467), (aapl, 0.6416), (..."
3,Apple Inc.,AAPL,2023-04-15,355.88,Hold,AAPL faces uncertainties in shift in consumer ...,"[(aapl, 0.6197), (target price, 0.5814), (aapl..."
4,Apple Inc.,AAPL,2023-05-15,292.45,Sell,AAPL's challenges in expansion into internatio...,"[(aapl challenges, 0.6315), (aapl, 0.6295), (s..."
...,...,...,...,...,...,...,...
1231,Exelon Corporation,EXC,2023-08-15,242.37,Sell,With ongoing issues like regulatory pressure a...,"[(exc expected, 0.642), (exc, 0.6023), (regula..."
1232,Exelon Corporation,EXC,2023-09-15,248.48,Buy,EXC's continued growth in environmental regula...,"[(exc, 0.5678), (exc continued, 0.5392), (targ..."
1233,Exelon Corporation,EXC,2023-10-15,148.36,Hold,EXC faces uncertainties in shift in consumer p...,"[(exc, 0.5656), (target price, 0.5224), (prosp..."
1234,Exelon Corporation,EXC,2023-11-15,143.41,Buy,EXC is benefiting from labor shortages and str...,"[(exc benefiting, 0.6685), (exc, 0.6284), (tar..."
