In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from uuid import uuid4

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
folder = os.path.expanduser('~/Desktop/Machine Learning/data_ML_Project')
file_path = os.path.join(folder, 'earnings_calls.parquet')  # No extension

if os.path.exists(file_path):
    data = pd.read_parquet(file_path)
else:
    print("File still not found!")

In [3]:
data.head()

Unnamed: 0,transcriptid,componentorder,componenttext,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename
0,3285682,0,"Good morning, everyone, and welcome to the LXI...",2020-11-23,323562,LXI REIT plc,Presentation Operator Message
1,50630,5,"Thanks, Bob. In summary 2009 was a pivotal yea...",2010-02-18,63083,Endologix LLC,Presenter Speech
2,840499,103,"Just one thing left to ask, actually. If we ta...",2015-07-28,2410,BP p.l.c.,Question
3,47300,58,"I said, I think it certainly has been a rare o...",2010-01-27,9317,SEI Investments Company,Answer
4,49778,1,<strong>Operator</strong>\nLadies and gentleme...,2010-02-12,176660,3SBio Inc.,Presentation Section


In [4]:
data.sort_values(by='mostimportantdateutc',inplace=True)

data.sort_values(by=['mostimportantdateutc', 'componentorder'], ascending=[False, True],inplace=True)

data

Unnamed: 0,transcriptid,componentorder,componenttext,mostimportantdateutc,gvkey,companyname,transcriptcomponenttypename
88038,2164922,0,"Ladies and gentlemen, thank you for standing b...",2020-12-30,161925,China Finance Online Co. Limited,Presentation Operator Message
958919,2164922,1,"Thank you. Thank you, operator. Welcome to Chi...",2020-12-30,161925,China Finance Online Co. Limited,Presenter Speech
958914,2164922,2,[Foreign Language],2020-12-30,161925,China Finance Online Co. Limited,Presenter Speech
958909,2164922,3,"Good morning, and good evening. Thank you for ...",2020-12-30,161925,China Finance Online Co. Limited,Presenter Speech
958904,2164922,4,[Foreign Language],2020-12-30,161925,China Finance Online Co. Limited,Presenter Speech
...,...,...,...,...,...,...,...
4581352,46682,84,And then I did want to ask Tom about Cisco. Wh...,2010-01-04,030247,Viasystems Corporation,Question
41828,46682,85,We don't typically comment on any specific cus...,2010-01-04,030247,Viasystems Corporation,Answer
4020336,46682,86,"And Mr. Burger, we have no further questions.",2010-01-04,030247,Viasystems Corporation,Question and Answer Operator Message
41822,46682,87,"Well, thank you, everyone for attending, we ap...",2010-01-04,030247,Viasystems Corporation,Answer


In [5]:
# Group and merge componenttext by transcriptid
grouped_data = data.groupby('transcriptid')['componenttext'].apply(lambda texts: ' '.join(texts)).reset_index()

# Rename the column properly on the new DataFrame
grouped_data.rename(columns={'componenttext': 'full_transcript'}, inplace=True)


In [6]:
grouped_data

Unnamed: 0,transcriptid,full_transcript
0,15674,"Good morning, and welcome to Pactiv’s fourth q..."
1,45053,"Ladies and gentlemen, thank you for standing b..."
2,45107,"Ladies and gentlemen, thank you for standing b..."
3,45200,"Good morning, my name is Kim, and I will be yo..."
4,45212,"Good morning, ladies and gentlemen, and welcom..."
...,...,...
215308,3328351,Good afternoon. My name is Tom and I will be y...
215309,3328562,"Good morning. My name is Dorothy, and I will b..."
215310,3341730,Greetings and welcome to miRagen Therapeutics ...
215311,3342207,"Good morning, everyone, and welcome to Encompa..."


In [7]:
# Charger le modèle et le tokenizer FinBERT
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")



def finbert_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    sentiment = torch.argmax(probs).item()
    labels = ['negative', 'neutral', 'positive']
    return labels[sentiment], probs.tolist()[0]


In [8]:
#découpe le texte en petit morceau et retourne le sentiment majoritaire du earning call
def finbert_sentiment_long(text, chunk_size=500):
    tokens = tokenizer.tokenize(text)
    chunks = [' '.join(tokens[i:i+chunk_size]) for i in range(0, len(tokens), chunk_size)]
    sentiments = []
    for chunk in chunks:
        chunk_text = tokenizer.convert_tokens_to_string(chunk.split())
        label, _ = finbert_sentiment(chunk_text)
        sentiments.append(label)
    # Retourner le sentiment majoritaire ou une agrégation
    sentiment = max(set(sentiments), key=sentiments.count)
    return sentiment, sentiments

subset = grouped_data.head(10).copy()

subset['sentiment'], subset['all_chunk_sentiments'] = zip(*subset['full_transcript'].apply(finbert_sentiment_long))

print(subset[['transcriptid', 'sentiment', 'all_chunk_sentiments']])


#DATA COMPLET
#grouped_data['sentiment'], grouped_data['all_chunk_sentiments'] = zip(*grouped_data['full_transcript'].apply(finbert_sentiment_long))


   transcriptid sentiment                               all_chunk_sentiments
0         15674  negative  [neutral, neutral, neutral, neutral, negative,...
1         45053  negative  [negative, neutral, neutral, neutral, negative...
2         45107   neutral  [negative, neutral, neutral, neutral, neutral,...
3         45200   neutral  [negative, neutral, neutral, neutral, neutral,...
4         45212  negative  [negative, positive, positive, neutral, neutra...
5         45220   neutral  [neutral, neutral, neutral, neutral, negative,...
6         45248  negative  [negative, neutral, negative, neutral, neutral...
7         45259   neutral  [negative, positive, neutral, neutral, neutral...
8         45284  negative  [negative, neutral, neutral, neutral, positive...
9         45291  negative  [negative, neutral, neutral, negative, negativ...
