In [1]:
import os
import pandas as pd
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch
import hopsworks
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer published under the 'Xenova/text-embedding-ada-002' checkpoint.
# The checkpoint declares its tokenizer class as 'GPT4Tokenizer', so loading it through
# GPT2TokenizerFast emits the class-mismatch warning shown in this cell's output.
tokenizer = GPT2TokenizerFast.from_pretrained('Xenova/text-embedding-ada-002')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT4Tokenizer'. 
The class this function is called from is 'GPT2TokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Functions

In [3]:
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path (or any object accepted by ``pd.read_csv``) of the CSV to load.

    Returns:
        The parsed DataFrame, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_path)

def get_embedding(dataset, embedding_object):
    """Encode the ``text`` column and return a frame holding the encodings instead.

    Args:
        dataset: DataFrame with a ``text`` column; it is not modified.
        embedding_object: Any object exposing ``encode(text)``.

    Returns:
        A new DataFrame with the same rows and remaining columns as ``dataset``,
        where ``text`` has been removed and an ``embeddings`` column holds the
        result of ``embedding_object.encode`` for each row.

    NOTE(review): this notebook passes a tokenizer as ``embedding_object``, so the
    stored values are lists of token ids rather than dense embedding vectors.
    """
    # Encode row by row, preserving the order of the input frame.
    encoded_rows = [embedding_object.encode(text) for text in dataset["text"]]

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    return dataset.drop(columns=["text"]).assign(embeddings=encoded_rows)

In [21]:
# All raw CSV inputs live under one folder; the two Twitter splits share a sub-folder.
base_data_dir = "base-data"
twitter_sentiment_dir = os.path.join(base_data_dir, "twitter-financial-news-sentiment")

# FinancialPhraseBank sentences plus the Twitter financial-news train/test splits.
financial_phrase_bank_df = load_data(
    os.path.join(base_data_dir, "FinancialPhraseBank", "all-data-75-above.csv")
)
zeroshot_train_df = load_data(os.path.join(twitter_sentiment_dir, "sent_train.csv"))
zeroshot_test_df = load_data(os.path.join(twitter_sentiment_dir, "sent_test.csv"))

In [28]:
# Display the FinancialPhraseBank frame exactly as loaded from the CSV.
financial_phrase_bank_df

Unnamed: 0,Text,Label
0,"According to Gran , the company has no plans t...",2
1,With the new production plant the company woul...,1
2,"For the last quarter of 2010 , Componenta 's n...",1
3,"In the third quarter of 2010 , net sales incre...",1
4,Operating profit rose to EUR 13.1 mn from EUR ...,1
...,...,...
3448,Operating result for the 12-month period decre...,0
3449,HELSINKI Thomson Financial - Shares in Cargote...,0
3450,LONDON MarketWatch -- Share prices ended lower...,0
3451,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [36]:
# Create a new DataFrame whose column names match the lowercase `text` / `label`
# used by the two Twitter frames, so the frames line up when concatenated.
# Both columns are renamed in ONE call: issuing two separate renames that each start
# from `financial_phrase_bank_df` would throw away the first rename and leave `Text` as is.
financial_phrase_bank_df1 = financial_phrase_bank_df.rename(columns={'Text': 'text', 'Label': 'label'})
financial_phrase_bank_df1



Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",2
1,With the new production plant the company woul...,1
2,"For the last quarter of 2010 , Componenta 's n...",1
3,"In the third quarter of 2010 , net sales incre...",1
4,Operating profit rose to EUR 13.1 mn from EUR ...,1
...,...,...
3448,Operating result for the 12-month period decre...,0
3449,HELSINKI Thomson Financial - Shares in Cargote...,0
3450,LONDON MarketWatch -- Share prices ended lower...,0
3451,Operating profit fell to EUR 35.4 mn from EUR ...,0


In [24]:
# Display the Twitter financial-news sentiment training frame (loaded from sent_train.csv).
zeroshot_train_df

Unnamed: 0,text,label
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0
...,...,...
9538,The Week's Gainers and Losers on the Stoxx Eur...,2
9539,Tupperware Brands among consumer gainers; Unil...,2
9540,vTv Therapeutics leads healthcare gainers; Myo...,2
9541,"WORK, XPO, PYX and AMKR among after hour movers",2


In [25]:
# Display the Twitter financial-news sentiment test frame (loaded from sent_test.csv).
zeroshot_test_df

Unnamed: 0,text,label
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0
...,...,...
2383,Stocks making the biggest moves midday: TD Ame...,2
2384,Stocks making the biggest moves premarket: Fit...,2
2385,Stocks making the biggest moves premarket: Hom...,2
2386,Stocks making the biggest moves premarket: TD ...,2


In [59]:

# Stack the three sources (FinancialPhraseBank, Twitter train, Twitter test) into a single
# frame; ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
df = pd.concat([financial_phrase_bank_df1, zeroshot_train_df, zeroshot_test_df], ignore_index=True)

# Get the count of each label
label_counts = df['label'].value_counts()

# Fixed seed so the split (and therefore the data uploaded from it) is identical on every
# run; without it train_test_split shuffles differently each time the cell is executed.
SPLIT_SEED = 42

# Stratified 80/20 split: stratify keeps the label proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=SPLIT_SEED,
)

# Now, X_train and y_train contain the training data and their corresponding labels
# X_test and y_test contain the test data and their corresponding labels

## Sanity check of label distribution

In [60]:
# Wrap each label Series in a single-column frame and count the occurrences of every label.
y_train_df = y_train.to_frame()
y_test_df = y_test.to_frame()
train_value_counts = y_train_df.value_counts()
test_value_counts = y_test_df.value_counts()

# Print the share of each label, training split first and test split second;
# the two distributions should be nearly identical after a stratified split.
for split_counts in (train_value_counts, test_value_counts):
    print(split_counts.div(split_counts.sum()))


label
2        0.642886
1        0.213537
0        0.143577
Name: count, dtype: float64
label
2        0.642834
1        0.213520
0        0.143646
Name: count, dtype: float64


In [61]:
# Put texts and labels back side by side, one frame per split (train first, then test).
train_dataset_df, test_dataset_df = (
    pd.concat(text_and_label, axis="columns")
    for text_and_label in ((X_train, y_train), (X_test, y_test))
)

# Replace the raw text of each split with the tokenizer's encoding of it.
train_dataset_df_embedded, test_dataset_df_embedded = (
    get_embedding(split_df, tokenizer)
    for split_df in (train_dataset_df, test_dataset_df)
)

train_dataset_df_embedded

Unnamed: 0,label,embeddings
10731,2,"[43, 309, 311, 5666, 43526, 11, 5734, 64239, 3..."
13278,2,"[24055, 1854, 53133, 753, 38493, 2057, 3580, 7..."
516,1,"[34160, 311, 11295, 14800, 1174, 279, 3157, 44..."
7939,2,"[3915, 49884, 311, 16759, 11, 5220, 323, 9919,..."
6733,0,"[33, 2152, 41707, 43394, 555, 400, 15, 13, 172..."
...,...,...
4998,2,"[8586, 596, 2650, 16795, 6972, 82, 220, 1591, ..."
5267,2,"[697, 3141, 564, 29782, 6136, 6108, 9327, 449,..."
5967,2,"[3, 17001, 48230, 482, 42828, 30981, 5856, 11,..."
8825,2,"[25554, 3109, 38239, 38074, 12, 777, 63052, 71..."


In [62]:
# Open a Hopsworks session and fetch the project's feature store handle.
# login() is called without arguments, so no credentials are written into the notebook.
hopsworks_project = hopsworks.login() 
fs = hopsworks_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/546965
Connected. Call `.close()` to terminate connection gracefully.


In [63]:
# Fetch (or create on first run) version 1 of the training feature group, then upload
# the encoded training split into it.
# NOTE(review): the primary key includes the list-valued `embeddings` column — confirm
# that this is the intended key for the feature group.
fg_train = fs.get_or_create_feature_group(
    name="news_sentiment_traindata",
    version=1,
    description="Training data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)
fg_train.insert(train_dataset_df_embedded)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/546965/fs/542788/fg/605557


Uploading Dataframe: 100.00% |██████████| Rows 12307/12307 | Elapsed Time: 00:17 | Remaining Time: 00:00


Launching job: news_sentiment_traindata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/546965/jobs/named/news_sentiment_traindata_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x28746ebfdf0>, None)

In [64]:
# Fetch (or create on first run) version 1 of the test feature group, then upload
# the encoded test split into it.
# NOTE(review): the primary key includes the list-valued `embeddings` column — confirm
# that this is the intended key for the feature group.
fg_test = fs.get_or_create_feature_group(
    name="news_sentiment_testdata",
    version=1,
    description="Test data and labels for financial news sentiment prediction model",
    primary_key=["label", "embeddings"],
    online_enabled=True,
)
fg_test.insert(test_dataset_df_embedded)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/546965/fs/542788/fg/606573


Uploading Dataframe: 100.00% |██████████| Rows 3077/3077 | Elapsed Time: 00:13 | Remaining Time: 00:00


Launching job: news_sentiment_testdata_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/546965/jobs/named/news_sentiment_testdata_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x28746ecec70>, None)