In [1]:
import pandas as pd
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from  sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
from datasets import load_dataset
ds = load_dataset("zeroshot/twitter-financial-news-sentiment")
ds['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 9543
})

In [3]:
df_train = pd.DataFrame(ds['train'])
df_test = pd.DataFrame(ds['validation'])

In [6]:
# Naive bayes classifier

X_train = df_train['text']
y_train = df_train['label']
X_test = df_test['text']
y_test = df_test['label']

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


gnb = GaussianNB()
gnb.fit(X_train.toarray(), y_train)

test_tweet = "Oppenheimer cuts estimates on Yum China"
print(gnb.predict(vectorizer.transform([test_tweet]).toarray()))
predicted = gnb.predict(X_test.toarray())
print("Naive Bayes")
print(classification_report(y_test, predicted))


[0]
Naive Bayes
              precision    recall  f1-score   support

           0       0.33      0.58      0.42       347
           1       0.46      0.41      0.43       475
           2       0.79      0.68      0.73      1566

    accuracy                           0.61      2388
   macro avg       0.53      0.56      0.53      2388
weighted avg       0.66      0.61      0.63      2388



In [7]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)

test_tweet = "$BTC is going to the moon"
print(lr.predict(vectorizer.transform([test_tweet]).toarray()))

print("Logistic Regression")
print(classification_report(y_test, predicted))


[2]
Logistic Regression
              precision    recall  f1-score   support

           0       0.73      0.58      0.65       347
           1       0.78      0.66      0.72       475
           2       0.86      0.93      0.89      1566

    accuracy                           0.83      2388
   macro avg       0.79      0.73      0.75      2388
weighted avg       0.82      0.83      0.82      2388



In [8]:
df = pd.read_parquet("hf://datasets/TimKoornstra/financial-tweets-sentiment/data/train-00000-of-00001.parquet")

In [64]:
df.head()

Unnamed: 0,tweet,sentiment,url
0,$BYND - JPMorgan reels in expectations on Beyo...,2,https://huggingface.co/datasets/zeroshot/twitt...
1,$CCL $RCL - Nomura points to bookings weakness...,2,https://huggingface.co/datasets/zeroshot/twitt...
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",2,https://huggingface.co/datasets/zeroshot/twitt...
3,$ESS: BTIG Research cuts to Neutral https://t....,2,https://huggingface.co/datasets/zeroshot/twitt...
4,$FNKO - Funko slides after Piper Jaffray PT cu...,2,https://huggingface.co/datasets/zeroshot/twitt...


In [9]:
if 'url' in df:
    df.drop(columns=['url'])
df.rename(columns={'sentiment': 'label'}, inplace=True)
df.rename(columns={'tweet': 'text'}, inplace=True)
map_labels = {0: "neutral", 1: 'positive',  2: 'negative'}
df['label'] = df['label'].apply(lambda x: map_labels[x])
df.head()

Unnamed: 0,text,label,url
0,$BYND - JPMorgan reels in expectations on Beyo...,negative,https://huggingface.co/datasets/zeroshot/twitt...
1,$CCL $RCL - Nomura points to bookings weakness...,negative,https://huggingface.co/datasets/zeroshot/twitt...
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",negative,https://huggingface.co/datasets/zeroshot/twitt...
3,$ESS: BTIG Research cuts to Neutral https://t....,negative,https://huggingface.co/datasets/zeroshot/twitt...
4,$FNKO - Funko slides after Piper Jaffray PT cu...,negative,https://huggingface.co/datasets/zeroshot/twitt...


In [10]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
#df_test = pd.DataFrame(df_no_url['validation'])

X_train = df_train['text']
y_train = df_train['label']
X_test = df_test['text']
y_test = df_test['label']

vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [11]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
predicted = lr.predict(X_test)

test_tweet = ""
print(lr.predict(vectorizer.transform([test_tweet]).toarray()))

print("Logistic Regression")
print(classification_report(y_test, predicted))

['neutral']
Logistic Regression
              precision    recall  f1-score   support

    negative       0.73      0.62      0.67       895
     neutral       0.74      0.74      0.74      1225
    positive       0.76      0.82      0.79      1690

    accuracy                           0.75      3810
   macro avg       0.74      0.73      0.73      3810
weighted avg       0.75      0.75      0.74      3810



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
import time
import re
from tqdm.notebook import tqdm

# Create a list to store interactions data
interactions = []

# Sample a limited number of tweets to avoid excessive API calls
sample_size = min(100, len(df))
sample_df = df.sample(sample_size, random_state=42)

for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    tweet_text = row['text']
    
    try:
        # Extract meaningful keywords for search
        cleaned_text = re.sub(r'http\S+|@\S+|#\S+', '', tweet_text)
        # Get significant words for search
        keywords = [word for word in cleaned_text.split() if len(word) >= 4]
        search_query = ' '.join(keywords[:5]) if keywords else tweet_text[:100]
        
        # Search for the tweet
        search_results = client.search_recent_tweets(
            query=search_query,
            max_results=5,
            tweet_fields=['public_metrics', 'created_at']
        )
        
        if hasattr(search_results, 'data') and search_results.data:
            tweet = search_results.data[0]
            
            # Store tweet data and metrics
            metrics = {
                'tweet_id': tweet.id,
                'created_at': tweet.created_at,
                'retrieved_text': tweet.text,
                'original_text': tweet_text,
                'sentiment': row['label'],
                'found': True
            }
            
            # Get interaction metrics
            if hasattr(tweet, 'public_metrics'):
                metrics.update({
                    'like_count': tweet.public_metrics.get('like_count', 0),
                    'retweet_count': tweet.public_metrics.get('retweet_count', 0),
                    'reply_count': tweet.public_metrics.get('reply_count', 0),
                    'quote_count': tweet.public_metrics.get('quote_count', 0)
                })
            
            interactions.append(metrics)
        else:
            # Tweet not found case
            interactions.append({
                'tweet_id': None,
                'original_text': tweet_text,
                'sentiment': row['label'],
                'found': False
            })
    except Exception as e:
        print(f"Error processing tweet {idx}: {e}")
        interactions.append({
            'tweet_id': None,
            'original_text': tweet_text,
            'sentiment': row['label'],
            'error': str(e),
            'found': False
        })
    
    # Respect API rate limits
    time.sleep(2)

# Create dataframe with interactions data
interactions_df = pd.DataFrame(interactions)
print(f"Found {interactions_df['found'].sum()} tweets out of {sample_size} searched")
interactions_df.head()

  0%|          | 0/100 [00:00<?, ?it/s]

Error processing tweet 30503: name 'client' is not defined
Error processing tweet 15699: name 'client' is not defined
Error processing tweet 17258: name 'client' is not defined
Error processing tweet 12426: name 'client' is not defined
Error processing tweet 10063: name 'client' is not defined
Error processing tweet 28445: name 'client' is not defined
Error processing tweet 2116: name 'client' is not defined
Error processing tweet 26743: name 'client' is not defined
Error processing tweet 35316: name 'client' is not defined
Error processing tweet 25024: name 'client' is not defined
Error processing tweet 35482: name 'client' is not defined
Error processing tweet 35033: name 'client' is not defined
Error processing tweet 28685: name 'client' is not defined
Error processing tweet 4943: name 'client' is not defined
Error processing tweet 273: name 'client' is not defined
Error processing tweet 5640: name 'client' is not defined
Error processing tweet 198: name 'client' is not defined
Erro

Unnamed: 0,tweet_id,original_text,sentiment,error,found
0,,RT @OTC_ARMY: #PressRealease https://t.co/7w6F...,neutral,name 'client' is not defined,False
1,,"Green Daily Triangle on TC,.....Open ong at 3.37",positive,name 'client' is not defined,False
2,,The U.S.â€™s national medical stockpile has se...,positive,name 'client' is not defined,False
3,,AAP ... I don't see any more than 1 up 3min ca...,negative,name 'client' is not defined,False
4,,AT&T takes on $5.5 billion loan to boost 'fina...,neutral,name 'client' is not defined,False
