In [1]:
from ntscraper import Nitter
import json
import os
import pandas as pd

# Roberta Pretrained Model
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

#natural language toolkit
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import psycopg2

plt.style.use('ggplot')

# picking a specific model that has been pretrained on data for sentiment analysis
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment" # Define the pre-trained modelt rained specifically on Twitter data
tokenizer = AutoTokenizer.from_pretrained(MODEL) # Load the tokenizer from the pre-trained model to convert text into tokens suitable for the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL) # Load the pre-trained sentiment classification model that will be used to classify text into different sentiment categories

def refresh_twt_aggregated():
    conn = psycopg2.connect("dbname=twt_snt user=postgres password=Ilpmnl!69gg")
    cur = conn.cursor()
    cur.execute("REFRESH MATERIALIZED VIEW twt_aggregated_tbl;")
    conn.commit()
    cur.close()
    conn.close()

# create a function to run our Roberta model on the entire dataset
def polarity_scores_roberta(example):
    # Encode the input text into the appropriate format for the model (PyTorch tensors)
    encoded_text = tokenizer(example, return_tensors='pt')
    # Pass the encoded text through the Roberta model to get sentiment logits (raw model outputs)
    output = model(**encoded_text)
    # Extract the output tensor and convert it to a NumPy array for further processing
    scores = output[0][0].detach().numpy()
    # Apply the softmax function to get normalized probabilities for negative, neutral, and positive sentiments
    # Using probabilities is important for interpretation. The softmax output tells you how likely the model thinks the input text belongs to each of the sentiment categories
    scores = softmax(scores)
    # Create a dictionary to hold the sentiment scores: negative, neutral, and positive
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    
    return scores_dict

# create a function to address funky date format of import tweets
def clean_date(date_str):
    try:
        # Clean up the date string
        cleaned_date_str = date_str[:-4].replace(' · ', ' ')
        # Use pandas to parse the date and convert to a standard format
        dt = pd.to_datetime(cleaned_date_str, format='%b %d, %Y %I:%M %p')
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except Exception as e:
        print(f"Date parsing error for {date_str}: {e}")
        return None
    

        
def safe_value(value):
    if isinstance(value, dict):
        return json.dumps(value)
    elif isinstance(value, str):
        try:
            # Attempt to load as JSON to ensure it's valid
            json.loads(value)
            return value
        except json.JSONDecodeError:
            # Return the raw string if it's not valid JSON
            return None
    elif isinstance(value, pd.Timestamp):  # Convert pandas datetime to string
        return value.strftime('%Y-%m-%d') if not pd.isnull(value) else None
    elif value is None:  # Explicitly handle None values
        return None
    else:
        return None if pd.isnull(value) else value

        
# Function to capture current time
def print_current_time():
    now = datetime.datetime.now()
    print(f"Current Time: {now.strftime('%Y-%m-%d %H:%M:%S')}")

07-Jun-25 17:47:58 - NumExpr defaulting to 4 threads.




In [2]:
import tweepy
import pandas as pd

# Twitter API credentials
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAJ%2BFvAEAAAAATIyA4cKQKD76g8ve04n4J5Igono%3Dlak30GBoA3qa0ZH8rYLF879i6N2W11OX8fqGxRGFRfRRtdjy0s'  # Replace with your actual bearer token

# Initialize the Tweepy client
client = tweepy.Client(bearer_token=bearer_token)

# List of terms to search for (including top 10 cryptocurrencies and top 10 stocks by volume)
search_terms = [
    
    'SOL', 'KAS', 'LINK', 'ADA', 'MATIC', 'AVAX', 'DOGE', 'BTC', 'ETH', 'POPCAT', 'SUI', 'HNT', 'WIF'
#         'GOOGL','TSLA', 'TSMC', 'CVX', 'COIN', 'NFLX', 'DIS', 'AMZN', 'MSFT', 'AAPL', 'GME', 'NVDA', 'JPM'#  Top stocks by volume plus my interests

]



# Prepare an empty DataFrame to collect all tweets
all_tweets_df = pd.DataFrame()

# Iterate over each term
for term in search_terms:
    query = f'#{term} -is:retweet'  # Exclude retweets with '-is:retweet'

    try:
        # Retrieve tweets
        tweets = client.search_recent_tweets(query=query, max_results=10, tweet_fields=['created_at', 'public_metrics', 'entities', 'referenced_tweets', 'geo', 'lang', 'source'])

        if tweets.data is None:
            print(f"No tweets found for term: {term}")
        else:
            # Extract tweets data into a list of dictionaries
            tweets_data = []
            for tweet in tweets.data:
                created_at = tweet.created_at
                tweet_info = {
                    'term': term,
                    'web_link': f"https://twitter.com/twitter/status/{tweet.id}",
                    'twt_text': tweet.text,
                    'user_name': tweet.author_id,
                    'username': tweet.author_id,
                    'date': created_at.date(),
                    'time': created_at.strftime('%H:%M:%S'),
                    'retweets': tweet.public_metrics.get('retweet_count', 0),
                    'quotes': tweet.public_metrics.get('quote_count', 0),
                    'likes': tweet.public_metrics.get('like_count', 0),
                    'hashtags': ','.join([hashtag['tag'] for hashtag in tweet.entities.get('hashtags', [])]) if tweet.entities else '',
                    'mentions': ','.join([mention['username'] for mention in tweet.entities.get('mentions', [])]) if tweet.entities else '',
                    'urls': ','.join([url['expanded_url'] for url in tweet.entities.get('urls', [])]) if tweet.entities else '',
                    'reply_to_tweet_id': tweet.referenced_tweets[0]['id'] if tweet.referenced_tweets and tweet.referenced_tweets[0]['type'] == 'replied_to' else None,
                    'reply_to_username': None,  # Placeholder for reply_to_username
                    'location': tweet.geo['place_id'] if tweet.geo else None,
                    'twt_language': tweet.lang,
                    'twt_source': tweet.source,
                }
                tweets_data.append(tweet_info)

            # Convert list of dictionaries to DataFrame
            term_df = pd.DataFrame(tweets_data)

            # Append to the main DataFrame
            all_tweets_df = pd.concat([all_tweets_df, term_df], ignore_index=True)

    except tweepy.TweepyException as e:
        print(f"An error occurred while retrieving tweets for term {term}: {e}")

# After collecting all tweets, assign it to raw_df
raw_df = all_tweets_df

# Display the collected DataFrame
# print(raw_df)


In [3]:
print(raw_df)

    term                                           web_link  \
0    SOL  https://twitter.com/twitter/status/19315136381...   
1    SOL  https://twitter.com/twitter/status/19315136323...   
2    SOL  https://twitter.com/twitter/status/19315135544...   
3    SOL  https://twitter.com/twitter/status/19315135492...   
4    SOL  https://twitter.com/twitter/status/19315135449...   
..   ...                                                ...   
125  WIF  https://twitter.com/twitter/status/19315027985...   
126  WIF  https://twitter.com/twitter/status/19315027540...   
127  WIF  https://twitter.com/twitter/status/19314909096...   
128  WIF  https://twitter.com/twitter/status/19314875858...   
129  WIF  https://twitter.com/twitter/status/19314816486...   

                                              twt_text user_name username  \
0            🔗 https://t.co/ahhHVNi0Hd\n#SOL #Outlight      None     None   
1    @RoundtableSpace Happy SUNDAY! ☀️\n\nTime to r...      None     None   
2    #Trump 

#### Sentiment Analyzers

In [4]:
import datetime

# Initialize a SentimentIntensityAnalyzer instance for VADER sentiment analysis
sia = SentimentIntensityAnalyzer()

# Create an empty dictionary to store sentiment results
res = {}

# Iterate over each row in the raw DataFrame using tqdm to provide a progress bar
for i, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    try:
        # Extract the text and unique identifier from the current row
        text = row['twt_text']
        myid = row['web_link']
        
        # Check if the tweet ID (web_link) has already been processed
        # If it exists in `res`, skip to the next iteration to avoid duplicates
        if myid in res:
            continue
        
        # Calculate VADER sentiment scores for the text
        vader_result = sia.polarity_scores(text)
        
        # Rename the keys in VADER results to avoid naming conflicts (e.g., 'vader_neg', 'vader_neu', 'vader_pos')
        vader_result_rename = {f"vader_{key}": value for key, value in vader_result.items()}
        
        # Get sentiment scores using Roberta model
        roberta_result = polarity_scores_roberta(text)
        
        # Combine both VADER and Roberta sentiment results into a single dictionary
        both = {**vader_result_rename, **roberta_result}
        
        # Store the combined results in `res` dictionary using weblink `myid`, as the key
        res[myid] = both
    except RuntimeError:
        # If a RuntimeError occurs (e.g., due to an error in text encoding or other issues), print the ID that caused the problem
        print(f'Broke for id {myid}')

# End of sentiment analysis
print("Sentiment analysis completed.")

# Print the current time to show when the analysis was completed
print_current_time()


  0%|          | 0/130 [00:00<?, ?it/s]

Sentiment analysis completed.
Current Time: 2025-06-07 18:40:24


In [5]:
# Append results of sentiment analysis to create a new DataFrame
# Convert the dictionary `res` (which contains sentiment analysis results) into a DataFrame and transpose it
results_df = pd.DataFrame(res).T
results_df  # Display the resulting DataFrame

# Reset the index of `results_df` and rename the index column to 'web_link'
# This helps to ensure that 'web_link' is treated as a regular column rather than an index
results_df = results_df.reset_index().rename(columns={'index': 'web_link'})
results_df  # Display the updated DataFrame

# Merge the sentiment results (`results_df`) with the original raw DataFrame (`raw_df`)
# The merge is done using a 'right' join, meaning all rows from `raw_df` will be retained,
# and sentiment results will be appended where a match is found by the 'web_link' key
df_submit = results_df.merge(raw_df, how='right')



# results_df.head()
# res
print(df_submit.tail()) 
print(df_submit['date'].max())

                                              web_link  vader_neg  vader_neu  \
125  https://twitter.com/twitter/status/19315027985...      0.070      0.887   
126  https://twitter.com/twitter/status/19315027540...      0.260      0.636   
127  https://twitter.com/twitter/status/19314909096...      0.000      1.000   
128  https://twitter.com/twitter/status/19314875858...      0.036      0.964   
129  https://twitter.com/twitter/status/19314816486...      0.000      1.000   

     vader_pos  vader_compound  roberta_neg  roberta_neu  roberta_pos term  \
125      0.043         -0.2789     0.012002     0.699183     0.288815  WIF   
126      0.104         -0.7506     0.160476     0.729054     0.110470  WIF   
127      0.000          0.0000     0.078001     0.858396     0.063602  WIF   
128      0.000         -0.1027     0.182382     0.639415     0.178203  WIF   
129      0.000          0.0000     0.020538     0.899631     0.079831  WIF   

                                              twt_

#### Insert into DB

In [6]:
import psycopg2
import pandas as pd
import json
from datetime import datetime

# Database connection parameters
db_params = {
    'dbname': 'twt_snt',
    'user': 'postgres',
    'password': 'Ilpmnl!69gg',
    'host': 'localhost',
    'port': '5432'
}

# Function to ensure data types are compatible with SQL
def safe_value(value):
    if isinstance(value, str):
        return value
    elif isinstance(value, pd.Timestamp):  # Convert pandas datetime to string
        return value.strftime('%Y-%m-%d') if not pd.isnull(value) else None
    elif value is None:  # Explicitly handle None values
        return None
    else:
        return None if pd.isnull(value) else value

# Create a connection to the database
conn = psycopg2.connect(**db_params)
cursor = conn.cursor()

# Define the insert query with ON CONFLICT DO NOTHING
insert_query = """
    INSERT INTO twt_tbl (web_link, term, twt_text, user_name, username, date, time, retweets, quotes, likes, hashtags, mentions, urls, reply_to_tweet_id, reply_to_username, location, twt_language, twt_source, vader_neg, vader_neu, vader_pos, vader_compound, roberta_neg, roberta_neu, roberta_pos)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (web_link) DO NOTHING
"""

batch_size = 40
batch_values = []

try:
    df_submit['date'] = pd.to_datetime(df_submit['date']).dt.date  # Ensure date format

    for index, row in df_submit.iterrows():
        # Check if twt_text length exceeds the limit
        if len(row['twt_text']) > 1001:
            print(f"Skipping row {index} due to twt_text length: {len(row['twt_text'])}")
            continue

        # Prepare values for insertion
        values = (
            safe_value(row['web_link']),
            safe_value(row['term']),
            safe_value(row['twt_text']),
            safe_value(row['user_name']),
            safe_value(row['username']),
            safe_value(row['date']),
            safe_value(row['time']),
            safe_value(row['retweets']),
            safe_value(row['quotes']),
            safe_value(row['likes']),
            safe_value(row['hashtags']),
            safe_value(row['mentions']),
            safe_value(row['urls']),
            safe_value(row['reply_to_tweet_id']) if not pd.isna(row['reply_to_tweet_id']) else None,
            safe_value(row['reply_to_username']),
            safe_value(row['location']),
            safe_value(row['twt_language']),
            safe_value(row['twt_source']),
            safe_value(row['vader_neg']),
            safe_value(row['vader_neu']),
            safe_value(row['vader_pos']),
            safe_value(row['vader_compound']),
            safe_value(row['roberta_neg']),
            safe_value(row['roberta_neu']),
            safe_value(row['roberta_pos']),
        )
        
        batch_values.append(values)

        # Commit in batches
        if len(batch_values) >= batch_size:
            try:
                cursor.executemany(insert_query, batch_values)
                conn.commit()
                print(f"Inserted {len(batch_values)} rows at {datetime.now()}")
                batch_values = []  # Clear the batch list
            except Exception as e:
                print(f"Error inserting batch: {e}")
                conn.rollback()

    # Insert remaining rows
    if batch_values:
        try:
            cursor.executemany(insert_query, batch_values)
            conn.commit()
            print(f"Inserted final batch of {len(batch_values)} rows at {datetime.now()}")
        except Exception as e:
            print(f"Error inserting final batch: {e}")
            conn.rollback()

except Exception as e:
    print(f"Error: {e}")

finally:
    # Close the cursor and connection
    cursor.close()
    conn.close()

    
# Call this function after inserting tweets
refresh_twt_aggregated()


Inserted 40 rows at 2025-06-07 18:40:26.980383
Inserted 40 rows at 2025-06-07 18:40:27.057737
Inserted 40 rows at 2025-06-07 18:40:27.128563
Inserted final batch of 10 rows at 2025-06-07 18:40:27.147214


<details>
<summary>Click to expand username tweet pull</summary>

```python
import tweepy
import pandas as pd

# Twitter API credentials
bearer_token = 'AAAAAAAAAAAAAAAAAAAAAJ%2BFvAEAAAAATIyA4cKQKD76g8ve04n4J5Igono%3Dlak30GBoA3qa0ZH8rYLF879i6N2W11OX8fqGxRGFRfRRtdjy0s'  # Replace with your actual bearer token

# Initialize the Tweepy client
client = tweepy.Client(bearer_token=bearer_token)

# List of usernames to search for (example usernames)
usernames = [
    'elonmusk',  # Replace with desired usernames
    'jack'
]

# Prepare an empty DataFrame to collect all tweets
all_tweets_df = pd.DataFrame()

# Iterate over each username
for username in usernames:
    query = f'from:{username} -is:retweet'  # Exclude retweets with '-is:retweet'

    try:
        # Retrieve tweets
        tweets = client.search_recent_tweets(query=query, max_results=10, tweet_fields=['created_at', 'public_metrics', 'entities', 'referenced_tweets', 'geo', 'lang', 'source'])

        if tweets.data is None:
            print(f"No tweets found for username: {username}")
        else:
            # Extract tweets data into a list of dictionaries
            tweets_data = []
            for tweet in tweets.data:
                created_at = tweet.created_at
                tweet_info = {
                    'username': username,
                    'web_link': f"https://twitter.com/twitter/status/{tweet.id}",
                    'twt_text': tweet.text,
                    'user_name': tweet.author_id,
                    'date': created_at.date(),
                    'time': created_at.strftime('%H:%M:%S'),
                    'retweets': tweet.public_metrics.get('retweet_count', 0),
                    'quotes': tweet.public_metrics.get('quote_count', 0),
                    'likes': tweet.public_metrics.get('like_count', 0),
                    'hashtags': ','.join([hashtag['tag'] for hashtag in tweet.entities.get('hashtags', [])]) if tweet.entities else '',
                    'mentions': ','.join([mention['username'] for mention in tweet.entities.get('mentions', [])]) if tweet.entities else '',
                    'urls': ','.join([url['expanded_url'] for url in tweet.entities.get('urls', [])]) if tweet.entities else '',
                    'reply_to_tweet_id': tweet.referenced_tweets[0]['id'] if tweet.referenced_tweets and tweet.referenced_tweets[0]['type'] == 'replied_to' else None,
                    'reply_to_username': None,  # Placeholder for reply_to_username
                    'location': tweet.geo['place_id'] if tweet.geo else None,
                    'twt_language': tweet.lang,
                    'twt_source': tweet.source,
                }
                tweets_data.append(tweet_info)

            # Convert list of dictionaries to DataFrame
            term_df = pd.DataFrame(tweets_data)

            # Append to the main DataFrame
            all_tweets_df = pd.concat([all_tweets_df, term_df], ignore_index=True)

    except tweepy.TweepyException as e:
        print(f"An error occurred while retrieving tweets for username {username}: {e}")

# After collecting all tweets, assign it to raw_df
raw_df = all_tweets_df

# Display the collected DataFrame
print(raw_df)



In [7]:
#import tweepy
#import pandas as pd
#import time

# Twitter API credentials
# bearer_token = 'AAAAAAAAAAAAAAAAAAAAAJ%2BFvAEAAAAATIyA4cKQKD76g8ve04n4J5Igono%3Dlak30GBoA3qa0ZH8rYLF879i6N2W11OX8fqGxRGFRfRRtdjy0s'  # Replace with your actual bearer token

# Initialize the Tweepy client
# client = tweepy.Client(bearer_token=bearer_token)

# List of terms to search for (including top 10 cryptocurrencies and top 10 stocks by volume)
#search_terms = [
#    'BTC', 'ETH', 'BNB', 'AVAX', 'DOGE', 'ADA', 'SOL', 'MATIC', 'LINK', 'KAS',  # Top cryptocurrencies
#    'AAPL', 'TSLA', 'AMZN', 'MSFT', 'GOOGL', 'GME', 'NVDA' #, 'JPM', 'V',  # Top stocks by volume
#    #'DXY'  # Forex
#]

# Prepare an empty DataFrame to collect all tweets
#all_tweets_df = pd.DataFrame()

# Iterate over each term
#for term in search_terms:
#    query = f'#{term} -is:retweet'  # Exclude retweets with '-is:retweet'

#    try:
#        # Retrieve tweets
#        tweets = client.search_recent_tweets(query=query, max_results=20, tweet_fields=['created_at', 'public_metrics', 'entities', 'referenced_tweets', 'geo', 'lang', 'source'])

#        if tweets.data is None:
#            print(f"No tweets found for term: {term}")
#        else:
#            # Extract tweets data into a list of dictionaries
#           tweets_data = []
#            for tweet in tweets.data:
#$                created_at = tweet.created_at
#                tweet_info = {
#                    'term': term,
#                    'web_link': f"https://twitter.com/twitter/status/{tweet.id}",
#                    'twt_text': tweet.text,
#                    'user_name': tweet.author_id,
#                    'username': tweet.author_id,
#                    'date': created_at.date(),
#                    'time': created_at.strftime('%H:%M:%S'),
#                    'retweets': tweet.public_metrics.get('retweet_count', 0),
#                    'quotes': tweet.public_metrics.get('quote_count', 0),
#                    'likes': tweet.public_metrics.get('like_count', 0),
#                    'hashtags': ','.join([hashtag['tag'] for hashtag in tweet.entities.get('hashtags', [])]) if tweet.entities else '',
#                    'mentions': ','.join([mention['username'] for mention in tweet.entities.get('mentions', [])]) if tweet.entities else '',
#                    'urls': ','.join([url['expanded_url'] for url in tweet.entities.get('urls', [])]) if tweet.entities else '',
#                    'reply_to_tweet_id': tweet.referenced_tweets[0]['id'] if tweet.referenced_tweets and tweet.referenced_tweets[0]['type'] == 'replied_to' else None,
#                    'reply_to_username': None,  # Placeholder for reply_to_username
#                    'location': tweet.geo['place_id'] if tweet.geo else None,
#                    'twt_language': tweet.lang,
#                    'twt_source': tweet.source,
#                }
#                tweets_data.append(tweet_info)
#
#           # Convert list of dictionaries to DataFrame
#            term_df = pd.DataFrame(tweets_data)
#
#            # Append to the main DataFrame
#            all_tweets_df = pd.concat([all_tweets_df, term_df], ignore_index=True)#

#        # Delay to avoid hitting rate limits
#        time.sleep(15)  # Adjust the delay as needed

#    except tweepy.TweepyException as e:
#        print(f"An error occurred while retrieving tweets for term {term}: {e}")

# After collecting all tweets, assign it to raw_df
#raw_df = all_tweets_df

# Display the collected DataFrame
# print(raw_df)
