In [2]:
import pandas as pd
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# **Load the Dataset**

In [4]:
# Read the complete tweet export from disk into `df`.
df = pd.read_csv('tweets-data.csv')
# Draw 500 rows with a fixed random_state so the same subset is picked on every run
# (for the same input file); .copy() makes sample_df an independent frame, which the
# later cells extend with new columns.
sample_df = df.sample(n=500, random_state=42).copy()
# Bare last expression: the notebook displays the full frame here, not the 500-row sample.
df

Unnamed: 0.1,Unnamed: 0,Date Created,Number of Likes,Source of Tweet,Tweets,hashtag
0,0,2023-06-25 19:16:20+00:00,0,,@jacksonhinklle #wagner with 6.2 billion dolla...,wagner
1,1,2023-06-25 19:16:18+00:00,0,,Pobrecito es discapacitado\n#Reddetuiterosdemo...,wagner
2,2,2023-06-25 19:16:07+00:00,0,,News from the EIR Daily Alert\n\n“#Putin Addre...,wagner
3,3,2023-06-25 19:15:56+00:00,0,,It's Messi day #Messi𓃵 #Messi36 #Russia #bigst...,wagner
4,4,2023-06-25 19:15:54+00:00,0,,Il passaggio chiave di Machiavelli era questo ...,wagner
...,...,...,...,...,...,...
3005,2,2023-06-25 19:17:59+00:00,0,,"Putting $25,000 dollars into zk $ORANGE, this...",tesla
3006,3,2023-06-25 19:17:50+00:00,0,,"Putting $25,000 dollars into zk $ORANGE, this...",tesla
3007,4,2023-06-25 19:17:33+00:00,0,,"Generational wealth incoming, first #PEPE, now...",tesla
3008,5,2023-06-25 19:17:18+00:00,0,,"On top with $PEPE, now $ORANGE. Airdrop secure...",tesla


# **Clean the Tweets**

In [5]:
def clean_tweet(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: drop @mentions, drop URLs (``http...`` / ``www...``),
    drop every character that is not an ASCII letter or whitespace, then
    lowercase and trim both ends. Interior whitespace (including newlines)
    is left as-is, and hashtags keep their word but lose the ``#``.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (for example the
            float NaN pandas uses for an empty CSV cell, or ``None``) is
            treated as an empty tweet.

    Returns:
        The cleaned string; ``""`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input, so a single missing tweet
    # would abort a whole Series.apply(); map such values to "" instead.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"@\w+", "", text)              # @mentions
    text = re.sub(r"http\S+|www\S+", "", text)    # links
    text = re.sub(r"[^a-zA-Z\s]", "", text)       # digits, punctuation, non-ASCII characters
    text = text.lower().strip()
    return text

# Store a normalised version of every sampled tweet next to the raw text.
raw_tweets = sample_df['Tweets']
sample_df['cleaned_tweet'] = raw_tweets.map(clean_tweet)

# **Define VADER Sentiment Function**

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Fetch the 'vader_lexicon' resource through NLTK's downloader.
nltk.download('vader_lexicon')

# One shared analyzer instance; get_sentiment() reads it as a notebook-level global.
# NOTE(review): the analyzer presumably needs the lexicon fetched above, so keep
# the download before this line — confirm.
sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Classify one piece of text from its VADER compound score.

    Uses the notebook-level analyzer ``sia``. A compound score of 0.05 or
    more is labelled 'Positive', -0.05 or less 'Negative', and anything in
    between 'Neutral'.

    Args:
        text: The text to score.

    Returns:
        A two-element ``pd.Series``: the label first, then the compound score.
    """
    compound = sia.polarity_scores(text)['compound']

    # Start from the in-between case and override it only when a threshold is met.
    label = 'Neutral'
    if compound >= 0.05:
        label = 'Positive'
    elif compound <= -0.05:
        label = 'Negative'

    return pd.Series([label, compound])


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


# **Apply VADER to Dataset**

In [8]:
# get_sentiment returns a two-element Series (label, compound score) for each cleaned tweet;
# the two values are stored, in that order, in 'sentiment_label' and 'sentiment_score'.
# NOTE(review): relies on pandas expanding the per-row Series into two columns —
# confirm on the pandas version in use.
sample_df[['sentiment_label', 'sentiment_score']] = sample_df['cleaned_tweet'].apply(get_sentiment)

# **Results**

In [9]:
# Write the sampled tweets, including the cleaned-text and sentiment columns added above,
# to a CSV file; index=False leaves the DataFrame index out of the output.
sample_df.to_csv("sentiment_output.csv", index=False)