In [2]:
import pandas as pd
import re

# Load the dataset

data = pd.read_csv("dataset.csv")

# Filter only English texts
data = data[data['Language'] == 'en'].reset_index(drop=True)

# Function to clean text
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (optional, since they can carry sentiment)
    text = re.sub(r'#\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply cleaning
data['Cleaned_Text'] = data['Text'].apply(clean_text)

# Display cleaned data
data[['Text', 'Cleaned_Text', 'Label']].head()


Unnamed: 0,Text,Cleaned_Text,Label
0,@Charlie_Corley @Kristine1G @amyklobuchar @Sty...,testimony is NOT evidence in a court of law st...,litigious
1,https://t.co/YJNiO0p1JV Flagstar Bank disclose...,Flagstar Bank discloses a data breach that imp...,litigious
2,Rwanda is set to host the headquarters of Unit...,Rwanda is set to host the headquarters of Unit...,positive
3,OOPS. I typed her name incorrectly (today’s br...,OOPS I typed her name incorrectly todays brave...,litigious
4,It sucks for me since I'm focused on the natur...,It sucks for me since Im focused on the nature...,negative


In [4]:
!pip install nltk




In [5]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment intensity analyzer
sia = SentimentIntensityAnalyzer()

# sentiment from VADER
def get_sentiment_vader(text):
    score = sia.polarity_scores(text)
    if score['compound'] >= 0.05:
        return 'Positive'
    elif score['compound'] <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply the sentiment analysis
data['Sentiment_VADER'] = data['Cleaned_Text'].apply(get_sentiment_vader)

# results
data[['Cleaned_Text', 'Sentiment_VADER']].head()



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Cleaned_Text,Sentiment_VADER
0,testimony is NOT evidence in a court of law st...,Neutral
1,Flagstar Bank discloses a data breach that imp...,Neutral
2,Rwanda is set to host the headquarters of Unit...,Positive
3,OOPS I typed her name incorrectly todays brave...,Positive
4,It sucks for me since Im focused on the nature...,Positive


In [7]:
!pip install transformers torch



In [8]:
from transformers import pipeline

# Initialize the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Apply the transformer model to get sentiment
data['Sentiment_Transformers'] = data['Cleaned_Text'].apply(lambda text: sentiment_pipeline(text)[0]['label'])

# Display the results
data[['Cleaned_Text', 'Sentiment_Transformers']].head()


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Assuming you have a 'Date' column
data['Date'] = pd.to_datetime(data['Date'])

# Group by month and calculate sentiment counts
trend_data = data.groupby(data['Date'].dt.to_period('M'))['Sentiment_VADER'].value_counts().unstack().fillna(0)

# Plot sentiment trends over time
trend_data.plot(kind='line', figsize=(10, 6))
plt.title("Sentiment Trends Over Time")
plt.ylabel("Count")
plt.xlabel("Month")
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'Label' is the true sentiment column in your dataset
# Ensure both true labels and predicted labels are in the same format
# Example: If the true labels are in number format, map them to the sentiment labels

# Accuracy for VADER
accuracy_vader = accuracy_score(data['Label'], data['Sentiment_VADER'])
print(f"VADER Accuracy: {accuracy_vader}")

# Detailed classification report for VADER
print("\nVADER Classification Report:")
print(classification_report(data['Label'], data['Sentiment_VADER']))

# Accuracy for Transformers
accuracy_transformers = accuracy_score(data['Label'], data['Sentiment_Transformers'])
print(f"\nTransformers Accuracy: {accuracy_transformers}")

# Detailed classification report for Transformers
print("\nTransformers Classification Report:")
print(classification_report(data['Label'], data['Sentiment_Transformers']))
