##KMeans and TextBlob

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load the CSV file
data = pd.read_csv('tweet_10000_combined.csv', names=['tweet'])

# Extract the tweets from the CSV
tweets = data['tweet']

# Create a TF-IDF vectorizer to convert tweets into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tweets)

# Apply KMeans clustering
num_clusters = 2  # Number of clusters (positive, negative, neutral)
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# Get cluster predictions for the tweets
predictions = kmeans.predict(X)

# Store the predictions in an array
predictions_array = np.array(predictions)
print(predictions_array)

In [None]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)

In [None]:
silhouette = silhouette_score(X, predictions)
print(f"Silhouette Score: {silhouette}")

Silhouette Score: 0.05123659608226988


##BERT##


In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score
!pip install transformers
!pip install sentencepiece


Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the CSV file
data = pd.read_csv('tweet_10000_combined.csv',names=['tweet'])

# Extract the tweets from the CSV
tweets = data['tweet'].tolist()

# Load the pre-trained DistilBERT model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Set batch size for processing
batch_size = 16

# Tokenize and perform sentiment analysis in batches
num_tweets = len(tweets)
num_batches = (num_tweets - 1) // batch_size + 1

sentiments = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_tweets = tweets[start_idx:end_idx]

    # Tokenize the batch of tweets
    tokenized_inputs = tokenizer.batch_encode_plus(batch_tweets, padding=True, truncation=True, return_tensors='pt')

    # Perform sentiment analysis on the batch
    outputs = model(**tokenized_inputs)
    predictions = outputs.logits.argmax(dim=1)

    # Collect the predicted sentiment for each tweet in the batch
    sentiments.append(predictions)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.we

In [None]:
sentiment_counts = pd.Series(sentiments).value_counts()
total_tweets = len(sentiments)
print(sentiments)
percentage = sentiment_counts / total_tweets * 100

[tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]), tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1,

##NLTK Tool Kit

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

# Assuming you have a list of tweets called 'tweets'

analyzer = SentimentIntensityAnalyzer()

sentiment_scores = []
for tweet in tweets:
    scores = analyzer.polarity_scores(tweet)
    sentiment_scores.append(scores['compound'])

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
from sklearn.metrics import classification_report
from scipy.stats import pearsonr

In [None]:
correlation = pearsonr(sentiment_scores, predictions_array)
print("Correlation:", correlation)
predicted_labels = [ 0 if score > 0 else 1 if score < 0 else 2 for score in sentiment_scores]
average_score = sum(sentiment_scores) / len(sentiment_scores)
print(f"Average Sentiment Score: {average_score}")

Correlation: PearsonRResult(statistic=0.06588804256472576, pvalue=1.7255786360872644e-10)
Average Sentiment Score: 0.05736499466382096


##GPT

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the CSV file
data = pd.read_csv('tweet_10000_combined.csv', names=['tweet'])

# Extract the tweets from the CSV
tweets = data['tweet'].tolist()

# Load the pre-trained GPT model and tokenizer
model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForSequenceClassification.from_pretrained(model_name)
predicted_classarray = []

# Perform sentiment analysis for each tweet
for tweet in tweets:
    # Tokenize the tweet
    inputs = tokenizer.encode_plus(tweet, return_tensors='pt', padding='longest', truncation=True, max_length=128)

    # Generate sentiment from GPT model
    if inputs['input_ids'].size()[1] > 0:
        outputs = model(**inputs)
        predicted_class = outputs.logits.argmax(dim=1)
        predicted_classarray.append(predicted_class)
        # Print the generated sentiment for the tweet
        print(f"Tweet: {tweet}\nSentiment: {predicted_class.item()}\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Sentiment: 1

Tweet: applesplz eyes emoji
Sentiment: 1

Tweet: RT luiiza_marra Oi Mark Zuckerberg, coloca esse emoji no wpp eu imploro
Sentiment: 0

Tweet: RT m3lissacx read the first letter of each emoji
Sentiment: 0

Tweet: melseacheekins You re the sweetest bean also upset there s no bean emoji
Sentiment: 1

Tweet: RT ansiiaviva es mi emoji favorito y ni siquiera existe
Sentiment: 0

Tweet: RT KyunnieKitty Kiss, Marry, Friendzone I got Minkyun, Yoseob, ampDongwoo. Kiss Dongwoo charismatic.period Marry Minkyun anything
Sentiment: 1

Tweet: LaraMexico3 No hay emoji que describa mi cara de terror, acci n de levantarme violentamente y Mentir... DIOS, dej la plancha prendida
Sentiment: 0

Tweet: RT bIondiewasabi read the first letter of each emoji
Sentiment: 0

Tweet: RT 55mmbae Quavo slid in Saweetie DMs and said You an icy girl, you need a glacier boy. With a snowflake emoji
Sentiment: 1

Tweet: shmeaglesnorger GovAndyBes

In [None]:
average_score = sum(predicted_classarray) / len(predicted_classarray)
average_score = round(average_score.item(), 4)
print(f"Average Sentiment Score: {average_score}")

Average Sentiment Score: 0.292
