<a href="https://colab.research.google.com/github/Rahul-Ag/Sentiment-Analysis-from-tweets-using-BERT/blob/main/Sentiment_Analysis_from_tweets_using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy.stats import mode

# Download the NLTK resources used below: the English stopword list, the
# tokenizer models behind word_tokenize, and the WordNet data behind
# WordNetLemmatizer.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent
# NLTK releases (3.9+); 'punkt' is kept for older releases. Asking for a
# package the installed NLTK does not know about is expected to only report
# an error and return False rather than raise -- NOTE(review): confirm on the
# NLTK version pinned for this notebook.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Patterns are compiled once instead of on every call.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_MENTION_OR_HASH_PATTERN = re.compile(r'\@\w+|\#')
_NON_LETTER_PATTERN = re.compile(r'[^A-Za-z\s]')

# Lazily-built default stopword set / lemmatizer, shared by every call so the
# NLTK corpus is not re-read for each of the tens of thousands of rows.
_PREPROCESS_DEFAULTS = {}


def _default_stop_words():
    """Return the NLTK English stopword set, building it on first use."""
    if 'stop_words' not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_DEFAULTS['stop_words']


def _default_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it on first use."""
    if 'lemmatizer' not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_DEFAULTS['lemmatizer']


def preprocess_tweet(tweet, stop_words=None, lemmatizer=None, tokenizer=None):
    """Clean, tokenize, stopword-filter and lemmatize a single tweet.

    Steps: lowercase; strip URLs; strip @mentions and the '#' symbol (the
    hashtag word itself is kept); replace every remaining non-letter with a
    space; tokenize; drop stopwords; lemmatize; join with single spaces.

    Non-letters are replaced by a space rather than deleted so that words
    separated only by punctuation are not glued together ("for-india" must not
    become "forindia") and contractions split into pieces the stopword list
    can match ("can't" -> "can", "t").

    Args:
        tweet: Raw tweet text. ``None`` and NaN are treated as empty text;
            any other non-string value is converted with ``str()``.
        stop_words: Optional container of lowercase words to drop. Defaults
            to the NLTK English stopword list.
        lemmatizer: Optional object exposing ``lemmatize(word)``. Defaults to
            a shared ``WordNetLemmatizer``.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to ``nltk.tokenize.word_tokenize``.

    Returns:
        The processed tokens joined by single spaces ('' if nothing remains).
    """
    # NaN is the only float that is not equal to itself.
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ''

    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()
    if tokenizer is None:
        tokenizer = word_tokenize

    text = str(tweet).lower()
    text = _URL_PATTERN.sub(' ', text)              # Remove URLs
    text = _MENTION_OR_HASH_PATTERN.sub(' ', text)  # Remove @mentions and '#'
    text = _NON_LETTER_PATTERN.sub(' ', text)       # Digits, punctuation, emoji

    tokens = [token for token in tokenizer(text) if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
# Read the labelled comments from the CSV in the Colab working directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='/content/Sentiment Analysis from tweets.csv')
df.head(n=5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(37249, 2)

In [None]:
# Per-column non-null counts and dtypes; a lower non-null count flags missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,category
count,37249.0
mean,0.202771
std,0.778515
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [None]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
clean_comment,100
category,0


In [None]:
# Drop rows with a missing value in any column. Reassigning (rather than
# mutating with inplace=True) keeps the step explicit, and resetting the index
# leaves a contiguous 0..n-1 index so later positional and label lookups agree.
df = df.dropna().reset_index(drop=True)

In [None]:
# Re-check the (rows, columns) shape after dropping rows with missing values.
df.shape

(37149, 2)

In [None]:
# Run every comment through preprocess_tweet and keep the cleaned text in a
# new column next to the original.
df['processed_tweet'] = df['clean_comment'].map(preprocess_tweet)

In [None]:
# Preview the first rows, now including the processed_tweet column.
df.head()

Unnamed: 0,clean_comment,category,processed_tweet
0,family mormon have never tried explain them t...,1,family mormon never tried explain still stare ...
1,buddhism has very much lot compatible with chr...,1,buddhism much lot compatible christianity espe...
2,seriously don say thing first all they won get...,-1,seriously say thing first get complex explain ...
3,what you have learned yours and only yours wha...,0,learned want teach different focus goal wrappi...
4,for your own benefit you may want read living ...,1,benefit may want read living buddha living chr...


In [None]:
# Feature extraction using TF-IDF, capped at the 3000 highest-frequency terms.
# The result is kept as the sparse matrix returned by fit_transform: a dense
# copy of (n_rows x 3000) floats costs hundreds of megabytes, and the
# scikit-learn estimators used in this notebook accept sparse input directly.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
X = tfidf_vectorizer.fit_transform(df['processed_tweet'])
y = df['category']

In [None]:
# Train-test split
# Split the processed text itself and (re)fit the TF-IDF vectorizer on the
# training rows only, so the vocabulary and IDF weights never see the test
# rows. Fitting on the full corpus before splitting leaks test information
# into the features and makes the reported scores optimistic.
# The split depends only on the number of rows and random_state, so the same
# rows land in train/test as when splitting a pre-vectorized matrix.
train_text, test_text, y_train, y_test = train_test_split(
    df['processed_tweet'], y, test_size=0.2, random_state=42
)
# X_train / X_test are sparse matrices; the estimators used below accept them.
X_train = tfidf_vectorizer.fit_transform(train_text)
X_test = tfidf_vectorizer.transform(test_text)

In [None]:
# Initialize classifiers
# - max_iter is raised because the default iteration budget stops the
#   logistic-regression solver before it converges on this data (it emits
#   "TOTAL NO. of ITERATIONS REACHED LIMIT").
# - random_state is fixed on the tree-based models so their scores are
#   reproducible across Restart & Run All.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [None]:
# Train each classifier, report its test accuracy and per-class metrics, and
# keep its test-set predictions keyed by model name.
predictions = {}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy*100:.2f}%")
    print(classification_report(y_test, y_pred))
    # Reuse the predictions computed above instead of calling predict() a
    # second time (a full extra pass over the test set for every model).
    predictions[name] = y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 84.66%
              precision    recall  f1-score   support

          -1       0.83      0.65      0.73      1597
           0       0.83      0.95      0.89      2654
           1       0.86      0.86      0.86      3179

    accuracy                           0.85      7430
   macro avg       0.84      0.82      0.83      7430
weighted avg       0.85      0.85      0.84      7430

Naive Bayes Accuracy: 65.96%
              precision    recall  f1-score   support

          -1       0.88      0.36      0.51      1597
           0       0.78      0.54      0.64      2654
           1       0.59      0.91      0.71      3179

    accuracy                           0.66      7430
   macro avg       0.75      0.60      0.62      7430
weighted avg       0.72      0.66      0.64      7430

KNN Accuracy: 41.53%
              precision    recall  f1-score   support

          -1       0.71      0.10      0.18      1597
           0       0.38      0.96      0.5

In [None]:
# Since Logistic Regression is giving the highest accuracy, we will use it for our predictions
# Hand-written example tweets used to spot-check the trained model. They include
# emoji, @mentions, #hashtags and URLs, all of which preprocess_tweet strips.
new_tweets = [
    "The service was exceptional, I loved it!",
    "This is not what I expected, really bad experience.",
    "India Won the T20 World Cup!!! 🎉🏆 The moment we've all been waiting for—India is the T20 World Champion!!! The entire nation is buzzing with joy, pride, and excitement as our team has conquered the world stage with sheer determination and unbeatable spirit! This victory is nothing short of spectacular, a historic triumph that will be celebrated for ages! 🇮🇳🎇",
    "I can't believe the service I just received. How can a company be so careless? 😡 @CustomerCare, you need to fix this ASAP! #Unacceptable #PoorService",
    "Some days just hit harder than others. Missing those who are no longer here. 💔 #Grief #Memories",
    "Just got the best news ever! Feeling on top of the world right now! 🎉✨ #Blessed #Joy",
    "Wow, I did NOT see that coming! Completely blown away by what just happened! 😲 #Unexpected #MindBlown",
    "That was the scariest experience of my life! My heart is still racing! 😰 #Terrified #StillShaking",
    "Big shoutout to @TechGuru for the latest review on the new smartphone! 📱 This might just be my next upgrade! #TechNews #SmartphoneAddict https://example.com/tech",
    "Exploring the best cafes in the city ☕️ Had the most amazing latte at @CaffeineHeaven! Who else is a coffee lover here? #CoffeeCulture #CafeHopping https://example.com/cafes",
    "Just watched the new episode of My Favorite Show! Can't believe that twist at the end 😱 What do you all think? @ShowFanClub #MustWatch #TVAddict https://example.com/show"
]

# Clean the example tweets with the same pipeline used for the training data.
processed_new_tweets = list(map(preprocess_tweet, new_tweets))

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit).
X_new = tfidf_vectorizer.transform(processed_new_tweets).toarray()

# Pick the fitted logistic-regression model and classify the new tweets.
logistic_regression_classifier = classifiers['Logistic Regression']
logistic_predictions = logistic_regression_classifier.predict(X_new)

# Human-readable names for the numeric category codes.
sentiment_labels = {
    1: "Positive",
    0: "Neutral",
    -1: "Negative",
}

# Print each original tweet next to its predicted label.
for position, tweet in enumerate(new_tweets):
    sentiment = logistic_predictions[position]
    sentiment_label = sentiment_labels.get(sentiment)
    print(f"Tweet: {tweet}\nPredicted Sentiment: {sentiment_label}\n")

Tweet: The service was exceptional, I loved it!
Predicted Sentiment: Positive

Tweet: This is not what I expected, really bad experience.
Predicted Sentiment: Negative

Tweet: India Won the T20 World Cup!!! 🎉🏆 The moment we've all been waiting for—India is the T20 World Champion!!! The entire nation is buzzing with joy, pride, and excitement as our team has conquered the world stage with sheer determination and unbeatable spirit! This victory is nothing short of spectacular, a historic triumph that will be celebrated for ages! 🇮🇳🎇
Predicted Sentiment: Positive

Tweet: I can't believe the service I just received. How can a company be so careless? 😡 @CustomerCare, you need to fix this ASAP! #Unacceptable #PoorService
Predicted Sentiment: Positive

Tweet: Some days just hit harder than others. Missing those who are no longer here. 💔 #Grief #Memories
Predicted Sentiment: Negative

Tweet: Just got the best news ever! Feeling on top of the world right now! 🎉✨ #Blessed #Joy
Predicted Sentimen