<a href="https://colab.research.google.com/github/Nidhinbc97/ML-AI/blob/main/Emotion_Detection_in_Twitter_Data_using_NLP_Techniques__2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
corpus = ['Data Science is an overlap between Arts and Science',
          'Generally, Arts graduates are right-brained and Science fraduates are left-brained',
          'Excelling in both Arts and Science at a time becomes difficult',
          'Natural Language Processing is a part of Data Science']

In [5]:
def vec_text(corpus):
  bow_model = CountVectorizer()
  dense_vec = bow_model.fit_transform(corpus).todense()
  bow_df = pd.DataFrame(dense_vec)
  bow_df.columns = sorted( bow_model.vocabulary_)
  return(bow_df)

In [6]:
df = vec_text(corpus)

In [7]:
df

Unnamed: 0,an,and,are,arts,at,becomes,between,both,brained,data,...,language,left,natural,of,overlap,part,processing,right,science,time
0,1,1,0,1,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,2,0
1,0,1,2,1,0,0,0,0,2,0,...,0,1,0,0,0,0,0,1,1,0
2,0,1,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,1,...,1,0,1,1,0,1,1,0,1,0


In [8]:
df = pd.read_csv('/content/tweet_emotions.csv')

In [None]:
df.head()

In [9]:
print(df.columns)

Index(['tweet_id', 'sentiment', 'content'], dtype='object')


In [11]:
# Define a function for text preprocessing
def preprocess_text(content):
    # Lowercasing
    text = content.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Apply the preprocessing to the 'reviewText' column
df['processed_text'] = df['content'].apply(preprocess_text)

In [12]:
# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to categorize sentiment
def get_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Analyze sentiments and create a new sentiment column
df['sentiment'] = df['processed_text'].apply(lambda x: get_sentiment(sia.polarity_scores(x)['compound']))

In [13]:
# Split the data into training and testing sets
X = df['processed_text']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Convert text data to TF-IDF features
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
# Initialize Naive Bayes classifier
naive_bayes = MultinomialNB()

# Define hyperparameters grid for Naive Bayes
param_grid_nb = {
    'alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
}

# Initialize GridSearchCV for Naive Bayes
grid_search_nb = GridSearchCV(naive_bayes, param_grid_nb, cv=2, scoring='accuracy', n_jobs=-1)

In [16]:
# Train the Naive Bayes model with hyperparameter tuning
grid_search_nb.fit(X_train_tfidf, y_train)

In [17]:
# Get the best parameters and estimator for Naive Bayes
best_params_nb = grid_search_nb.best_params_
best_model_nb = grid_search_nb.best_estimator_

# Print best parameters for Naive Bayes
print("Best Parameters for Naive Bayes:", best_params_nb)

Best Parameters for Naive Bayes: {'alpha': 0.1}


In [18]:
y_pred_nb = best_model_nb.predict(X_test_tfidf)

In [19]:
# Evaluate the Naive Bayes model
print("Accuracy for Naive Bayes:", accuracy_score(y_test, y_pred_nb))
print("Classification Report for Naive Bayes:")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix for Naive Bayes:")
print(confusion_matrix(y_test, y_pred_nb))

Accuracy for Naive Bayes: 0.67325
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.72      0.54      0.61      1960
     neutral       0.79      0.35      0.49      2099
    positive       0.64      0.91      0.75      3941

    accuracy                           0.67      8000
   macro avg       0.72      0.60      0.62      8000
weighted avg       0.70      0.67      0.65      8000

Confusion Matrix for Naive Bayes:
[[1051   86  823]
 [ 180  742 1177]
 [ 234  114 3593]]


In [20]:
# Load the pre-trained model and tokenizer
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_emotion(text):
    # Tokenize the input text and convert to tensors
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the predicted class
    predicted_class = torch.argmax(probs, dim=1).item()

    return predicted_class, probs

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [21]:
# Apply the emotion prediction function to the DataFrame
df['emotion'], df['probabilities'] = zip(*df['processed_text'].apply(predict_emotion))

In [22]:
# Map predicted class index to emotion labels (assuming these are the labels used by the model)
emotion_labels = ["anger", "fear", "joy", "sadness", "surprise"]
df['most_likely_emotion'] = df['emotion'].apply(lambda x: emotion_labels[x])
df['emotion_probabilities'] = df['probabilities'].apply(lambda x: x.tolist())

In [None]:
df['emotion'], df['probabilities'] = zip(*df['processed_text'].apply(predict_emotion))

In [None]:
df_train = pd.read_csv('tweet_emotions.csv',delimiter=';',names = ['content', 'sentiment','processed_reviews'])
df_val = pd.read_csv('tweet_emotions .csv',delimiter=';',names = ['content', 'sentiment','processed_reviews'])

In [None]:
pd.set_option('display.max_colwidth',None)
df.head()

In [None]:
data = pd.concat([df_train,df_val])

In [None]:
df_train.shape

In [None]:
df.shape

In [None]:
df[15995:16003]

In [None]:
df.reset_index(inplace=True,drop=True)

In [None]:
df[15995:16003]

In [None]:
df.shape

In [None]:
df.tail()

In [None]:
data['label'].unique()

In [None]:
data['label'].value_counts()

In [None]:
def custom_encoder(df):
  df.replace(to_replace=['surprise', 'joy'],value =1 , inplace=True)
  df.replace(to_replace=['sadness', 'anger', 'fear'],value =1 , inplace=True)
  return df

In [None]:
data['label'] = custom_encoder(data['label'])

In [None]:
data.head()

In [None]:
data.label.value_counts(normalize=True)

Data processing

In [None]:
import string
def remove_punctuations(text):
  punc_free = ''.join([i for i in text if i not in string.punctuation])
  return punc_free

In [None]:
import nltk
nltk.download('punkt')
def tokenize(text):
  words = nltk.word_tokenize(text)
  return words

In [None]:
nltk.download('stopwords')
sw = nltk.corpus.stopwords.words('english')
def remove_sw(text):
  output = [i for i in text if i not in sw]
  return output

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
word_lem = WordNetLemmatizer()
def lemm(text):
  lemm_text = [word_lem.lemmatize(word) for word in text]
  return lemm_text



In [None]:
def preprocess(df_col):
  corpus=[]
  for item in df_col:
    new_item = remove_punctuations(item)
    new_item = new_item.lower()
    new_item = tokenize(new_item)
    new_item = remove_sw(new_item)
    new_item = lemm(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [None]:
corpus = preprocess(data['text'])

In [None]:
corpus

Bag_of_Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
vec_data = cv.fit_transform(corpus)
x = vec_data
y = data['label']

In [None]:
x

In [None]:
y

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
!pip install scikit-learn

In [None]:
clf = RandomForestCalssifier(n_estimators = 100)
clf.fit(x,y)