In [None]:
import pandas as pd
from textblob import TextBlob
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Read in the data frame

In [None]:
# Read the cleaned training tweets; the CSV is decoded as ISO-8859-1
file_path = Path("../data/cleaned_data/cleaned_training_tweets.csv")

df = pd.read_csv(file_path, encoding='ISO-8859-1')
# Preview the first rows
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()

## Copy the DataFrame to test both TextBlob and VaderSentiment

In [None]:
# Copy that keeps the 'target' column; used below for the baseline Bag-of-Words model
original_df = df.copy()

In [None]:
# Independent working copy for the TextBlob experiment
text_blob_df = df.copy()

In [None]:
# Drop the 'target' column; TextBlob-derived labels are added to this frame later
text_blob_df = text_blob_df.drop(columns=['target'])
text_blob_df.head()

In [None]:
# Independent working copy for the VADER experiment
vader_sentiment_df = df.copy()

In [None]:
# Drop the 'target' column; VADER-derived labels are added to this frame later
vader_sentiment_df = vader_sentiment_df.drop(columns=['target'])
vader_sentiment_df.head()

## Run original data set through Bag of Words Vectorization Based Model

In [None]:
# Pre-processing and Bag-of-Words vectorization using CountVectorizer
# (CountVectorizer is already imported in the notebook's first cell).
# Missing tweets are replaced with empty strings first, as the later
# TextBlob and VADER cells do, because CountVectorizer raises on NaN documents.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(original_df['cleaned_text'].fillna(''))

In [None]:
# Split the data into training and testing sets: 25% held out, fixed seed.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    original_df['target'],
    test_size=0.25,
    random_state=5,
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
# MultinomialNB is already imported in the notebook's first cell.
MNB = MultinomialNB().fit(X_train, Y_train)
# fit() returns the estimator itself, so this shows the same cell output
MNB

In [None]:
from sklearn import metrics

# Predict on the held-out split and report accuracy.
# scikit-learn metric functions take (y_true, y_pred) in that order, matching
# the precision/recall/F1 calls in the next cell.
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

In [None]:
# Weighted-average precision, recall and F1 on the held-out split
precision, recall, f1_score = (
    score_fn(Y_test, predicted, average='weighted')
    for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

## TextBlob Sentiment analysis

In [None]:
# Define a function that scores a tweet's polarity with TextBlob
def analyze_sentiment_blob(text):
    """Return the TextBlob polarity score of ``text`` as a float.

    Float inputs (how pandas represents a missing cell, NaN) are scored 0.0
    instead of the string 'neutral', so the resulting column holds only
    numbers rather than a mix of floats and strings.
    """
    if isinstance(text, float):
        return 0.0
    return TextBlob(str(text)).sentiment.polarity



# Apply the function to the 'cleaned_text' column and store the scores in a new 'textblob_sentiment' column
text_blob_df['textblob_sentiment'] = text_blob_df['cleaned_text'].apply(analyze_sentiment_blob)
text_blob_df.head()

In [None]:
# Define a function that classifies a tweet with TextBlob and returns a label.
# NOTE: this reuses the name analyze_sentiment_blob and therefore replaces the
# score-returning version defined in the previous cell.
def analyze_sentiment_blob(text):
    """Return 'positive', 'negative' or 'neutral' for ``text``.

    Float inputs (e.g. NaN for a missing tweet) are labelled 'neutral'.
    Otherwise the TextBlob polarity decides: above 0.1 is 'positive',
    below -0.1 is 'negative', anything in between is 'neutral'.
    """
    if isinstance(text, float):
        return 'neutral'
    score = TextBlob(str(text)).sentiment.polarity
    if score > 0.1:
        return 'positive'
    if score < -0.1:
        return 'negative'
    return 'neutral'

# Apply the function to the 'cleaned_text' column and create a new column 'textblob_sentiment_lables'
# (the column name is spelled 'lables'; later cells read it under that exact name)
text_blob_df['textblob_sentiment_lables'] = text_blob_df['cleaned_text'].apply(analyze_sentiment_blob)

text_blob_df.head()

In [None]:
# Count how many tweets received each TextBlob label
text_blob_df.value_counts('textblob_sentiment_lables')

In [None]:
# Specify where to write the TextBlob-labelled data and what to call it
cleaned_file_path = '../data/cleaned_data/text_blob_sentiment.csv'

# Save the frame to that path without writing the index as a column
text_blob_df.to_csv(cleaned_file_path, index=False)

In [None]:
# Reload the TextBlob-labelled file that was just written
textblob_df = pd.read_csv('../data/cleaned_data/text_blob_sentiment.csv')

### Now run TextBlob through the Bag of Words vectorization model

In [None]:
# Fill any missing values with an empty string so the vectorizer receives only strings
textblob_df['cleaned_text'] = textblob_df['cleaned_text'].fillna('')

In [None]:
# Pre-process the data with a Bag-of-Words model: unigram counts with English stop words removed
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(textblob_df['cleaned_text'])

In [None]:
# Split the data into training and testing sets: 25% held out, fixed seed.
# The TextBlob labels are the prediction target here.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    textblob_df['textblob_sentiment_lables'],
    test_size=0.25,
    random_state=5,
)

In [None]:
# Fit a fresh multinomial Naive Bayes classifier on the TextBlob-labelled split
MNB = MultinomialNB().fit(X_train, Y_train)
# fit() returns the estimator itself, so this shows the same cell output
MNB

In [None]:
# Predict labels for the held-out test split
predicted = MNB.predict(X_test)


In [None]:
# Calculate accuracy.
# scikit-learn metric functions take (y_true, y_pred) in that order, matching
# the precision/recall/F1 calls in the next cell.
from sklearn import metrics
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

In [None]:
# Weighted-average precision, recall and F1 on the held-out split
precision, recall, f1_score = (
    score_fn(Y_test, predicted, average='weighted')
    for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

## VaderSentiment analysis

In [None]:
# Single shared VADER analyzer instance, reused by analyze_sentiment_vader for every tweet
sentiment = SentimentIntensityAnalyzer()
# Create a function to apply to our data frame
def analyze_sentiment_vader(text):
    """Return the VADER 'compound' score for ``text``.

    Float inputs (how pandas represents a missing cell, NaN) are scored 0.0
    instead of being handed to the analyzer, mirroring the float guard in
    analyze_sentiment_blob. A score of 0.0 maps to 'neutral' under the
    +/-0.1 thresholds used by sentiment_to_label.
    """
    if isinstance(text, float):
        return 0.0
    return sentiment.polarity_scores(text)['compound']
# Apply the function to 'cleaned_text' of the copied vader_sentiment_df frame and store the scores in 'sentiment_vader'
vader_sentiment_df['sentiment_vader'] = vader_sentiment_df['cleaned_text'].apply(analyze_sentiment_vader)
vader_sentiment_df.head()

In [None]:
# Define a function that maps a numeric sentiment score onto a class label
def sentiment_to_label(score):
    """Convert a sentiment score into 'positive', 'negative' or 'neutral'.

    Scores of 0.1 or more are 'positive', scores of -0.1 or less are
    'negative', and everything else is 'neutral'. That includes NaN, for
    which both comparisons are false.
    """
    if score >= 0.1:
        return 'positive'
    if score <= -0.1:
        return 'negative'
    return 'neutral'

# Apply the function to the 'sentiment_vader' column and create a new column 'vader_sentiment_label'
vader_sentiment_df['vader_sentiment_label'] = vader_sentiment_df['sentiment_vader'].apply(sentiment_to_label)

# Display the DataFrame with sentiment labels
vader_sentiment_df.head()

In [None]:
# Preview the VADER frame again (same view as the end of the previous cell)
vader_sentiment_df.head()

In [None]:
# Count how many tweets received each VADER label.
# The label column created earlier is 'vader_sentiment_label'; there is no
# 'sentiment_label' column on this frame.
vader_sentiment_df.value_counts('vader_sentiment_label')

In [None]:
# Specify where to write the VADER-labelled data and what to call it
cleaned_file_path = '../data/cleaned_data/vader_sentiment.csv'

# Save the frame to that path without writing the index as a column
vader_sentiment_df.to_csv(cleaned_file_path, index=False)

In [None]:
# Reload the VADER-labelled file that was just written
vader_df = pd.read_csv('../data/cleaned_data/vader_sentiment.csv')

### Now run Vader through the Bag of Words vectorization model

In [None]:
# Fill any missing values with an empty string so the vectorizer receives only strings
vader_df['cleaned_text'] = vader_df['cleaned_text'].fillna('')

In [None]:
# Pre-process the data with a Bag-of-Words model: unigram counts with English stop words removed
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(vader_df['cleaned_text'])

In [None]:
# Split the data into training and testing sets: 25% held out, fixed seed.
# The VADER labels are the prediction target here.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    vader_df['vader_sentiment_label'],
    test_size=0.25,
    random_state=5,
)

In [None]:
# Fit a fresh multinomial Naive Bayes classifier on the VADER-labelled split
MNB = MultinomialNB().fit(X_train, Y_train)
# fit() returns the estimator itself, so this shows the same cell output
MNB

In [None]:
# Predict labels for the held-out test split
predicted = MNB.predict(X_test)

In [None]:
# Calculate accuracy.
# Import `metrics` here, as the TextBlob accuracy cell does, so this cell does
# not depend on an earlier cell having imported it.
# scikit-learn metric functions take (y_true, y_pred) in that order.
from sklearn import metrics
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

In [None]:
# Weighted-average precision, recall and F1 on the held-out split
precision, recall, f1_score = (
    score_fn(Y_test, predicted, average='weighted')
    for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

## Compare the three CSV files: BERT, VADER, and TextBlob

In [None]:
# Read in the three labelled result files (VADER, BERT, TextBlob).
# vader_df and textblob_df are rebound here, replacing the frames that had
# empty strings filled in for the modelling cells.
vader_df = pd.read_csv("../data/cleaned_data/vader_sentiment.csv")
bert_df = pd.read_csv("../data/cleaned_data/bert_sentiment_analysis_50000.csv")
textblob_df = pd.read_csv("../data/cleaned_data/text_blob_sentiment.csv")

In [None]:
# Keep only the first 15,000 rows of each frame before merging
# NOTE(review): presumably chosen to keep the merges small — confirm the intended sample size
vader_df = vader_df.head(15000)
textblob_df = textblob_df.head(15000)
bert_df = bert_df.head(15000)

In [None]:
# Count how many rows carry each value of the BERT file's 'sentiment_labels' column
bert_df.value_counts('sentiment_labels')

In [None]:
# Inner-join the VADER and TextBlob frames on the tweet text
# NOTE(review): repeated 'cleaned_text' values are paired many-to-many by this merge — confirm the key is unique enough
merged_df = pd.merge(vader_df, textblob_df, on='cleaned_text', how='inner')
merged_df.head()

In [None]:
# Drop every row that has a missing value in any column
merged_df = merged_df.dropna()

In [None]:
# Inspect the VADER/TextBlob merge after dropping incomplete rows
merged_df

In [None]:
# Inner-join the BERT frame onto the VADER/TextBlob merge, again keyed on the tweet text
merged_df = pd.merge(merged_df, bert_df, on='cleaned_text', how='inner')
merged_df.head()

In [None]:
# Drop every row that has a missing value in any column of the three-way merge
merged_df = merged_df.dropna()

In [None]:
import numpy as np

# The three label columns being compared
vader_labels = merged_df['vader_sentiment_label']
blob_labels = merged_df['textblob_sentiment_lables']
bert_labels = merged_df['sentiment_labels']

# Conditions to check for matching labels
condition1 = (vader_labels == blob_labels) & (vader_labels == bert_labels)
condition2 = (vader_labels == blob_labels) & (vader_labels != bert_labels)
condition3 = (vader_labels != blob_labels) & (vader_labels == bert_labels)
condition4 = (vader_labels != blob_labels) & (blob_labels == bert_labels)

# Create 'matching_label': the first condition that holds for a row picks its
# description; rows matching none of them get 'None Match'
merged_df['matching_label'] = np.select(
    [condition1, condition2, condition3, condition4],
    ['All 3 Match', 'Vader, Textblob Match', 'Vader, Bert Match', 'Textblob, Bert Match'],
    default='None Match',
)

merged_df.head()


In [None]:
# Inspect the merged frame now that 'matching_label' has been added
merged_df

In [None]:
# Count rows per agreement category, including 'All 3 Match'
merged_df.value_counts('matching_label')

In [None]:
# Filter out rows where all three labels match.
# Those rows are tagged 'All 3 Match'; comparing against "" removed nothing
# because 'matching_label' is never an empty string.
filtered_df = merged_df[merged_df['matching_label'] != 'All 3 Match']


In [None]:
# Count the occurrences of each matching_label category after filtering;
# the resulting Series feeds the bar plot below
label_counts = filtered_df['matching_label'].value_counts()
label_counts

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how often each agreement category occurs
fig, ax = plt.subplots(figsize=(8, 6))
label_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Matching Labels')
ax.set_ylabel('Count')
ax.set_title('Counts of Matching Labels (Excluding All 3 Match)')
# Keep the category names horizontal
plt.xticks(rotation=0)
fig.tight_layout()
plt.show()