In [1]:
import pandas as pd
from textblob import TextBlob
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Read in the data frame

In [2]:
# Read the cleaned training tweets; the path is relative to the notebook's working directory.
file_path = Path("../data/cleaned_data/cleaned_training_tweets.csv")

# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv(file_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,id,user,date,text,target
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,0
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,0
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",0
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0


In [3]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 5 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   id      1599999 non-null  int64 
 1   user    1599999 non-null  object
 2   date    1599999 non-null  object
 3   text    1599999 non-null  object
 4   target  1599999 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 61.0+ MB


## Copy the DataFrame to test both TextBlob and VaderSentiment

In [66]:
# Independent copy that keeps the 'target' column; used by the bag-of-words baseline.
original_df = df.copy()

In [5]:
# Independent working copy that will receive the TextBlob-derived labels.
text_blob_df = df.copy()

In [None]:
# Drop the ground-truth 'target' column from the TextBlob working frame.
# The frame is derived from `df` rather than from `text_blob_df` itself so that
# re-running this cell does not raise a KeyError on the already-removed column.
# drop() returns a new DataFrame, so `df` is left untouched.
text_blob_df = df.drop(columns=['target'])
text_blob_df.head()

In [54]:
# Independent working copy that will receive the VADER scores and labels.
vader_sentiment_df = df.copy()

In [55]:
# Drop the ground-truth 'target' column from the VADER working frame.
# The frame is derived from `df` rather than from `vader_sentiment_df` itself so that
# re-running this cell does not raise a KeyError on the already-removed column.
# drop() returns a new DataFrame, so `df` is left untouched.
vader_sentiment_df = df.drop(columns=['target'])
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all...."
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew


## Run the original dataset through a bag-of-words model (CountVectorizer + Multinomial Naive Bayes)

In [67]:
# Pre-processing and bag-of-words vectorization using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer  # NOTE(review): imported but not used in this cell

# Unigram counts only (ngram_range=(1, 1)), with scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
# NOTE(review): the vocabulary is fitted on every row before the train/test split
# in the next cell, so words that occur only in test rows are part of the feature space.
text_counts = cv.fit_transform(original_df['text'])

In [68]:
# Split the count matrix and the ground-truth 'target' labels into training and
# testing sets: 25% of the rows are held out and the seed is fixed at 5.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    original_df['target'],
    test_size=0.25,
    random_state=5,
)

In [69]:
# Train a multinomial Naive Bayes classifier on the bag-of-words counts
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()  # constructed with no arguments, i.e. library defaults
MNB.fit(X_train, Y_train)

In [70]:
from sklearn import metrics

# Predict labels for the held-out rows and report the fraction that match.
predicted = MNB.predict(X_test)
# scikit-learn's signature is accuracy_score(y_true, y_pred): pass the ground truth
# first, the same order the precision/recall/F1 cell uses.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.7682475


In [71]:
# Precision, recall and F1 on the held-out split. average='weighted' averages the
# per-class scores weighted by each class's support; weighted recall therefore
# equals overall accuracy.
precision = metrics.precision_score(Y_test, predicted, average='weighted')
print("Precision:", precision)

recall = metrics.recall_score(Y_test, predicted, average='weighted')
print("Recall:", recall)

f1_score = metrics.f1_score(Y_test, predicted, average='weighted')
print("F1 Score:", f1_score)

Precision: 0.769103838786314
Recall: 0.7682475
F1 Score: 0.7680685679646639


## TextBlob sentiment analysis

In [30]:
# Define a function to perform sentiment analysis using TextBlob and convert to labels
def analyze_sentiment_blob(text, threshold=0.2):
    """Label a piece of text as positive, negative or neutral using TextBlob.

    Parameters
    ----------
    text : str
        The text to analyse; it is passed directly to ``TextBlob``.
    threshold : float, default 0.2
        Half-width of the neutral band around zero. A polarity strictly above
        ``threshold`` is 'positive', strictly below ``-threshold`` is
        'negative', and anything else (including exactly ``±threshold``) is
        'neutral'.

    Returns
    -------
    str
        One of 'positive', 'negative' or 'neutral'.
    """
    sentiment_score = TextBlob(text).sentiment.polarity
    if sentiment_score > threshold:
        return 'positive'
    if sentiment_score < -threshold:
        return 'negative'
    # Inside the closed band [-threshold, threshold].
    return 'neutral'

# Apply the function to the 'text' column and create a new column 'sentiment_label'.
# The column has to be called 'sentiment_label': the value_counts cell and the
# train/test split cell further down both read it under that name, and would
# raise a KeyError on a fresh run if it were stored under any other name.
text_blob_df['sentiment_label'] = text_blob_df['text'].apply(analyze_sentiment_blob)
text_blob_df.head()

Unnamed: 0,id,user,date,text,sentiment_blob,sentiment_label
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,0.0,neutral
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0.5,positive
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,0.2,neutral
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",-0.625,negative
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0.2,neutral


In [33]:
# Number of tweets per value of the 'sentiment_label' column.
text_blob_df.value_counts('sentiment_label')

sentiment_label
neutral     905103
positive    492753
negative    202143
dtype: int64

In [34]:
# Pre-process the data using a bag-of-words (CountVectorizer) model.
# Rebinds `cv` and `text_counts`, replacing the objects created for the baseline run.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(text_blob_df['text'])

In [37]:
# Split the count matrix and the 'sentiment_label' column into training and
# testing sets: 25% of the rows are held out and the seed is fixed at 5.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    text_blob_df['sentiment_label'],
    test_size=0.25,
    random_state=5,
)

In [38]:
# Train a fresh multinomial Naive Bayes model on the current split.
# Rebinds `MNB`, replacing the previously fitted model.
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [39]:
# Predict a label for every row of the held-out test matrix.
predicted = MNB.predict(X_test)


In [40]:
# Calculate accuracy.
# scikit-learn's signature is accuracy_score(y_true, y_pred): pass the ground truth
# first, the same order the precision/recall/F1 cell uses.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.755655


In [41]:
# Precision, recall and F1 on the held-out split. average='weighted' averages the
# per-class scores weighted by each class's support; weighted recall therefore
# equals overall accuracy.
precision = metrics.precision_score(Y_test, predicted, average='weighted')
print("Precision:", precision)

recall = metrics.recall_score(Y_test, predicted, average='weighted')
print("Recall:", recall)

f1_score = metrics.f1_score(Y_test, predicted, average='weighted')
print("F1 Score:", f1_score)

Precision: 0.7634987820490563
Recall: 0.755655
F1 Score: 0.7378697866638233


## VADER sentiment analysis

In [56]:
# Single analyzer instance, created once and reused by analyze_sentiment_vader for every row.
sentiment = SentimentIntensityAnalyzer()
#create a function to apply to our data frame
def analyze_sentiment_vader(text):
    """Return the 'compound' entry of the VADER polarity scores for ``text``.

    The scores come from the module-level ``sentiment`` analyzer's
    ``polarity_scores`` method.
    """
    return sentiment.polarity_scores(text)['compound']
# Apply the function to every tweet in the copied vader_sentiment_df data frame
# and store the compound score in a new 'sentiment_vader' column.
vader_sentiment_df['sentiment_vader'] = vader_sentiment_df['text'].apply(analyze_sentiment_vader)
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text,sentiment_vader
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,-0.75
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0.4939
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,-0.25
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",-0.4939
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0.0


In [57]:
# Number of tweets per distinct value of the 'sentiment_vader' column.
vader_sentiment_df.value_counts('sentiment_vader')

sentiment_vader
 0.0000    417713
 0.4404     30596
 0.3612     20066
-0.2960     18511
 0.4019     17227
            ...  
 0.0977         1
 0.0981         1
 0.0984         1
 0.0988         1
 0.9987         1
Length: 18266, dtype: int64

In [58]:
 #Convert Sentiment Scores to Categorical Labels
def convert_sentiment_vader(score, threshold=0.2):
    """Map a numeric sentiment score to a categorical label.

    Parameters
    ----------
    score : float
        The score to classify (the notebook passes values from the
        'sentiment_vader' column).
    threshold : float, default 0.2
        Half-width of the neutral band around zero. A score strictly above
        ``threshold`` is 'positive', strictly below ``-threshold`` is
        'negative', and anything else (including exactly ``±threshold``) is
        'neutral'.

    Returns
    -------
    str
        One of 'positive', 'negative' or 'neutral'.
    """
    if score > threshold:
        return 'positive'
    if score < -threshold:
        return 'negative'
    # Inside the closed band [-threshold, threshold]. NaN also ends up here,
    # because both comparisons above evaluate to False for NaN.
    return 'neutral'

# Apply the function to create a new column 'vader_sentiment_label'
vader_sentiment_df['vader_sentiment_label'] = vader_sentiment_df['sentiment_vader'].apply(convert_sentiment_vader)
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text,sentiment_vader,vader_sentiment_label
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,-0.75,negative
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0.4939,positive
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,-0.25,negative
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",-0.4939,negative
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0.0,neutral


In [59]:
# NOTE(review): repeats the preview already displayed at the end of the previous cell.
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text,sentiment_vader,vader_sentiment_label
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,-0.75,negative
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0.4939,positive
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,-0.25,negative
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",-0.4939,negative
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0.0,neutral


In [60]:
# Pre-process the data using a bag-of-words (CountVectorizer) model.
# Rebinds `cv` and `text_counts`, replacing the objects created by the earlier runs.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(vader_sentiment_df['text'])

In [61]:
# Split the count matrix and the 'vader_sentiment_label' column into training and
# testing sets: 25% of the rows are held out and the seed is fixed at 5.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_counts,
    vader_sentiment_df['vader_sentiment_label'],
    test_size=0.25,
    random_state=5,
)

In [62]:
# Train a fresh multinomial Naive Bayes model on the current split.
# Rebinds `MNB`, replacing the previously fitted model.
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [63]:
# Predict a label for every row of the held-out test matrix.
predicted = MNB.predict(X_test)

In [64]:
# Calculate accuracy.
# scikit-learn's signature is accuracy_score(y_true, y_pred): pass the ground truth
# first, the same order the precision/recall/F1 cell uses.
accuracy_score = metrics.accuracy_score(Y_test, predicted)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.74526


In [65]:
# Precision, recall and F1 on the held-out split. average='weighted' averages the
# per-class scores weighted by each class's support; weighted recall therefore
# equals overall accuracy.
precision = metrics.precision_score(Y_test, predicted, average='weighted')
print("Precision:", precision)

recall = metrics.recall_score(Y_test, predicted, average='weighted')
print("Recall:", recall)

f1_score = metrics.f1_score(Y_test, predicted, average='weighted')
print("F1 Score:", f1_score)

Precision: 0.7472790617727212
Recall: 0.74526
F1 Score: 0.7392229063283018
