In [1]:
import pandas as pd
from textblob import TextBlob
from pathlib import Path
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Read in the data frame

In [2]:
#read in file
file_path = Path("../data/cleaned_data/cleaned_training_tweets.csv")

df = pd.read_csv(file_path, encoding='ISO-8859-1')
df.head()

Unnamed: 0,target,cleaned_text
0,0,upset update facebook texting might cry result...
1,0,dived many time managed save rest go bound
2,0,whole body feel itchy like fire
3,0,behaving see
4,0,whole crew


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 2 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   target        1599999 non-null  int64 
 1   cleaned_text  1563712 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [27]:
# Pre-proccess data.
import nltk
# Pre-Processing the text
def cleaning(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word.isalpha()])
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]
    return ' '.join(lemmatized_text)

df['cleaned_text'] = df['text'].apply(cleaning)


In [29]:
df = df.drop(columns=['text'])

In [30]:
cleaned_file_path = '../data/cleaned_data/cleaned_training_tweets.csv'
df.to_csv(cleaned_file_path, index=False)

## Copy the DataFrame to test both TextBlob and VaderSentiment

In [4]:
original_df = df.copy()

In [5]:
text_blob_df = df.copy()

In [6]:
#drop the target column
text_blob_df = text_blob_df.drop(columns=['target'])
text_blob_df.head()

Unnamed: 0,cleaned_text
0,upset update facebook texting might cry result...
1,dived many time managed save rest go bound
2,whole body feel itchy like fire
3,behaving see
4,whole crew


In [7]:
vader_sentiment_df = df.copy()

In [8]:
#drop the target column
vader_sentiment_df = vader_sentiment_df.drop(columns=['target'])
vader_sentiment_df.head()

Unnamed: 0,cleaned_text
0,upset update facebook texting might cry result...
1,dived many time managed save rest go bound
2,whole body feel itchy like fire
3,behaving see
4,whole crew


## Run original data set through Bag of Words Vectorization Based Model

In [38]:
#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(original_df['cleaned_text'])

In [39]:
#Splitting the data into trainig and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, original_df['target'], test_size=0.25, random_state=5)

In [40]:
#Training the model
from sklearn.naive_bayes import MultinomialNB

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [42]:
from sklearn import metrics

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.7342025


In [43]:
# Calculate precision, recall, and F1-score
precision = metrics.precision_score(Y_test, predicted, average='weighted')
recall = metrics.recall_score(Y_test, predicted, average='weighted')
f1_score = metrics.f1_score(Y_test, predicted, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.7345913371141636
Recall: 0.7342025
F1 Score: 0.7340973573064117


## TextBlob Sentiment analysis

In [11]:
# Define a function to perform sentiment analysis using TextBlob and convert to labels
def analyze_sentiment_blob(text):
    if isinstance(text, float):
        return 'neutral'
    analysis = TextBlob(str(text))
    return analysis.sentiment.polarity



# Apply the function to the 'text' column and create a new column 'sentiment_label'
text_blob_df['textblob_sentiment'] = text_blob_df['cleaned_text'].apply(analyze_sentiment_blob)
text_blob_df.head()

Unnamed: 0,cleaned_text,textblob_sentiment
0,upset update facebook texting might cry result...,0.0
1,dived many time managed save rest go bound,0.5
2,whole body feel itchy like fire,0.2
3,behaving see,0.0
4,whole crew,0.2


In [13]:
# Define a function to perform sentiment analysis using TextBlob and convert to labels
def analyze_sentiment_blob(text):
    if isinstance(text, float):
        return 'neutral'
    analysis = TextBlob(str(text))
    polarity = analysis.sentiment.polarity
    if polarity > 0.1:
        return 'positive'
    elif polarity < -0.1:
        return 'negative'
    else:
        return 'neutral'

# Apply the function to the 'cleaned_text' column and create a new column 'textblob_sentiment'
text_blob_df['textblob_sentiment_lables'] = text_blob_df['cleaned_text'].apply(analyze_sentiment_blob)

text_blob_df.head()

Unnamed: 0,cleaned_text,textblob_sentiment,textblob_sentiment_lables
0,upset update facebook texting might cry result...,0.0,neutral
1,dived many time managed save rest go bound,0.5,positive
2,whole body feel itchy like fire,0.2,positive
3,behaving see,0.0,neutral
4,whole crew,0.2,positive


In [15]:
text_blob_df.value_counts('textblob_sentiment_lables')

textblob_sentiment_lables
neutral     825566
positive    546179
negative    228254
dtype: int64

In [16]:
# specify where to place cleaned data and what to call it
cleaned_file_path = '../data/cleaned_data/text_blob_sentiment.csv'

#save data using the cleaned file path.
text_blob_df.to_csv(cleaned_file_path, index=False)

In [48]:
#Pre-Process the data using bag of words Vectorization-Based Model
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(text_blob_df['cleaned_text'])

In [50]:
#Splitting the data into trainig and testing
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, text_blob_df['textblob_sentiment'], test_size=0.25, random_state=5)

In [None]:
#train the model
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [39]:
#make predictions
predicted = MNB.predict(X_test)


In [40]:
# Calculate accuracy
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.755655


In [41]:
# Calculate precision, recall, and F1-score
precision = metrics.precision_score(Y_test, predicted, average='weighted')
recall = metrics.recall_score(Y_test, predicted, average='weighted')
f1_score = metrics.f1_score(Y_test, predicted, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.7634987820490563
Recall: 0.755655
F1 Score: 0.7378697866638233


## VaderSentiment analysis

In [53]:
sentiment = SentimentIntensityAnalyzer()
#create a function to apply to our data frame
def analyze_sentiment_vader (text):
    sentiment_score = sentiment.polarity_scores(text)
    return sentiment_score['compound']
#apply the created function to the copied vader_sentiment_df data frame    
vader_sentiment_df['sentiment_vader'] = vader_sentiment_df['cleaned_text'].apply(analyze_sentiment_vader)
vader_sentiment_df.head()

Unnamed: 0,cleaned_text,sentiment_vader
0,upset update facebook texting might cry result...,-0.6908
1,dived many time managed save rest go bound,0.4939
2,whole body feel itchy like fire,-0.25
3,behaving see,0.0
4,whole crew,0.0


In [54]:
vader_sentiment_df.value_counts('sentiment_vader')

sentiment_vader
0.0000    556922
0.4404     59640
0.3612     41160
0.6369     32634
0.4019     31749
           ...  
0.0348         1
0.0350         1
0.0358         1
0.0367         1
0.9985         1
Length: 8480, dtype: int64

In [56]:
df[:10]

Unnamed: 0,target,cleaned_text
0,0,upset update facebook texting might cry result...
1,0,dived many time managed save rest go bound
2,0,whole body feel itchy like fire
3,0,behaving see
4,0,whole crew
5,0,need hug
6,0,hey long time rain bit bit lol fine thanks
7,0,nope
8,0,que muera
9,0,spring break plain snowing


In [59]:
for index, rows in df[:10].iterrows():
    print(rows[1])
    


upset update facebook texting might cry result school today
dived many time managed save rest go bound
whole body feel itchy like fire
behaving see
whole crew
need hug
hey long time rain bit bit lol fine thanks
nope
que muera
spring break plain snowing


In [58]:
 #Convert Sentiment Scores to Categorical Labels
def convert_sentiment_vader(score):
    if score > 0.2:
        return 'positive'
    elif score < -0.2:
        return 'negative'
    else:
        return 'neutral'

# Apply the function to create a new column 'sentiment_label'
vader_sentiment_df['vader_sentiment_label'] = vader_sentiment_df['sentiment_vader'].apply(convert_sentiment_vader)
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text,sentiment_vader,vader_sentiment_label
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,-0.75,negative
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,0.4939,positive
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,-0.25,negative
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",-0.4939,negative
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,0.0,neutral


In [77]:
vader_sentiment_df = vader_sentiment_df.drop(columns=['sentiment_vader'])
vader_sentiment_df.head()

Unnamed: 0,id,user,date,text,vader_sentiment_label
0,1467810672,scotthamilton,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...,negative
1,1467810917,mattycus,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...,positive
2,1467811184,ElleCTF,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire,negative
3,1467811193,Karoli,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all....",negative
4,1467811372,joy_wolf,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew,neutral


In [78]:
#Pre-Process the data using bag of words Vectorization-Based Model
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
text_counts = cv.fit_transform(vader_sentiment_df['text'])

In [79]:
#Splitting the data into trainig and testing
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, vader_sentiment_df['vader_sentiment_label'], test_size=0.25, random_state=5)

In [80]:
#train the model
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [81]:
#make predictions
predicted = MNB.predict(X_test)

In [82]:
# Calculate accuracy
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracy Score: ", accuracy_score)

Accuracy Score:  0.74526


In [83]:
# Calculate precision, recall, and F1-score
precision = metrics.precision_score(Y_test, predicted, average='weighted')
recall = metrics.recall_score(Y_test, predicted, average='weighted')
f1_score = metrics.f1_score(Y_test, predicted, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

Precision: 0.7472790617727212
Recall: 0.74526
F1 Score: 0.7392229063283018
