# PHYS 243 Final Project

In [1]:
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

## 1-Neural Networks

## 2-Twitter Sentiment Analysis Dataset
- Task: Classify sentiment (positive, negative, neutral)

In [2]:
train_df = pd.read_csv('/content/twitter_training.csv')
test_df = pd.read_csv('/content/twitter_validation.csv')

### Data Cleaning/ Preprocessing

In [3]:
train_df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [4]:
test_df.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp I’ve had Madeleine McCann in my cel...


In [5]:
train_df.duplicated().sum()

2700

In [6]:
test_df.duplicated().sum()

0

In [7]:
train_df.isnull().sum()

Unnamed: 0,0
2401,0
Borderlands,0
Positive,0
"im getting on borderlands and i will murder you all ,",686


In [8]:
test_df.isnull().sum()

Unnamed: 0,0
3364,0
Facebook,0
Irrelevant,0
"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣",0


In [9]:
train_df.isna().sum()

Unnamed: 0,0
2401,0
Borderlands,0
Positive,0
"im getting on borderlands and i will murder you all ,",686


In [10]:
test_df.isna().sum()

Unnamed: 0,0
3364,0
Facebook,0
Irrelevant,0
"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣",0


##### Add new column names to both training and testing datasets

In [11]:
# Set new column names for both training and validation datasets
column_names = ["Tweet_ID", "Entity", "Sentiment", "Tweet_Content"]

train_df.columns = column_names
test_df.columns = column_names

In [12]:
train_df.head(2)

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


In [13]:
test_df.head(2)

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...


In [14]:
# filled NA rows under the Tweet_Content column as empty strings
train_df['Tweet_Content'].fillna('', inplace=True)
train_df.isna().sum()

Unnamed: 0,0
Tweet_ID,0
Entity,0
Sentiment,0
Tweet_Content,0


In [15]:
train_df.shape

(74681, 4)

In [16]:
test_df.shape

(999, 4)

Issue: Both testing and validation datasets are missing a column header row. Additionally, the training dataset has 2700 duplicated rows

##### Removed duplicates from training dataset
- Note: Although the "Tweet_ID" column does not have unique values per row, it is not relevant in modeling since we often leave out unique identifiers out (left as is)

In [17]:
train_df.duplicated().sum()

2700

In [18]:
train_df.drop_duplicates(inplace=True)

In [19]:
train_df.duplicated().sum()

0

In [20]:
train_df.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [21]:
train_df.shape

(71981, 4)

##### Change "Irrelevant" to "Neutral" for the Sentiment column

In [22]:
train_df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Negative,21787
Positive,19810
Neutral,17800
Irrelevant,12584


In [23]:
test_df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,285
Positive,277
Negative,266
Irrelevant,171


In [24]:
train_df.loc[train_df["Sentiment"] == "Irrelevant", "Sentiment"] = "Neutral"

In [25]:
train_df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,30384
Negative,21787
Positive,19810


In [26]:
test_df.loc[test_df["Sentiment"] == "Irrelevant", "Sentiment"] = "Neutral"

In [27]:
test_df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,456
Positive,277
Negative,266


##### Text cleaning and Tokenization (spaCy , TextBlob)
- Tokenization
- Lowercasing & removing punctuation
- Removing stop words (the, is, I, etc.)
- Removing Non-Alphanumeric Characters (removing numbers and special characters)
- Stemming and Lemmatization

In [28]:
train_df["Tweet_Content"]

Unnamed: 0,Tweet_Content
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,Just realized that the Windows partition of my...
74677,Just realized that my Mac window partition is ...
74678,Just realized the windows partition of my Mac ...
74679,Just realized between the windows partition of...


In [29]:
print(train_df["Tweet_Content"].info())
print(test_df["Tweet_Content"].info())

<class 'pandas.core.series.Series'>
Index: 71981 entries, 0 to 74680
Series name: Tweet_Content
Non-Null Count  Dtype 
--------------  ----- 
71981 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 999 entries, 0 to 998
Series name: Tweet_Content
Non-Null Count  Dtype 
--------------  ----- 
999 non-null    object
dtypes: object(1)
memory usage: 7.9+ KB
None


In [30]:
# Set datatype to string
train_df["Tweet_Content"].astype(str)
test_df["Tweet_Content"].astype(str)

Unnamed: 0,Tweet_Content
0,BBC News - Amazon boss Jeff Bezos rejects clai...
1,@Microsoft Why do I pay for WORD when it funct...
2,"CSGO matchmaking is so full of closet hacking,..."
3,Now the President is slapping Americans in the...
4,Hi @EAHelp I’ve had Madeleine McCann in my cel...
...,...
994,⭐️ Toronto is the arts and culture capital of ...
995,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
996,Today sucked so it’s time to drink wine n play...
997,Bought a fraction of Microsoft today. Small wins.


In [31]:
# Convert all text in lowercase
train_df["Tweet_Content"] = train_df["Tweet_Content"].str.lower()
test_df["Tweet_Content"] = test_df["Tweet_Content"].str.lower()

In [32]:
train_df["Tweet_Content"].head()

Unnamed: 0,Tweet_Content
0,i am coming to the borders and i will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...


In [33]:
test_df["Tweet_Content"].head()

Unnamed: 0,Tweet_Content
0,bbc news - amazon boss jeff bezos rejects clai...
1,@microsoft why do i pay for word when it funct...
2,"csgo matchmaking is so full of closet hacking,..."
3,now the president is slapping americans in the...
4,hi @eahelp i’ve had madeleine mccann in my cel...


In [34]:
"""
# Tokenization (test and train datasets)
import spacy
nlp = spacy.load("en_core_web_sm")

train_df["Tweet_Content"] = train_df["Tweet_Content"].apply(lambda content: [token.text for token in nlp(content)])
test_df["Tweet_Content"] = test_df["Tweet_Content"].apply(lambda content: [token.text for token in nlp(content)]) """

'\n# Tokenization (test and train datasets)\nimport spacy\nnlp = spacy.load("en_core_web_sm")\n\ntrain_df["Tweet_Content"] = train_df["Tweet_Content"].apply(lambda content: [token.text for token in nlp(content)])\ntest_df["Tweet_Content"] = test_df["Tweet_Content"].apply(lambda content: [token.text for token in nlp(content)]) '

In [35]:
train_df["Tweet_Content"]

Unnamed: 0,Tweet_Content
0,i am coming to the borders and i will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...
...,...
74676,just realized that the windows partition of my...
74677,just realized that my mac window partition is ...
74678,just realized the windows partition of my mac ...
74679,just realized between the windows partition of...


In [36]:
test_df["Tweet_Content"]

Unnamed: 0,Tweet_Content
0,bbc news - amazon boss jeff bezos rejects clai...
1,@microsoft why do i pay for word when it funct...
2,"csgo matchmaking is so full of closet hacking,..."
3,now the president is slapping americans in the...
4,hi @eahelp i’ve had madeleine mccann in my cel...
...,...
994,⭐️ toronto is the arts and culture capital of ...
995,this is actually a good move tot bring more vi...
996,today sucked so it’s time to drink wine n play...
997,bought a fraction of microsoft today. small wins.


In [37]:
# Text vectorization (bag of words)
# Tokenize, vectorize, and remove stop words from the dataset
# Create a new vectorized dataset

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')

x = vectorizer.fit_transform(train_df["Tweet_Content"])
dense_array_train = x.toarray()
feature_names = vectorizer.get_feature_names_out()

train_vectorized = pd.DataFrame(dense_array_train, columns=feature_names)

# Perform the same operations on the testing dataset
y = vectorizer.transform(test_df["Tweet_Content"])
dense_array_test = y.toarray()
feature_names = vectorizer.get_feature_names_out()

test_vectorized = pd.DataFrame(dense_array_test, columns=feature_names)

In [38]:
train_vectorized.head()

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,اللعبه,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
test_vectorized.head()

Unnamed: 0,00,000,00011,00014,00015,00016,00054,00105,00107,00303,...,اللعبه,حبيت,خلاص,عبر,فيديو,٥υ,घरच,การออกอากาศของฉ,นจาก,ℐℓ٥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Concat vectorized training dataframe with original dataset

train_df = pd.concat([train_df[["Entity", "Sentiment"]], train_vectorized], axis=1)
test_df = pd.concat([test_df[["Entity", "Sentiment"]], test_vectorized], axis=1)


In [41]:
train_df.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [42]:
test_df.head()

Unnamed: 0,Tweet_ID,Entity,Sentiment,Tweet_Content
0,352,Amazon,Neutral,bbc news - amazon boss jeff bezos rejects clai...
1,8312,Microsoft,Negative,@microsoft why do i pay for word when it funct...
2,4371,CS-GO,Negative,"csgo matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,now the president is slapping americans in the...
4,6273,FIFA,Negative,hi @eahelp i’ve had madeleine mccann in my cel...


### Data Analysis

In [43]:
import matplotlib.pyplot as plt
import seaborn as sns

##### Probability Distribution of Sentiments

In [44]:
train_df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Neutral,30384
Negative,21787
Positive,19810


### Model Training and Testing
- Logistic Regression, Naive Bayes, SVM, Random Forest