# Load the dataset

In [1]:
import pandas as pd

# Read the CSV into a DataFrame, then show its first rows together with
# its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='dataset_Vaccine_Pfizer.csv')
preview, dimensions = df.head(), df.shape
preview, dimensions

(   id                                               Text  Subjectivity  \
 0   0  Historically there have been barriers in the h...      0.450000   
 1   1  Honored and PfizerProud to see the trust that ...      0.066667   
 2   2  COVID19 illuminated the need to improve equity...      0.000000   
 3   3  Today we published a landmark study on diversi...      0.000000   
 4   4  They volunteered🤚 to help change the world🌎 . ...      0.300000   
 
    Polarity    Target  
 0      0.35  Positive  
 1      0.00   Neutral  
 2      0.00   Neutral  
 3      0.00   Neutral  
 4      0.20  Positive  ,
 (1129, 5))

# EDA

In [2]:
# Character count of every 'Text' entry, stored as a new 'length' column.
text_lengths = df['Text'].map(len)
df['length'] = text_lengths
df['length']

0       104
1       116
2       109
3       115
4       116
       ... 
1124    115
1125    115
1126    117
1127    114
1128     74
Name: length, Length: 1129, dtype: int64

In [3]:
# Number of rows per sentiment class.
target_counts = df['Target'].value_counts()
target_counts

Positive    549
Neutral     483
Negative     96
Name: Target, dtype: int64

In [4]:
# Count of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

id              0
Text            0
Subjectivity    1
Polarity        1
Target          1
length          0
dtype: int64

In [5]:
# Drop rows that have no 'Target' label; they cannot be used for supervised
# training. `subset` is given as a list because a bare string is only
# accepted by newer pandas releases. The explicit copy makes the result an
# independent frame, so the column assignments in later cells never write
# into a slice of the original data.
df = df.dropna(subset=['Target']).copy()
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116
...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117


In [6]:
# Bucket the rows by their class label
grouped_data = df.groupby('Target')

# For every class, print its label followed by the first few texts so the
# reader gets a feel for what each class contains
for target, class_rows in grouped_data:
    sample_texts = class_rows['Text'].head()
    print(f"Class: {target}", sample_texts, "\n", sep="\n")

Class: Negative
6      Since 1971  has supported the mental health of...
26     Tick-borne encephalitis (TBE) is a potentially...
67     In this episode of Behind the Sc(i)en(c)e, dis...
74     In this episode of Behind the Sc(i)en(c)e, one...
103    Cases of kidneycancer are increasing among all...
Name: Text, dtype: object


Class: Neutral
1    Honored and PfizerProud to see the trust that ...
2    COVID19 illuminated the need to improve equity...
3    Today we published a landmark study on diversi...
7    We are resolute in support of the  TimeToActCa...
9    Our novel prediction model, derived from machi...
Name: Text, dtype: object


Class: Positive
0     Historically there have been barriers in the h...
4     They volunteered🤚 to help change the world🌎 . ...
5     Bladder cancer is most common in patients over...
8     At Pfizer, we believe partnership is key to ad...
15    What progress looks like: &gt;1 million less c...
Name: Text, dtype: object




# Preprocessing the data

In [7]:
# Lower-case the text so that differently-cased spellings of the same word
# (e.g. "Pfizer" and "pfizer") are treated as one token downstream.

lowercased = df['Text'].str.lower()
df['Cleaned_Text'] = lowercased
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically there have been barriers in the h...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored and pfizerproud to see the trust that ...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated the need to improve equity...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today we published a landmark study on diversi...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,they volunteered🤚 to help change the world🌎 . ...
...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,"in honor of worldarthritisday, we’re highlight..."
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living with rheumatoidarthritis and pso...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,when it comes to working with your doctor to m...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day of the girl; a worldwide revolution to ins...


In [8]:
# Removing Emojis using the emoji library as they do not contribute to the analysis
import emoji

def _strip_emoji(text):
    """Return `text` with every emoji replaced by the empty string."""
    return emoji.replace_emoji(text, replace='')

df['Cleaned_Text'] = df['Cleaned_Text'].map(_strip_emoji)
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically there have been barriers in the h...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored and pfizerproud to see the trust that ...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated the need to improve equity...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today we published a landmark study on diversi...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,they volunteered to help change the world . we...
...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,"in honor of worldarthritisday, we’re highlight..."
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living with rheumatoidarthritis and pso...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,when it comes to working with your doctor to m...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day of the girl; a worldwide revolution to ins...


In [9]:
# Removing URLs using regex as they are not useful for sentiment analysis

import re

# Delete URLs: any run of non-whitespace starting with "http", or starting
# with a literal "www.". The dot is escaped on purpose -- an unescaped "."
# matches any character (even a space), so "www" followed by arbitrary text
# would be deleted as well.
df['Cleaned_Text'] = df['Cleaned_Text'].str.replace(r'http\S+|www\.\S+', '', regex=True)
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically there have been barriers in the h...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored and pfizerproud to see the trust that ...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated the need to improve equity...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today we published a landmark study on diversi...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,they volunteered to help change the world . we...
...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,"in honor of worldarthritisday, we’re highlight..."
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living with rheumatoidarthritis and pso...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,when it comes to working with your doctor to m...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day of the girl; a worldwide revolution to ins...


In [10]:
# Removing Punctuation and Special Characters to focus only on meaningful words

import re

def clean_text(text):
    """Strip URLs, HTML entities and punctuation from a string.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        `text` with URLs, HTML entities and every character that is neither
        a word character nor whitespace removed. Whitespace is left as-is,
        so a removed span can leave consecutive spaces behind.
    """
    # Remove URLs (both the "http..." form and the bare "www." form)
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove HTML entities: named (&gt;, &amp;), decimal (&#39;) and
    # hexadecimal (&#x27;). This has to happen before punctuation is
    # stripped, otherwise "&#39;" would leave a stray "39" in the text.
    text = re.sub(r'&(?:#\d+|#x[0-9a-fA-F]+|\w+);', '', text)

    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    return text

# Run clean_text over every entry of 'Cleaned_Text'
cleaned_column = df['Cleaned_Text'].map(clean_text)
df['Cleaned_Text'] = cleaned_column
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically there have been barriers in the h...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored and pfizerproud to see the trust that ...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated the need to improve equity...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today we published a landmark study on diversi...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,they volunteered to help change the world wer...
...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,in honor of worldarthritisday were highlightin...
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living with rheumatoidarthritis and pso...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,when it comes to working with your doctor to m...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day of the girl a worldwide revolution to insp...


In [11]:
# Removing Stopwords to focus only on meaningful words.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# NOTE(review): punctuation is stripped before this step, so a contraction
# such as "you're" presumably arrives here as "youre" and no longer matches
# a stop-word entry spelled with an apostrophe ("youre" does show up among
# the TF-IDF feature names further down) -- confirm and consider filtering
# stop words before punctuation removal.
def _drop_stop_words(sentence):
    """Return `sentence` with whitespace-separated stop words removed."""
    kept = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept)

df['Cleaned_Text'] = df['Cleaned_Text'].map(_drop_stop_words)
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically barriers healthcare community fai...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored pfizerproud see trust public placed us...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated need improve equity clinic...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today published landmark study diversity clini...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,volunteered help change world sharing real kid...
...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,honor worldarthritisday highlighting people li...
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living rheumatoidarthritis psoriaticart...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,comes working doctor manage rheumatoidarthriti...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day girl worldwide revolution inspire girls ta...


In [12]:
# Tokenize the text to break text down into individual words (tokens)

from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Split every cleaned string into a list of word tokens
token_lists = df['Cleaned_Text'].map(word_tokenize)
df['Tokenized_Text'] = token_lists
df

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text,Tokenized_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically barriers healthcare community fai...,"[historically, barriers, healthcare, community..."
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored pfizerproud see trust public placed us...,"[honored, pfizerproud, see, trust, public, pla..."
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated need improve equity clinic...,"[covid19, illuminated, need, improve, equity, ..."
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today published landmark study diversity clini...,"[today, published, landmark, study, diversity,..."
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,volunteered help change world sharing real kid...,"[volunteered, help, change, world, sharing, re..."
...,...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,honor worldarthritisday highlighting people li...,"[honor, worldarthritisday, highlighting, peopl..."
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living rheumatoidarthritis psoriaticart...,"[people, living, rheumatoidarthritis, psoriati..."
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,comes working doctor manage rheumatoidarthriti...,"[comes, working, doctor, manage, rheumatoidart..."
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day girl worldwide revolution inspire girls ta...,"[day, girl, worldwide, revolution, inspire, gi..."


In [13]:
# Lemmatize the tokens to reduce words to their base or dictionary form

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def _lemmatize_tokens(tokens):
    """Return a list holding the lemma of each token, in the same order."""
    return list(map(lemmatizer.lemmatize, tokens))

df['Lemmatized_Text'] = df['Tokenized_Text'].map(_lemmatize_tokens)
df

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Shu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text,Tokenized_Text,Lemmatized_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically barriers healthcare community fai...,"[historically, barriers, healthcare, community...","[historically, barrier, healthcare, community,..."
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored pfizerproud see trust public placed us...,"[honored, pfizerproud, see, trust, public, pla...","[honored, pfizerproud, see, trust, public, pla..."
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated need improve equity clinic...,"[covid19, illuminated, need, improve, equity, ...","[covid19, illuminated, need, improve, equity, ..."
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today published landmark study diversity clini...,"[today, published, landmark, study, diversity,...","[today, published, landmark, study, diversity,..."
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,volunteered help change world sharing real kid...,"[volunteered, help, change, world, sharing, re...","[volunteered, help, change, world, sharing, re..."
...,...,...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,honor worldarthritisday highlighting people li...,"[honor, worldarthritisday, highlighting, peopl...","[honor, worldarthritisday, highlighting, peopl..."
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living rheumatoidarthritis psoriaticart...,"[people, living, rheumatoidarthritis, psoriati...","[people, living, rheumatoidarthritis, psoriati..."
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,comes working doctor manage rheumatoidarthriti...,"[comes, working, doctor, manage, rheumatoidart...","[come, working, doctor, manage, rheumatoidarth..."
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day girl worldwide revolution inspire girls ta...,"[day, girl, worldwide, revolution, inspire, gi...","[day, girl, worldwide, revolution, inspire, gi..."


In [14]:
# Re-assemble each token list into one space-separated string (optional;
# useful when the text is preferred in plain-string form)

df['Final_Text'] = df['Lemmatized_Text'].map(' '.join)
df

Unnamed: 0,id,Text,Subjectivity,Polarity,Target,length,Cleaned_Text,Tokenized_Text,Lemmatized_Text,Final_Text
0,0,Historically there have been barriers in the h...,0.450000,0.350000,Positive,104,historically barriers healthcare community fai...,"[historically, barriers, healthcare, community...","[historically, barrier, healthcare, community,...",historically barrier healthcare community fair...
1,1,Honored and PfizerProud to see the trust that ...,0.066667,0.000000,Neutral,116,honored pfizerproud see trust public placed us...,"[honored, pfizerproud, see, trust, public, pla...","[honored, pfizerproud, see, trust, public, pla...",honored pfizerproud see trust public placed u ...
2,2,COVID19 illuminated the need to improve equity...,0.000000,0.000000,Neutral,109,covid19 illuminated need improve equity clinic...,"[covid19, illuminated, need, improve, equity, ...","[covid19, illuminated, need, improve, equity, ...",covid19 illuminated need improve equity clinic...
3,3,Today we published a landmark study on diversi...,0.000000,0.000000,Neutral,115,today published landmark study diversity clini...,"[today, published, landmark, study, diversity,...","[today, published, landmark, study, diversity,...",today published landmark study diversity clini...
4,4,They volunteered🤚 to help change the world🌎 . ...,0.300000,0.200000,Positive,116,volunteered help change world sharing real kid...,"[volunteered, help, change, world, sharing, re...","[volunteered, help, change, world, sharing, re...",volunteered help change world sharing real kid...
...,...,...,...,...,...,...,...,...,...,...
1123,1124,"In honor of WorldArthritisDay, we’re highlight...",0.500000,0.136364,Positive,100,honor worldarthritisday highlighting people li...,"[honor, worldarthritisday, highlighting, peopl...","[honor, worldarthritisday, highlighting, peopl...",honor worldarthritisday highlighting people li...
1124,1125,People living with rheumatoidarthritis and pso...,0.650000,0.200000,Positive,115,people living rheumatoidarthritis psoriaticart...,"[people, living, rheumatoidarthritis, psoriati...","[people, living, rheumatoidarthritis, psoriati...",people living rheumatoidarthritis psoriaticart...
1125,1126,When it comes to working with your doctor to m...,0.000000,0.000000,Neutral,115,comes working doctor manage rheumatoidarthriti...,"[comes, working, doctor, manage, rheumatoidart...","[come, working, doctor, manage, rheumatoidarth...",come working doctor manage rheumatoidarthritis...
1126,1127,Day of the Girl; a worldwide revolution to ins...,0.500000,0.000000,Neutral,117,day girl worldwide revolution inspire girls ta...,"[day, girl, worldwide, revolution, inspire, gi...","[day, girl, worldwide, revolution, inspire, gi...",day girl worldwide revolution inspire girl tak...


# Feature extraction using TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [16]:
# Configure the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',  # drop English stop words in case any remain
    ngram_range=(1, 2),    # use both unigrams and bigrams
    max_features=1000,     # upper bound on the number of features
    min_df=5,              # a term must occur in at least 5 documents
    max_df=0.8,            # skip terms found in more than 80% of documents
)

In [17]:
# Fit the vectorizer on the cleaned 'Final_Text' column and vectorize it.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split further down, so the test documents also take part in
# the fit -- consider fitting on the training rows only.
corpus = df['Final_Text']
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

In [18]:
# Turn the TF-IDF matrix into a dense DataFrame whose columns are named
# after the vectorizer's feature names
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vocabulary_terms)

# Show the resulting DataFrame
tfidf_df

Unnamed: 0,10,12,17,17th,20,2019,2020,20202021,2021,23,...,working,world,worldarthritisday,worldwide,yasmeen,yasmeen agosti,year,young,young chief,youre
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.352701,0.000000,0.000000,0.0,0.0,0.350694,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.491891,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.380383,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0
1126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.562954,0.0,0.0,0.000000,0.0,0.0,0.0


In [19]:
# Put the TF-IDF features side by side with Subjectivity, Polarity and
# Target from 'df'. The index of 'df' is reset first so its rows line up
# positionally with tfidf_df, which was built with a fresh default index.
extra_columns = df[['Subjectivity', 'Polarity', 'Target']].reset_index(drop=True)
df_combined = pd.concat(objs=[tfidf_df, extra_columns], axis=1)

# Show the combined DataFrame
df_combined

Unnamed: 0,10,12,17,17th,20,2019,2020,20202021,2021,23,...,worldwide,yasmeen,yasmeen agosti,year,young,young chief,youre,Subjectivity,Polarity,Target
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.450000,0.350000,Positive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.066667,0.000000,Neutral
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,Neutral
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,Neutral
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.350694,0.0,0.0,0.0,0.300000,0.200000,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.500000,0.136364,Positive
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.650000,0.200000,Positive
1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,Neutral
1126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.562954,0.0,0.0,0.000000,0.0,0.0,0.0,0.500000,0.000000,Neutral


# Model building using Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [21]:
# Separate the label column (y) from the feature columns (X) of df_combined.
# NOTE(review): X still contains 'Polarity' and 'Subjectivity'. In the rows
# displayed earlier, Target appears to follow the sign of Polarity; if the
# labels were derived from Polarity this is target leakage and would
# inflate the scores -- confirm how Target was produced.
y = df_combined['Target']
X = df_combined.drop(columns=['Target'])
X.head(), y.head()

(    10   12   17  17th   20  2019  2020  20202021  2021   23  ...  \
 0  0.0  0.0  0.0   0.0  0.0   0.0   0.0       0.0   0.0  0.0  ...   
 1  0.0  0.0  0.0   0.0  0.0   0.0   0.0       0.0   0.0  0.0  ...   
 2  0.0  0.0  0.0   0.0  0.0   0.0   0.0       0.0   0.0  0.0  ...   
 3  0.0  0.0  0.0   0.0  0.0   0.0   0.0       0.0   0.0  0.0  ...   
 4  0.0  0.0  0.0   0.0  0.0   0.0   0.0       0.0   0.0  0.0  ...   
 
    worldarthritisday  worldwide  yasmeen  yasmeen agosti      year  young  \
 0                0.0        0.0      0.0             0.0  0.000000    0.0   
 1                0.0        0.0      0.0             0.0  0.000000    0.0   
 2                0.0        0.0      0.0             0.0  0.000000    0.0   
 3                0.0        0.0      0.0             0.0  0.000000    0.0   
 4                0.0        0.0      0.0             0.0  0.350694    0.0   
 
    young chief  youre  Subjectivity  Polarity  
 0          0.0    0.0      0.450000      0.35  
 1        

In [22]:
# Hold out 30% of the rows for testing; stratify on y and fix the random
# seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# Report the dimensions of each partition
for caption, features in (("Training set shape:", X_train), ("Testing set shape:", X_test)):
    print(caption, features.shape)

Training set shape: (789, 612)
Testing set shape: (339, 612)


In [23]:
# Build the Random Forest classifier: 100 trees, fixed seed
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [24]:
# Train the classifier on the training partition
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

# Obtaining predictions and model evaluation

In [25]:
# Predict a label for every row of the test partition
y_pred = rf_model.predict(X=X_test)

In [26]:
# Compute the evaluation metrics first, then print them in one place
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

print("\nClassification Report:")
print(report)

print("\nAccuracy Score:", accuracy)

Confusion Matrix:
[[ 23   6   0]
 [  0 145   0]
 [  0   1 164]]

Classification Report:
              precision    recall  f1-score   support

    Negative       1.00      0.79      0.88        29
     Neutral       0.95      1.00      0.98       145
    Positive       1.00      0.99      1.00       165

    accuracy                           0.98       339
   macro avg       0.98      0.93      0.95       339
weighted avg       0.98      0.98      0.98       339


Accuracy Score: 0.9793510324483776
