In [1]:
# Import necessary libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
tweets = pd.read_csv("vaccination_tweets.csv")

In [3]:
messages = pd.read_csv("vaccination_tweets.csv", usecols = [8,10])

In [15]:
messages

Unnamed: 0,user_verified,text,target
0,False,Same folks said daikon paste could treat a cyt...,0
1,False,While the world has been on the wrong side of ...,0
2,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,0
3,True,"Facts are immutable, Senator, even when you're...",1
4,False,Explain to me again why we need a vaccine @Bor...,0
...,...,...,...
7140,False,The most recent vaccine data from Israel: #Pfi...,0
7141,False,So ist es‼️\n#impfschaden #Impftote #PfizerBio...,0
7142,False,One shot down! #CovidVaccine #PfizerBioNTech h...,0
7143,False,Stopping #vaccine deliveries to #Azerbaijan is...,0


In [4]:
LE = LabelEncoder()
messages['target'] = LE.fit_transform(messages["user_verified"])

In [5]:
X = messages['text']
y = messages['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [8]:
vectorizer = TfidfVectorizer(stop_words = stopwords_list)

In [9]:
tf_idf_train = vectorizer.fit_transform(X_train)
tf_idf_test = vectorizer.transform(X_test)

In [10]:
tf_idf_test

<1787x15196 sparse matrix of type '<class 'numpy.float64'>'
	with 19012 stored elements in Compressed Sparse Row format>

In [11]:
tf_idf_test_df = pd.DataFrame(tf_idf_test.toarray(), columns=vectorizer.vocabulary_.keys())
first_doc = tf_idf_test_df.loc[0]
first_doc.idxmax(axis=1)

'tropical'

In [12]:
first_doc['kill']

0.0

In [13]:
tf_idf_test_df

Unnamed: 0,80,amp,81,oldies,2nd,pfizerbiontech,covid19,vaccine,weekend,thanks,...,sleepy,nightsh,2c5lqgydfe,helennewland,rlepedvotj,diminish,9vb1k3vmtv,2scbz6xduw,vqyxdpr38z,l7usgbrxr3
0,0.0,0.231087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1784,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1785,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
stop_words=set(stopwords.words("english"))
print(stop_words)

{'me', 'during', 'here', 'ourselves', 've', "she's", 'below', 'he', 'hers', 'she', 'be', 'there', 'from', "shouldn't", 'having', 'yourself', 'only', "that'll", 'hadn', "couldn't", 'hasn', 'mightn', 'by', "aren't", 'they', 'above', 'when', 'won', 'not', 'few', 'y', 'where', 'himself', "hadn't", 'o', 'ours', 'more', 'needn', 'in', "you'd", 'such', 'for', 'these', "shan't", 'him', 's', "you've", 'her', 'ma', 'being', 'as', 'theirs', 'through', 'after', 'on', 'any', 'no', 'we', 'my', 'am', 'had', "wouldn't", 're', "wasn't", 'why', 'very', 'weren', "don't", 'aren', "should've", 'each', 'or', 'who', 'own', 'just', 'shan', 'that', 'yourselves', 'how', 'your', 'off', 'i', 'up', 'a', 'if', 'you', 'did', "won't", 'didn', 'myself', 'been', 'couldn', 'do', 'most', 'can', 'and', 'themselves', 'yours', 't', 'd', 'their', 'don', 'has', "it's", 'both', 'wouldn', 'have', 'll', "isn't", 'further', 'once', 'haven', 'down', 'was', 'the', "hasn't", "doesn't", 'it', 'too', 'ain', 'then', 'which', 'm', 'does

In [16]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

tokenized_review = tokenizer.tokenize(messages['text'])

TypeError: expected string or bytes-like object

In [17]:
messages['text']

0       Same folks said daikon paste could treat a cyt...
1       While the world has been on the wrong side of ...
2       #coronavirus #SputnikV #AstraZeneca #PfizerBio...
3       Facts are immutable, Senator, even when you're...
4       Explain to me again why we need a vaccine @Bor...
                              ...                        
7140    The most recent vaccine data from Israel: #Pfi...
7141    So ist es‼️\n#impfschaden #Impftote #PfizerBio...
7142    One shot down! #CovidVaccine #PfizerBioNTech h...
7143    Stopping #vaccine deliveries to #Azerbaijan is...
7144    #AstraZeneca is safe. Believe the statistics, ...
Name: text, Length: 7145, dtype: object