In [2]:
# Import necessary libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [3]:
tweets = pd.read_csv("vaccination_tweets.csv")

In [4]:
messages = pd.read_csv("vaccination_tweets.csv", usecols = [8,10])

In [5]:
messages

Unnamed: 0,user_verified,text
0,False,Same folks said daikon paste could treat a cyt...
1,False,While the world has been on the wrong side of ...
2,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...
3,True,"Facts are immutable, Senator, even when you're..."
4,False,Explain to me again why we need a vaccine @Bor...
...,...,...
7140,False,The most recent vaccine data from Israel: #Pfi...
7141,False,So ist es‼️\n#impfschaden #Impftote #PfizerBio...
7142,False,One shot down! #CovidVaccine #PfizerBioNTech h...
7143,False,Stopping #vaccine deliveries to #Azerbaijan is...


In [6]:
LE = LabelEncoder()
messages['target'] = LE.fit_transform(messages["user_verified"])

In [7]:
X = messages['text']
y = messages['target']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [10]:
vectorizer = TfidfVectorizer(stop_words = stopwords_list)

In [11]:
tf_idf_train = vectorizer.fit_transform(X_train)
tf_idf_test = vectorizer.transform(X_test)

In [12]:
tf_idf_test

<1787x15196 sparse matrix of type '<class 'numpy.float64'>'
	with 19012 stored elements in Compressed Sparse Row format>

In [13]:
tf_idf_test_df = pd.DataFrame(tf_idf_test.toarray(), columns=vectorizer.vocabulary_.keys())
first_doc = tf_idf_test_df.loc[0]
first_doc.idxmax(axis=1)

'tropical'

In [14]:
first_doc['kill']

0.0

In [15]:
tf_idf_test_df

Unnamed: 0,80,amp,81,oldies,2nd,pfizerbiontech,covid19,vaccine,weekend,thanks,...,sleepy,nightsh,2c5lqgydfe,helennewland,rlepedvotj,diminish,9vb1k3vmtv,2scbz6xduw,vqyxdpr38z,l7usgbrxr3
0,0.0,0.231087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1784,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1785,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
stop_words=set(stopwords.words("english"))
print(stop_words)

{'above', 'mightn', 'been', 'those', 'i', 'into', 'no', 've', 'then', 'wouldn', 'below', 'how', 'where', 'theirs', 'the', 'once', 'here', 'won', 'all', 'my', 'do', 'she', "weren't", 'weren', 'while', 'couldn', 'are', 'over', 'd', 'and', "you're", "hasn't", 'just', "didn't", 'after', 'but', 'by', 'didn', 's', 'hers', 'through', 'don', 'before', 'will', 'there', "mightn't", 'shouldn', 'who', "won't", 'myself', 'not', 'own', 't', 'ain', 'under', 'when', 'yourselves', 'themselves', 'during', "isn't", 'me', "shouldn't", 'was', 'ourselves', 'on', 'can', 're', 'he', 'being', 'out', "you'd", 'at', 'hadn', 'from', 'about', "that'll", 'm', 'ma', 'more', 'him', 'off', 'as', 'his', 'its', 'what', 'if', 'very', 'have', 'other', 'does', 'it', 'yourself', "wasn't", 'for', 'doesn', 'because', 'your', 'any', "hadn't", 'we', 'did', 'such', 'now', 'that', 'had', 'up', "you've", "shan't", 'against', 'so', 'haven', "couldn't", 'these', 'too', 'yours', 'is', 'with', 'in', 'same', 'has', 'their', 'should', '

In [17]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

## tokenized_review = tokenizer.tokenize(messages['text'])

In [18]:
tokenizer

RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [19]:
review = "Hi what is up"

In [20]:
review

'Hi what is up'

In [21]:
tokenized_review = tokenizer.tokenize(review)

In [22]:
print(tokenized_review)

['Hi', 'what', 'is', 'up']
