In [1]:
# Import necessary libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
tweets = pd.read_csv("vaccination_tweets.csv")

In [3]:
messages = pd.read_csv("vaccination_tweets.csv", usecols = [8,10])

In [4]:
messages

Unnamed: 0,user_verified,text
0,False,Same folks said daikon paste could treat a cyt...
1,False,While the world has been on the wrong side of ...
2,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...
3,True,"Facts are immutable, Senator, even when you're..."
4,False,Explain to me again why we need a vaccine @Bor...
...,...,...
7140,False,The most recent vaccine data from Israel: #Pfi...
7141,False,So ist es‼️\n#impfschaden #Impftote #PfizerBio...
7142,False,One shot down! #CovidVaccine #PfizerBioNTech h...
7143,False,Stopping #vaccine deliveries to #Azerbaijan is...


In [5]:
LE = LabelEncoder()
messages['target'] = LE.fit_transform(messages["user_verified"])

In [6]:
X = messages['text']
y = messages['target']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [9]:
vectorizer = TfidfVectorizer(stop_words = stopwords_list)

In [10]:
tf_idf_train = vectorizer.fit_transform(X_train)
tf_idf_test = vectorizer.transform(X_test)

In [11]:
tf_idf_test

<1787x15196 sparse matrix of type '<class 'numpy.float64'>'
	with 19012 stored elements in Compressed Sparse Row format>

In [12]:
tf_idf_test_df = pd.DataFrame(tf_idf_test.toarray(), columns=vectorizer.vocabulary_.keys())
first_doc = tf_idf_test_df.loc[0]
first_doc.idxmax(axis=1)

'tropical'

In [13]:
first_doc['kill']

0.0

In [14]:
tf_idf_test_df

Unnamed: 0,80,amp,81,oldies,2nd,pfizerbiontech,covid19,vaccine,weekend,thanks,...,sleepy,nightsh,2c5lqgydfe,helennewland,rlepedvotj,diminish,9vb1k3vmtv,2scbz6xduw,vqyxdpr38z,l7usgbrxr3
0,0.0,0.231087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1782,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1784,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1785,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
stop_words=set(stopwords.words("english"))
print(stop_words)

{'whom', "couldn't", 'your', 'against', 'between', 'isn', "aren't", 'do', 'doing', 'before', "you've", 'doesn', 'into', 'nor', "haven't", 'm', 'through', 't', 'having', 'where', 'both', 'hasn', 're', 'then', 'an', 'himself', 'over', 'won', 'there', "hasn't", 'most', 'you', 'too', "hadn't", "weren't", "that'll", 'is', 'each', "mustn't", 'don', 'that', 'no', 'what', 'didn', 'does', 'yourselves', 'very', 'such', 'ma', 'up', 'me', 'being', 'hers', 'this', 'for', 'i', 'will', 'she', "won't", 'who', 'off', "wasn't", 'all', 'if', "you'd", "it's", "you'll", 'so', 'when', 'by', 'have', 'can', 'again', 'just', 'it', "isn't", "shan't", 'a', 'aren', 'yourself', 'out', 'until', 'at', 'theirs', 'some', 'o', 'our', 'couldn', "wouldn't", 'y', 'more', 'to', 'themselves', 'am', 'll', 'these', 'of', 'has', 'be', 'shan', 'my', 'on', 'once', 'not', 'further', 'itself', 'but', 'or', 'its', 'how', "didn't", 'under', 'and', 'with', "you're", 'any', 'they', 'myself', 'we', 'he', 'own', 'below', 'down', 've', '

In [16]:
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')

## tokenized_review = tokenizer.tokenize(messages['text'])

In [17]:
tokenizer

RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [18]:
review = "Hi what is up"

In [19]:
review

'Hi what is up'

In [28]:
messages['RX'][0]

'Same folks said daikon paste could treat a cytokine storm #PfizerBioNTech https://t.co/xeHhIMg1kF'

In [29]:
tokenized_review = tokenizer.tokenize(messages['RX'][0])

In [30]:
print(tokenized_review)

['Same', 'folks', 'said', 'daikon', 'paste', 'could', 'treat', 'a', 'cytokine', 'storm', 'PfizerBioNTech', 'https', 't', 'co', 'xeHhIMg1kF']


In [22]:
messages

Unnamed: 0,user_verified,text,target
0,False,Same folks said daikon paste could treat a cyt...,0
1,False,While the world has been on the wrong side of ...,0
2,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,0
3,True,"Facts are immutable, Senator, even when you're...",1
4,False,Explain to me again why we need a vaccine @Bor...,0
...,...,...,...
7140,False,The most recent vaccine data from Israel: #Pfi...,0
7141,False,So ist es‼️\n#impfschaden #Impftote #PfizerBio...,0
7142,False,One shot down! #CovidVaccine #PfizerBioNTech h...,0
7143,False,Stopping #vaccine deliveries to #Azerbaijan is...,0


In [23]:
messages['RX'] = messages['text']

In [24]:
len(messages['text'])

7145

In [31]:
for i in range(len(messages['text'])):
    messages['RX'][i] = tokenizer.tokenize(messages['text'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  messages['RX'][i] = tokenizer.tokenize(messages['text'][i])


In [32]:
messages

Unnamed: 0,user_verified,text,target,RX
0,False,Same folks said daikon paste could treat a cyt...,0,"[Same, folks, said, daikon, paste, could, trea..."
1,False,While the world has been on the wrong side of ...,0,"[While, the, world, has, been, on, the, wrong,..."
2,False,#coronavirus #SputnikV #AstraZeneca #PfizerBio...,0,"[coronavirus, SputnikV, AstraZeneca, PfizerBio..."
3,True,"Facts are immutable, Senator, even when you're...",1,"[Facts, are, immutable, Senator, even, when, y..."
4,False,Explain to me again why we need a vaccine @Bor...,0,"[Explain, to, me, again, why, we, need, a, vac..."
...,...,...,...,...
7140,False,The most recent vaccine data from Israel: #Pfi...,0,"[The, most, recent, vaccine, data, from, Israe..."
7141,False,So ist es‼️\n#impfschaden #Impftote #PfizerBio...,0,"[So, ist, es, impfschaden, Impftote, PfizerBio..."
7142,False,One shot down! #CovidVaccine #PfizerBioNTech h...,0,"[One, shot, down, CovidVaccine, PfizerBioNTech..."
7143,False,Stopping #vaccine deliveries to #Azerbaijan is...,0,"[Stopping, vaccine, deliveries, to, Azerbaijan..."


In [26]:
for 
messages['RX'] = tokenizer.tokenize(messages['text'])

SyntaxError: invalid syntax (<ipython-input-26-145a894578cc>, line 1)