### Research Models

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('/Users/kavithaki/Downloads/Capstone/news.csv', delimiter =',')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,authors,title,publish_date,description,text,url
0,0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,2020-03-27 08:00:00,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,https://www.cbc.ca/news/health/covid-19-drug-s...
1,1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,2020-03-27 01:45:00,The Yukon government has identified two places...,The Yukon government has identified two places...,https://www.cbc.ca/news/canada/north/yukon-cor...
2,2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,2020-03-26 05:13:00,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,https://www.cbc.ca/news/world/senate-coronavir...
3,3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,2020-03-27 00:36:00,Scientists around the world are racing to find...,Scientists around the world are racing to find...,https://www.cbc.ca/news/health/coronavirus-tre...
4,4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,2020-03-26 20:57:00,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,https://www.cbc.ca/news/the-latest-on-the-coro...


In [None]:
df.shape

In [7]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,3566.0
mean,2455.649748
std,1298.52945
min,0.0
25%,1473.25
50%,2496.5
75%,3569.75
max,4608.0


In [8]:
df.drop(["Unnamed: 0",'publish_date','url'], axis = 1, inplace = True)

In [9]:
df.head()

Unnamed: 0,authors,title,description,text
0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


In [10]:
df['authors'] = df['authors'].str.strip('[]')

In [11]:
df.head()

Unnamed: 0,authors,title,description,text
0,'Cbc News',Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,'Cbc News',Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,'The Associated Press',U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,'Cbc News',Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,'Cbc News',The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


In [12]:
df['authors'] = df['authors'].str.strip('  ''')
df['authors'] = df.authors.str.replace("[({':]", "")

In [13]:
df.head()

Unnamed: 0,authors,title,description,text
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


In [14]:
df['authors'].nunique()

261

In [1]:
#df['authors'].unique()

## Explore the dataset

In [16]:
# What is the shape of the dataset?

print("Input data has {} rows and {} columns".format(len(df), len(df.columns)))

Input data has 3566 rows and 4 columns


In [17]:
# How many News Authors are there? Considering Authors as Label.

print("Out of {} rows, {} are CBC News, {} are Associated Press".format(len(df),
                                                       len(df[df['authors']=='Cbc News']),
                                                       len(df[df['authors']=='The Associated Press'])))

Out of 3566 rows, 1168 are CBC News, 178 are Associated Press


In [18]:
# How much missing data is there?

print("Number of null in label: {}".format(df['authors'].isnull().sum()))
print("Number of null in text: {}".format(df['text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


## NLP Basics: Implementing a pipeline to clean text

### Pre-processing text data

Cleaning up the text data is necessary to highlight attributes. These would be loaded to machine learning system to pick up on. Cleaning (or pre-processing) the data typically consists of a number of steps:
1. **Remove punctuation**
2. **Tokenization**
3. **Remove stopwords**
4. **Lemmatize/Stem**

In [19]:
df.head()

Unnamed: 0,authors,title,description,text
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


### Remove punctuation

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

In [22]:
df['body_text_clean'] = df['text'].apply(lambda x: remove_punct(x))

In [23]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...


### Tokenization

In [24]:
import re

In [25]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [26]:
df['body_text_tokenized'] = df['body_text_clean'].apply(lambda x: tokenize(x.lower()))

In [27]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc..."
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,..."
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp..."
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ..."
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ..."


### Remove stopwords

In [28]:
import nltk

#from nltk.corpus import stopwords
#stopwords.words('english')

In [29]:
stopword = nltk.corpus.stopwords.words('english')
#stopword

In [31]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

In [32]:
df['body_text_nostop'] = df['body_text_tokenized'].apply(lambda x: remove_stopwords(x))

In [33]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat..."
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w..."
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled..."
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove..."
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo..."


## Supplemental Data Cleaning: Using Stemming

In [34]:
import nltk

ps = nltk.PorterStemmer()   # Test out Porter stemmer

In [2]:
#dir(ps)

In [36]:
print(ps.stem('grows'))
print(ps.stem('growing'))
print(ps.stem('grow'))

grow
grow
grow


In [37]:
print(ps.stem('run'))
print(ps.stem('running'))
print(ps.stem('runner'))

run
run
runner


### Stem text

In [38]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [39]:
df['body_text_stemmed'] = df['body_text_nostop'].apply(lambda x: stemming(x))

In [None]:
# df.groupby(['authors']).sum()

### Supplemental Data Cleaning: Using a Lemmatizer

In [42]:
nltk.download()
import nltk

wn = nltk.WordNetLemmatizer()   # https://wordnet.princeton.edu/
ps = nltk.PorterStemmer()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [3]:
#dir(wn)

In [42]:
print(ps.stem('meanness'))
print(ps.stem('meaning'))

mean
mean


In [43]:
print(wn.lemmatize('meanness'))
print(wn.lemmatize('meaning'))

meanness
meaning


In [44]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [45]:
df['body_text_lemmatized'] = df['body_text_nostop'].apply(lambda x: lemmatizing(x))

### Applying punctuation, Tokenization, Remove stopwords, Lemmatization and Stemming

In [48]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

### Vectorizing Raw Data

Doubt:

3 types of vectorization or 4 Types ?

1. count Vectorization
2. N-gram Vectorization
3. TF-IDF 
4. Bag-Of-words Model

Bag-of-words model:
1.	Count how many times does a word occur in each message (Known as term frequency) (TF)
2.	Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)- (IDF)
3.	Normalize the vectors to unit length, to abstract from the original text length (L2 norm)

### Count vectorization 

Creates a document-term matrix where the entry of each cell will be a count of the number of times that word occurred in that document.

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)  # call the function clean_text. 
X_counts = count_vect.fit_transform(df['text'])

# X_counts is the vectorized version of the data.

print(X_counts.shape)
#print(count_vect.get_feature_names())  # get_feature_names() prints all the unique words that are found in all of the text messages.

# (3566, 31399) : 3566 text messages, across this 3566 text messages there are 31399 unique words.
# It means the document term matrix consists of 3566 rows and 31399 columns.

# Each row : Text message and the column will have unique words of this text message.

# get_feature_names() : means the numbers are names of the columns. We can remove the numbers and change it to names.





(3566, 31399)


In [56]:
# Apply CountVectorizer to smaller sample

data_sample = df[0:20]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['text'])
print(X_counts_sample.shape)
#print(count_vect_sample.get_feature_names())

# (20, 2951) : 20 rows and 2951 columns.
# There are new feature names. 
# 
# 
print(count_vect_sample)


(20, 2951)
CountVectorizer(analyzer=<function clean_text at 0x1a229fe268>, binary=False,
                decode_error='strict', dtype=<class 'numpy.int64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=None, min_df=1, ngram_range=(1, 1),
                preprocessor=None, stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)


In [None]:
## The raw data output of the count vector is called as the Sparse Vector.


In [57]:
# Vectorizers output sparse matrices
#Sparse Matrix: A matrix in which most entries are 0. In the interest of efficient storage, 
#    a sparse matrix will be stored by only storing the locations of the non-zero elements.

X_counts_sample

<20x2951 sparse matrix of type '<class 'numpy.int64'>'
	with 7096 stored elements in Compressed Sparse Row format>

In [58]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df

# This is the document term Matrix. It has 2951 columns and 20 rows. 
# Values are numbered from 0 to 191 in the columns. 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2941,2942,2943,2944,2945,2946,2947,2948,2949,2950
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,1,1,0,1,1,...,1,0,0,0,0,0,1,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
X_counts_df.columns = count_vect_sample.get_feature_names()  # Assigning the feature names to the columns
X_counts_df

Unnamed: 0,Unnamed: 1,000,025,026,042,049,057,1,10,100,...,your,youv,yuje,yukon,z,zarychanski,zibi,ziomek,zmiyiwski,zone
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,1,1,0,1,1,...,1,0,0,0,0,0,1,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Bag Of Words Model

Count Vectorization

In [60]:
len(df)

3566

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
# Might take awhile...
train_data = df[0:1000]
bow_transformer = CountVectorizer(analyzer=clean_text).fit(train_data['text'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

17331


In [65]:
newstest1 = train_data['text'][3]
#print(newstest1)

In [None]:
bowtest1 = bow_transformer.transform([newstest1])
print(bowtest1)
print(bowtest1.shape)

In [131]:

#print(bow_transformer.get_feature_names())
#print(bow_transformer.get_feature_names()[9570])

In [67]:
news_bows = bow_transformer.transform(train_data['text'])

In [68]:
print('Shape of Sparse Matrix: ', news_bows.shape)
print('Amount of Non-Zero occurences: ', news_bows.nnz)

Shape of Sparse Matrix:  (1000, 17331)
Amount of Non-Zero occurences:  260806


In [134]:
sparsity = (100.0 * news_bows.nnz / (news_bows.shape[0] * news_bows.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 0


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(news_bows)
tfidf4 = tfidf_transformer.transform(bowtest1)
print(tfidf4)

In [70]:
messages_tfidf = tfidf_transformer.transform(news_bows)
print(messages_tfidf.shape)

(1000, 17331)


## Training the model

In [71]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, train_data['authors'])

In [72]:
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', train_data.authors[3])

predicted: Cbc News
expected: Cbc News


Created a classification to predict CBC News

### Model Evaluation

In [73]:
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

['Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' '' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' ''
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' ''
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' '' '' '' 'Cbc News'
 'Cbc News' 'Cbc News' 'C

In [74]:
from sklearn.metrics import classification_report
print (classification_report(train_data['authors'], all_predictions))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                            precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 0.80      0.51      0.62       270

  'precision', 'predicted', average, warn_for)


In [75]:
test_data  = df[1001:]

In [76]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

## N-gram Vectorization

In [47]:
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import trigrams

In [48]:
df.head(3)

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe...","[canadian, pharmacy, limiting, much, medicatio..."
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor...","[yukon, government, identified, two, place, wh..."
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,...","[senate, late, wednesday, passed, unparalleled..."


In [49]:
data = df['body_text_lemmatized'].to_string()

In [72]:
no_specials_string = re.sub('[!#?,.:";]', '', data)

In [77]:
no_specials_string=no_specials_string.replace('[','')

In [78]:
p=PunktLanguageVars()
tokens=p.word_tokenize(no_specials_string)

In [79]:
# See the last 10 pairs
for i in range(len(tokens)-30, len(tokens)-1):
    print(tokens[i], tokens[i+1])

korea wednesday
wednesday declared
declared formal
formal en
en 3562
3562 spread
spread mers
mers coronavirus
coronavirus infection
infection hospita
hospita 3563
3563 relic
relic bygone
bygone era
era wealthy
wealthy nova
nova scotians
scotians 3564
3564 camel
camel implicated
implicated middle
middle eastern
eastern respirato
respirato 3565
3565 world
world health
health organizationled
organizationled group
group expert


In [80]:
word_pairs = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
print(len(word_pairs))

gram2 = set(word_pairs)
print(len(gram2))

# Print 20 elements from gram2
gram2_iter = iter(gram2)
print([next(gram2_iter) for i in range(10)])

24322
17594
[('small', 'west'), ('ontario', 'city'), ('four', '1196'), ('canadian', 'tourist'), ('people', 'novel'), ('morning', 'day'), ('unit', 'wechu'), ('temporarily', 'amid'), ('1500bed', 'facil'), ('died', 'bea')]


In [63]:
gram1 = dict()

# Populate 1-gram dictionary
for tokens in tokens:
    if tokens in gram1:
        gram1[tokens] += 1
    else:
        gram1[tokens] = 1 # Start a new entry with 1 count since saw it for the first time.

# Turn into a list of (word, count) sorted by count from most to least
gram1 = sorted(gram1.items(), key=lambda tokens:tokens[1],reverse=True)
#gram1 = sorted(gram1.items(), key=lambda (word, count): -count)


# Print top 20 most frequent words
print(gram1[:20])

[('[', 3566), ('covid19', 321), ('health', 318), ('say', 254), ('case', 222), ('new', 190), ('coronavirus', 174), ('government', 129), ('canadian', 128), ('canada', 126), ('official', 117), ('public', 117), ('latest', 110), ('confirmed', 105), ('first', 103), ('people', 103), ('alberta', 98), ('two', 95), ('ontario', 93), ('bc', 85)]


In [81]:
gram2 = dict()

# Populate 2-gram dictionary
for i in range(len(tokens)-1):
    key = (tokens[i], tokens[i+1])
    if key in gram2:
        gram2[key] += 1
    else:
        gram2[key] = 1

# Turn into a list of (word, count) sorted by count from most to least

gram2 = sorted(gram2.items(), key=lambda key:key[1],reverse=True)
#gram2 = sorted(gram2.items(), key=lambda (_, count): -count)

# Print top 20 most frequent words
print(gram2[:60])

[(('case', 'covid19'), 99), (('health', 'official'), 83), (('public', 'health'), 73), (('nova', 'scotia'), 54), (('new', 'case'), 49), (('good', 'morning'), 42), (('morning', 'daily'), 42), (('daily', 'news'), 42), (('news', 'roundup'), 42), (('roundup', 'everythi'), 42), (('confirmed', 'case'), 39), (('prime', 'minister'), 37), (('new', 'brunswick'), 36), (('world', 'health'), 35), (('novel', 'coronavirus'), 32), (('health', 'officer'), 32), (('federal', 'government'), 29), (('chief', 'public'), 29), (('health', 'organization'), 29), (('minister', 'justin'), 28), (('justin', 'trudeau'), 27), (('tested', 'positive'), 25), (('covid19', 'pandemic'), 24), (('health', 'minister'), 24), (('northwest', 'territory'), 24), (('coronavirus', 'outbreak'), 22), (('chief', 'medical'), 22), (('thunder', 'bay'), 22), (('officer', 'dr'), 21), (('cruise', 'ship'), 21), (('around', 'world'), 20), (('provincial', 'health'), 20), (('case', 'coronavirus'), 19), (('first', 'nation'), 19), (('british', 'colu

In [None]:
b=bigrams(tokens)
[x for x in b]

In [None]:
t=trigrams(tokens)
[x for x in t]

In [None]:
five_gram=ngrams(tokens,7)
[x for x in five_gram]

In [82]:
start_word = tokens[len(tokens)//4]
print(start_word) 

889


In [83]:
def get2GramSentence(word, n = 500):
    for i in range(n):
        print(word)
        # Find Next word
        word = next((element[0][1] for element in gram2 if element[0][0] == word), None)
        if not word:
            break

word = start_word
print("Start word: %s" % word)

print("2-gram sentence: \"")
get2GramSentence(word, 10)
print("\"")

Start word: 889
2-gram sentence: "
889
medical
officer
dr
bonnie
henr
912
154resident
care
home
"


In [84]:
import random
def weighted_choice(choices):
   total = sum(w for c, w in choices)
   r = random.uniform(0, total)
   upto = 0
   for c, w in choices:
      if upto + w > r:
         return c
      upto += w
    
def get2GramSentenceRandom(word, n = 50):
    for i in range(n):
        print(word)
        # Get all possible elements ((first word, second word), frequency)
        choices = [element for element in gram2 if element[0][0] == word]
        if not choices:
            break
        
        # Choose a pair with weighted probability from the choice list
        word = weighted_choice(choices)[1]

In [85]:
for word in ['and', 'he', 'she', 'when', 'minister', 'never', 'i', 'how']:
    print("Start word: %s" % word)

    print("2-gram sentence: \"")
    get2GramSentenceRandom(word, 5)
    print("\"")

Start word: and
2-gram sentence: "
and
"
Start word: he
2-gram sentence: "
he
h
1842
people
rent
"
Start word: she
2-gram sentence: "
she
"
Start word: when
2-gram sentence: "
when
"
Start word: minister
2-gram sentence: "
minister
jim
zakreski
stuck
morocco
"
Start word: never
2-gram sentence: "
never
imagined
t
1080
fort
"
Start word: i
2-gram sentence: "
i
2211
habitat
humanity
edmonton
"
Start word: how
2-gram sentence: "
how
1726
u
1767
windsoressex
"


## Install TensorFlow Library Work Module

In [1]:
import tensorflow as tf

In [2]:
var = 0
def countWords(text):
    v = 0
    for i in text:
        v = v + 1
    print(v)
print(var)

0


In [3]:
words = []

In [36]:
# Stemmed vectors are taken as input. 
# Article size: 5 articles are taken.

articles = df.body_text_stemmed.tolist()
for article in articles:
    for word in article:
        words.append(word)
        
len(words)

1502567

In [37]:
words[1]

'pharmaci'

In [38]:
# Considering only unique words.
words = set(words)

In [39]:
len(words)

31399

In [40]:
vocab_size = len(words) # gives the total number of unique words

In [41]:
vocab_size

31399

In [42]:
word2int = {}
int2word = {}
for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

In [43]:
print(word2int['pharmaci'])
len(words)

30122


31399

In [44]:
dataTF = []

WINDOW_SIZE = 2

# based on the window size, the neighbouring word or context words is considered for the current word.
# So if the window size is 2, the surrounding words which are at a distance of two words from the current word is processed.

for article in articles[0:5]:
    for word_index, word in enumerate(article):
        for nb_word in article[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(article)) + 1] : 
            if nb_word != word:
                dataTF.append([word, nb_word])

In [45]:
dataTF[0:5]

[['canadian', 'pharmaci'],
 ['canadian', 'limit'],
 ['pharmaci', 'canadian'],
 ['pharmaci', 'limit'],
 ['pharmaci', 'much']]

In [46]:
dataTF[0]

['canadian', 'pharmaci']

In [55]:
dataTF[1]

['canadian', 'limit']

In [47]:
#word2int

In [48]:
word2int["canadian"]  # word2int[ data_word[0] ] = word2int["canadian"] = word2int[dataTF[0]]

6994

In [49]:
word2int["pharmaci"]

30122

In [50]:
word2int['covid19']

17536

In [53]:
vocab_size = len(words)
vocab_size

31399

In [56]:
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)     # Initializing the temp array with Zeros till the vocab size. like [ 0 0 0]
    temp[data_point_index] = 1    # assigning one value to the data point index. 
    return temp
x_train = [] # input word
y_train = [] # output word

In [57]:
for data_word in dataTF:
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))
# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [63]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [64]:
print(x_train.shape, y_train.shape)

# meaning 9272 training points, where each point has 1101 dimensions
# or 9272 rows and each row has 1101 columns. 

(9272, 31399) (9272, 31399)


In [58]:
# Run this in order to make tensor flow placeholder run on the machine
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [65]:
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [66]:
x

<tf.Tensor 'Placeholder_4:0' shape=(?, 31399) dtype=float32>

In [67]:
y_label

<tf.Tensor 'Placeholder_5:0' shape=(?, 31399) dtype=float32>

In [68]:
EMBEDDING_DIM = 5 # you can choose your own number
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [69]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [None]:
# Started at 10.30 AM
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) #make sure you do this!
# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 10000
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))
# 

### Output Of Tensor Flow
loss is :  15.396786
loss is :  15.060417
loss is :  14.7789545
loss is :  14.543843
loss is :  14.347865
loss is :  14.184861
loss is :  14.049553
loss is :  13.937419
loss is :  13.8446045
loss is :  13.767849
loss is :  13.704401
loss is :  13.651961
loss is :  13.6086
loss is :  13.572725
loss is :  13.543006
loss is :  13.518352
loss is :  13.497851
loss is :  13.480762
loss is :  13.466468
loss is :  13.454465
loss is :  13.444339
loss is :  13.435751
loss is :  13.428419
loss is :  13.422116
loss is :  13.4166565
loss is :  13.411886
loss is :  13.407679
loss is :  13.403934
loss is :  13.400566
loss is :  13.397502
loss is :  13.3947
loss is :  13.3920965
loss is :  13.389667
loss is :  13.387376
loss is :  13.385196
loss is :  13.383113
loss is :  13.381103
loss is :  13.379161
loss is :  13.377268
loss is :  13.375417
loss is :  13.373605
loss is :  13.371823
loss is :  13.370061
loss is :  13.368324
loss is :  13.366601
loss is :  13.364895
loss is :  13.363197
loss is :  13.361516
loss is :  13.3598385
loss is :  13.358171
loss is :  13.356513
loss is :  13.354857
loss is :  13.353209
loss is :  13.351566
loss is :  13.349928
loss is :  13.3482895
loss is :  13.346659
loss is :  13.345029
loss is :  13.343404
loss is :  13.341783
loss is :  13.340164
loss is :  13.338547
loss is :  13.336933
loss is :  13.335324
loss is :  13.333714
loss is :  13.332111
loss is :  13.330507
loss is :  13.328906
loss is :  13.32731
loss is :  13.325712
loss is :  13.32412
loss is :  13.322529
loss is :  13.320941
loss is :  13.319354
loss is :  13.317769
loss is :  13.316189
loss is :  13.314609
loss is :  13.31303
loss is :  13.311455
loss is :  13.309883
loss is :  13.3083105
loss is :  13.306741
loss is :  13.305176
loss is :  13.303611
loss is :  13.302046
loss is :  13.300488
loss is :  13.298933
loss is :  13.297376
loss is :  13.295823
loss is :  13.294272
loss is :  13.292721
loss is :  13.291174
loss is :  13.28963
loss is :  13.288084
loss is :  13.286546
loss is :  13.2850065
loss is :  13.283467
loss is :  13.281934
loss is :  13.2804
loss is :  13.278869
loss is :  13.277342
loss is :  13.275815
loss is :  13.274288
loss is :  13.272765
loss is :  13.271243
loss is :  13.269726
loss is :  13.268209
loss is :  13.266695
loss is :  13.265182
loss is :  13.263668
loss is :  13.262158
loss is :  13.260655
loss is :  13.259147
loss is :  13.257643
loss is :  13.256142
loss is :  13.2546425
loss is :  13.253143
loss is :  13.251648
loss is :  13.250155
loss is :  13.248662
loss is :  13.247171
loss is :  13.245683
loss is :  13.244196
loss is :  13.242712
loss is :  13.241231
loss is :  13.239747
loss is :  13.23827
loss is :  13.23679
loss is :  13.235315
loss is :  13.23384
loss is :  13.23237
loss is :  13.230898
loss is :  13.22943
loss is :  13.2279625
loss is :  13.226497
loss is :  13.225033
loss is :  13.223573
loss is :  13.222114
loss is :  13.220656
loss is :  13.2192
loss is :  13.217744
loss is :  13.216292
loss is :  13.214839
loss is :  13.213389
loss is :  13.211946
loss is :  13.210498
loss is :  13.209052
loss is :  13.207609

In [None]:
# For Articles = 100, Window Size = 5
# Started at 12.55 AM
# 

In [59]:
print(sess.run(W1))
print('----------')
print(sess.run(b1))
print('----------')

[[ 0.74582875  0.7793943   0.07386164  0.15888414 -1.0550942 ]
 [ 0.98273414  0.0811931   0.0202823   0.3914135  -0.6963007 ]
 [ 0.01152561 -0.315974   -0.21288878  0.15460855  0.8525657 ]
 ...
 [-0.5787782   2.55173    -0.3985736   0.58246446  0.26849777]
 [-0.8640642   0.09151337  0.37342748  0.8094642   0.5006471 ]
 [ 2.7362478  -0.53771824 -1.2462976   1.5780292  -0.5305438 ]]
----------
[0.2118817  0.09580352 0.28877154 0.22681047 0.16917028]
----------


In [60]:
vectors = sess.run(W1 + b1)

In [61]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))
def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [92]:
# We will now query these vectors with ‘king’, ‘queen’ and ‘royal’
print(int2word[find_closest(word2int['canadian'], vectors)])
print(int2word[find_closest(word2int['pharmaci'], vectors)])
print(int2word[find_closest(word2int['covid19'], vectors)])

packag
disappoint
follow


In [62]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

In [63]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
for word in words:
    print(word, vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0],vectors[word2int[word]][1] ))
plt.show()

### output

-3.8947728
face -15.572499
anim 1.0498935
analysi -27.109291
market 22.697393
ship 2.409625
1000 16.15856
could 9.423725
critic 33.66415
remot -1.252983
leader 4.5464296
tumult -13.851024
afford -8.124293
entertain -0.89187616
warn -9.086381
forc -7.2234974
cap -25.375395
respiratori -10.305653
toxic 9.459612
10000 33.244785
spong -28.22298
start 24.725822
justin -29.294842
anyth -17.52146
basebal 28.959248
initi 29.42309
addit -10.913372
brief 16.302504
option -16.439035
specialist 28.440205
coronaviru 16.817432
pressur 25.393217
view -2.5587611
ordersanyon -17.755936
six 30.785965
kanopi -1.4266641
dispens 21.57296
deputi 17.981155
equal -1.5556762
blast -10.843041
stress -16.035044
right -19.541338
spiral 18.063726
wish -11.98849
treatment -31.669596
biotech 26.972506
whose -35.80998
alcohol 6.4318314
settl 0.11116588
trick 19.133835
journal 22.785318
suppli 21.73817
work 26.32507
permanent 16.523457
confin 29.671194
break 27.993616
attent -20.050459
appear -5.8178678
600 -9.580401
world 17.346323
show 31.455109
late 32.553104
urg -11.114392
reason 30.615229
chang -33.11709
ongo 17.960701
downtim 6.918359
death 23.673939
real -10.1678295
15 6.459705
indigen 5.1748877
outsid 16.793238
aid -33.100456
prioriti 10.72185
build -6.19005
new 23.305
wave -4.552723
accord -5.7945614
handl -18.53722
player -18.449028
fear -29.932814
recal -7.2523494
stockpil -34.20797
steven -5.2140117
activ -17.218107
effect -5.9528055
plasma -6.820815
got -31.68079
groceri -6.2828646
took 25.35496
herb 27.751522
school -15.071505
liber 32.577076
provid 0.48099956
academi -17.981722
system -7.9690113
chri 3.4714596
whether -19.074064
plu -33.59223
rama 1.66822
unparallel 22.030396
confid -35.85128
serious 3.6065392
frequenc 1.5122845
deliv 30.648716
daili -18.07945
medicin 9.691555
told -34.343742
cent -12.515008
parti 6.5599093
militari 11.418939
assur -18.601418
closer -35.06082
abroad 20.77212
wipe 3.5424101
america -15.642555
26 26.71067
rule 4.8976274
stori 4.9675593
rather 31.700907
thursday -15.517552
countri -18.377651
uber -33.360844
flood 4.161202
chines -15.07502
guojian -7.276832
program 4.5937657
usual 19.12232
recess -18.57307
stay 16.364798
hrefhttpstwittercomhashtagcovid19srchashamprefsrctwsrc5etfwcovid19a -15.826045
away 2.6416929
clinic 16.153465
modifi 21.104181
professor 13.609203
23 0.5475135
dr -28.544533
arduou -21.093592
via -25.529757
colleg -7.9589224
mutual 32.850803
detect -14.391002
someth -1.5780535
hadfield -15.724577
along 32.149727
2000 -14.118692
social -1.7205174
must -17.4777
either -7.8551292
367 1.3996584
match 2.6769483
money -16.381195
act -6.646839
18775366837 28.494816
youv -14.823979
move 7.4113984
differ 28.505821
hrefhttpstwittercommountsinainycrefsrctwsrc5etfwmountsinainyca -18.377415
minist -4.8454866
hoard 17.616692
fortun 2.2432811
fight -19.75196
pandem 16.249748
soap -5.8119373
addon -33.815514
werent 31.607344
greatest 32.458065
chief -17.533113
ever -3.5584967
practic -1.9969277
reveal -2.2101555
showcas -25.922194
tag -5.367168
62 -18.013254
latex -33.90543
identifi -16.346977
delay 8.912364
jason -16.257595
advertis 14.312098
spring -7.7615986
low -19.035254
apart 22.99572
reach -16.431496
toronto 29.867691
test 16.216919
array 33.434837
came 27.12673
like 25.374546
exist -32.648
opportun -18.46584
disinfect 8.601661
maneuv 15.75424
confront 31.294985
territori 24.755753
kid -2.8793895
date -15.290702
500 -20.949568
infus 30.395721
offici 0.5206058
local -25.90369
proven 29.734962
need 23.562151
recogn 1.2393029
shortag 1.4261906
primarili 4.46437
us 20.738022
say 25.030178
lie -12.1113205
phone 5.980362
nation -18.386213
place -12.425347
member -14.507787
visitor 23.279999
fail -1.6972376
distanc -14.240138
capitol 9.671462
indic -18.721016
thing 9.457739
month -11.505751
ad -16.911913
stairway -9.22552
trust 26.276707
recov -1.3804914
tax -27.33242
recreat -1.5652854
europ 4.714832
sheer -8.944153
trillion 25.066608
mediterranean -19.332047
chronic 26.829876
govern 2.1987731
claim -14.970053
pig -29.800701
go -30.346943
organ -19.54984
march 19.418058
canada 21.11102
would 20.612234
grow 32.779957
measur 1.4026223
annual 5.3779836
entir 30.04874
domin -14.088696
httpswwwcbccanewsentertainmentfreefamilystreamingcovid19stayathome15509556 7.5320745
tea -4.6992326
cardiac -10.922562
paid 28.698858
near -18.204117
therapist 18.23942
size -8.60597
promin -5.460238
risk -33.25693
hotspot 1.7265948
york 11.529317
sweep -4.938977
strict 33.19513
yukon 17.443993
someon -18.454906
20 -16.262749
viru -5.558914
arthriti -3.9727733
drive 30.3938
trial -33.926437
ventil 27.491957
mitig 5.0479207
tadrou -17.076666
driver 22.351284
share 0.08465342
furlough -3.3723373
grim 0.12969416
22 29.0373
polic -13.798709
compani 25.436533
find 2.6911027
import -2.68041
inform -4.635953
border 17.768269
far 17.443775
institut -2.6355493
trevor -7.1767035
avail 20.119516
toll 16.71203
progress 23.30644
lawmak 27.457792
riskiest -13.785322
timet 8.806904
amount -14.117776
possibl -24.220577
circul -14.931178
lifetim -33.333652
current -16.542831
louisiana -6.005495
mental -18.7818
network 21.860498
pelosi -25.626558
though 3.6115518
produc 0.878551
mnuchin -18.406988
20000 -11.885727
nanci -25.963097
crossbord 1.8401834
wholesal -15.559371
negoti -14.6727915
firststag 23.551878
nonessenti -11.521752
product 15.479489
expect -18.310598
titl -20.017532
univers -16.355003
cuomo -4.197407
alberta -28.425741
pend -19.618538
payrol 6.151725
uncheck -29.204702
kill -18.635353
canadian 19.94369
enact -19.276293
polit -8.237077
travel 26.902248
desper 23.313187
float 17.22472
region -10.492998
approv 14.796206
loan -3.054853
caution 32.23266
economi 17.71659
huge -27.986616
process -6.5082216
even -4.6037507
assiniboin -25.640913
alaska 5.837435
inhal -32.89969
individu 29.165283
servic -5.079104
weve 27.589882
hrefhttpstcobpsb2husqrhttpstcobpsb2husqra -30.941404
final 11.496814
impact 2.0386727
clean -1.4072748
open 33.302288
hydroxychloroquin 11.439013
that -4.829257
premier 17.12394
blood 15.6708355
close -31.733343
last -18.31042
emerg -12.7511015
minor -8.204013
controversi 16.057272
classic 5.797779
medic 1.2536595
one 25.598104
report 16.371956
malaria -5.2234235
payment -18.905922
treasuri 30.77355
unemploy 17.250877
illeg -26.622173
prescript -12.787328
inflammatori 27.236485
speaker -9.514521
respons -1.3081603
criteria 23.683147
damag -16.342306
certainli -15.140767
hypertens 29.825144
treat 14.620481
8891kilometrelong -5.402519
amid -15.941983
riskier -1.3305794
winnipeg -3.3831804
increasingli 5.8020163
retent -10.107691
million 6.6984973
investig -3.774707
ocasiocortez -17.58061
surfac -16.212107
trudeau 20.085588
use 32.079464
laid 7.810582
well 13.411316
scatter -4.356191
wealth -7.974658
drugmak -9.57427
news 4.5450816
time -17.152477
small -9.6388035
set 1.0846041
race 16.559793
readili -26.202202
research -27.32291
prior -6.947305
sourc 16.442179
anoth -18.342169
2400 -26.849384
nearli 26.804974
tradit -11.560839
tension -17.200674
written 3.860871
offic 10.911667
transfer -8.507994
decad -12.514435
rocket -8.660947
disappoint -20.440636
access 2.0908635
coverag 7.6388507
trip 18.667084
strongli 10.676129
effort -19.150816
power -5.6368117
case 21.211636
billion -9.948006
announc 11.562534
abl -17.50064
partner 32.63433
secretari 32.103817
space 31.642511
faucet 0.48586193
life 6.113346
went -2.674951
infect 25.652922
fallout 8.567117
tom 7.7804418
third 28.632456
director -9.109525
80 -9.717181
tend -19.607021
precaut -12.160216
avoid -26.596395
engulf 28.344849
north 19.567972
washington -1.5562186
imag -36.839283
miss -11.289054
conjunct -32.73305
hiscox -8.655226
tubi -12.842979
democrat 3.7125235
doctor -20.127865
dental 3.7006128
ground 15.512758
275 -30.433779
order -13.194137
mice -12.688151
sunday -13.423932
present -34.14183
meant -25.309553
seen -18.869022
huang 25.42939
caus -5.8687363
educ 15.788202
benefit -14.34495
think 25.698496
worker 19.70712
failur -13.601567
acupunctur 23.224648
behçet -17.70636
fiveday 28.206226
airlin 0.7400691
4 -12.35232
peopl 16.857944
acetaminophen 30.838657
enrich 8.436592
bureau 19.11621
media -2.4767628
part -3.6305208
potenti 1.4640381
soar -15.0428505
guarante -9.471508
onetim 9.439838
pharmaci -20.315845
lupu 31.432587
roussycbc -9.014131
wont 16.123583
saskatoon 22.10305
disrupt 3.423729
purpos -17.713696
unfold 2.9799395
idl 28.615005
associ -9.08896
atop -14.4117985
repurpos 0.7455985
sensor 24.996725
fund 25.563194
pay -10.325695
packag 19.600466
provinci 1.7944487
commun -16.70147
involv -0.06718115
mcgill 2.28947
anybodi 19.193462
commerc 18.114134
children -14.95899
ardent 3.6228259
protect -35.020664
relief 27.568869
agenc 17.759882
period 18.634798
mayb -4.7147384
cart -17.49977
mlb 28.375269
acceler 20.963974
ask -20.624182
gareth 25.832184
troop 24.955914
159 16.86389
live -11.585336
shot 5.08312
cash 2.3218265
stayathom 6.3452897
child -1.215031
elia -6.8393126
particip -5.063104
kaiser 28.417892
white -32.74773
china 17.03693
craft -9.231749
tukkercbc 8.064554
congress -15.45315
mani 10.0599785
leagu -0.9899088
actual -30.129282
destin 25.80644
game -31.313816
physician 5.1861663
rescu 4.920507
ford -29.27632
send 27.479183
learn 28.1085
wors -33.739296
challeng -14.534145
still -2.0695899
clear -9.29977
remov -16.09877
17 -13.991721
step 33.758762
gig -5.2130976
promis -8.12028
cloth -4.2516284
profession -31.001968
respond 33.51765
000 6.885783
50 3.2419953
infecti -2.964026
rough 31.75429
spent 23.008886
uniqu -3.4245002
squar 5.748646
histor 0.6784004
return 19.683466
sombr 30.972498
marri -29.646135
intend -36.512665
top -15.116587
free -13.922956
worri -27.444128
believ -10.416401
content -17.25126
state 33.46067
feder -19.055223
enforc -2.5882108
earn -6.9162116
disclos -9.742575
stimul 3.29236
choos 6.142628
grew 18.159847
get 21.488766
road -17.526188
three 18.180328
morn -13.076427
manitoba -26.843935
station 17.782831
unlimit 24.616518
dig -15.673651
safeti -9.106284
may -15.547993
made 23.914413
film 29.972021
side -14.680193
foolish -30.685654
permit 7.95462
job 21.46437
demand -2.3249133
andrew -24.162825
seri 4.118215
intern 16.57349
idea 12.4947405
colchicin -34.10447
hidden 32.3176
recent 30.71777
today -13.223642
expos -29.24593
none -12.78919
realiti -10.450256
legisl 16.483448
help 21.488348
timelin 22.356125
further -14.464768
ont -22.135565
optimist -12.8739
sar -13.187524
doorknob 30.176352
lab -9.858175
inclus 11.9714575
sprawl 1.3591675
latest -8.484853
interview -33.06669
peter -12.711999
zone -15.945154
intercept 10.1022005
16 -17.719698
alexandria -9.238867
2nd 24.92055
danger -8.9199915
refillsnew 0.1466083
studi 22.260529
difficulti -17.035416
implement -16.934446
plastic -23.792915
coronavirus 4.4263544
click 1.2136499
forth -16.833132
publichealth -10.391845
incent -8.150625
lin -18.83995
way 23.866896
digit 33.97264
ensur 30.867874
duffin 31.266972
paul -15.649566
ryan 12.042307
hill -25.355774
quarantin -13.13255
four -7.6349006
covid19 18.344332
exhaust 4.5867405
spread -11.870212
export 19.234625
hope -18.223974
theyr -17.240114
eye 15.5648365
antibodyrich 24.956993
necessari -33.222885
isol -6.804373
unlik -13.460129
among 16.327162
selfisol 31.839714
virus 3.8275561
friday 5.3864594
american 1.5239727
internet 23.784576
fever -18.992514
talk 23.51098
subscript 24.351318
agent 0.7456504
presid 29.976793
porou -4.6514935
insist 1.61358
cough 17.395372
senat 14.755989
speed -6.961537
water -19.963717
stand 16.42913
select -14.9763565
long 25.808804
volunt 5.544979
goe -11.571717
react 27.233957
plasmapheresi -6.6998863
stuff 30.563356
adsupport -21.027283
alreadi 20.635565
releas -1.3038877
control -1.7891725
togeth -10.464483
season 2.647071
keep 23.590977
adult -18.934694
healthi -20.580042
tell 24.264835
employe 29.623692
variou 1.334846
regulatori -8.150837
object 30.260185
chamber -22.964468
im -12.537375
rise -15.6260195
migrant 6.194014
read 26.863983
given -20.151218
diseas 14.212749
130 26.54631
halt 10.528675
dont -32.117283
minnesota 23.251085
slow -14.41109
experiment 31.699501
vast -21.986443
sinc -27.039385
later -13.250293
origin 4.528752
heavili -1.1339645
drug -30.093702
epidemiologist -33.36337
gener 21.261997
mouth -22.516342
scientist -35.873505
subscrib 4.737094
larger -12.482239
ka -3.9261026
recommend 18.843178
doug -19.647247
antibiot -17.249355
antibodi -16.374819
afloat 16.075285
toilet 2.8550787
also 20.672037
vulner -32.548874
see -13.120522
ban 33.431263
subject 29.791315
ottawa 30.45169
best -12.205437
anticip -21.21187
much 3.9843097
misgiv -6.95119
conserv 0.660898
coupl 2.0004418
sport -12.3013735
herd 20.390612
inflamm 11.29848
address 1.2051905
context -26.786682
queen -15.342745
around 22.91658
although -9.324275
earli 5.9822173
copleygetti 28.424747
catalogu -16.895721
writer -15.799599
symptom -5.26743
wednesday 5.244253
bordercross -34.23805
montreal -30.304932
concern 19.15063
tri 29.185349
paychequ -2.4197903
industri -7.688231
sign 6.134816
arent 27.364927
freeland -34.784477
drama -17.578344
manufactur 11.111184
remain 4.3483324
becom -19.61015
worldwid 34.01367
pass -12.60759
quickli 18.36034
budget -13.933162
6000 28.87408
cbc -0.5986937
platform 26.129534
goahead -27.707893
seriou -19.5339
curiou -18.991585
requir -10.158467
salari -12.165679
pharmaceut -18.890993
expand 18.505241
launch -11.064249
safe -16.512468
articl -15.061017
allow -30.296923
unauthor 24.05967
mcconnel -14.80554
action 15.632787
811 -26.356482
crisi -29.367746
fingertip -11.899186
kenney -7.4881864
year -6.3183413
45 -33.815784
goal -6.741038
residenti -9.245554
restless -0.3423008
come 2.450402
daybreak 6.2411795
india 8.033243
interact -19.476255
unnecessari 28.023136
center 25.6469
frequent -3.6357782
expert -17.763004
pharmacist -9.224574
major -9.467506
difficult 5.3674173
mina -24.782946
number -18.047256
posit -15.072075
supplier -13.66142
scene -16.243608
shut 4.4828553
econom 31.867954
copay -16.440952
popul 11.821174
retain -26.41266
battl -9.307753
drink 24.95379
cannot 21.799768
vaccin 17.66803
public 23.523039
eventu -5.0493736
150000 11.814204
begin -5.1323147
trade -9.847184
sick -3.5123818
cover 31.386969
track -11.5770645
blend 26.81797
mislead 4.4110937
here 22.17565
focus -20.241213
outbreak 3.22317
per 15.683399
base -18.966576
compli 19.06111
14 -16.657051
secur -4.1326256
rheumatoid -36.39707
ill -2.0389738
second 26.279608
cattl 29.944613
whitehors 21.54646
centuryold 8.104972
astronaut -31.792763
broad 28.730446
8 27.615168
4000 5.9088907
unwash -29.632307
includ 15.738359
burden -35.552666
quebec 28.910887
spokesperson 18.724125
prevent -17.98592
april -28.35252
fee 27.535082
deal 5.777824
especi 14.885817
old 29.056107
hurt 10.017166
unclear -1.3694985
request 23.889174
rich 7.2149005
back 33.600864
essenti 29.069126
misinform 10.14035
glove 32.008663
globe 34.785057
encourag -34.277042
parent -3.536217
watch 3.7644677
wasnt -29.145367
expertis 33.435978
prepar 10.283645
despit -0.68040794
herbal 16.936924
propos -13.624949
big -0.78120065
13 21.351345
sanit 2.8905737
famili 7.733819
anyon 31.139454
refil 8.370261
church 22.483494
subsid -34.44644
often 13.764066
gov 28.65405
lifesav -4.639816
know -27.59845
price -8.054366
cure -29.481346
monitor 0.61042094
mount 1.0475926
call -30.506243
voic 5.099595
unanim 14.831766
meet -32.234203
die 31.628153
hasnt -35.461697
despond -12.301587
stop 14.90086
short -31.722277
bill 4.2003856
administ 5.3417273
comput -19.578548
zarychanski -0.21901968
might -32.34897
lot 1.5671622
limit 30.26902
inexpens -30.355667
smooth 3.2745845
familyfriendli 26.140095
attribut -18.723997
breath -18.894825
heart 0.36949787
end 10.95723
take -29.96544
complic -18.103033
knowledg -3.050934
therapi -11.793388
lung -17.852203
certain 16.126308
rapid -36.470036
chain -0.77275074
kingston -0.7317434
documentari 28.66791
signal -9.543333
75000 5.5050864
steer 26.053679
relationship 29.910303
mitch -18.396368
hand 17.889376
kilometr -12.469393
ahead -8.72563
nair 10.77036
patch -1.6968119
librari 33.811916
make 5.173717
matter -13.675254
nose 3.0385387
ontario 3.0072162
largest 9.694927
agreement -7.2701316
sold -19.529985
confirm -11.897038
ingredi 17.147892
bipartisan 14.044764
scienc -30.571108
1325 -27.245216
ignor 6.285882
full 28.405983
oppos 20.146822
hous 28.82809
depth 23.73648
trump 19.270702
give 0.6376863
client 30.96847
mar 22.707613
other 28.161776
almost 21.076271
histori -12.609159
patient 1.3026972
mind -7.947359
1218 20.46387
camera -18.869518
wakeup 20.871012
neutral -34.64673
earlier -23.897842
wide -6.950249
gear -4.478115
factori 2.2868881
jacalyn -28.048223
9 33.362488
acupuncturist -8.176975
magnitud -5.988417
brinecbc -33.27124
said 17.39328
antiinflammatori -22.947401
person -19.428497
human 1.0708736
donald -24.96009
realli -30.17023
exposur -27.117147
25 -5.7458334
half 4.743243
creat 29.06485
fals 24.280756
particular 24.644295
discuss 9.227274
bethani -28.798447
manag 31.334667
newslett -10.167983
reduc -3.3735647
store -10.999636
first -2.2421536
lowwag -1.7452968
seemingli -11.568828
recruit 25.62768
follow 17.88647
instead -17.405798
40 -31.37228
brunswick 8.790305
urgent -6.471865
home -20.5244
decis 6.060975
leav -5.321039
jeanclaud -14.657361
mandatori -6.825587
sooner -6.605658
30day -10.321042
estim 27.61538
brennerreut 29.074606
gout -28.299522
site -30.819107
command -7.1687274
signatur -23.548916
tardif 4.812214
clerk 30.466366
increas -30.816761
visit 22.687803
park 22.307154
email 24.190449
credit -13.54431
week -30.555353
spend -6.1079745
coppercontain -7.6975765
chrystia -19.266233
irregular 5.5015306
plenti -5.1321707
highway 22.413334
behind 29.070435
1609 -14.774031
60 27.938046
question -7.1137476
republican 1.6452045
antimicrobi 34.40666
1200 8.837398
paper -28.604992
provinc 0.5809646
financ -18.38561
enough -9.909388
busi 24.163889
whatev -2.5424306
sight -19.674543
your 18.52019
replac -28.019009
wash 21.883062
sever 32.7655
psycholog -24.89329
swung 6.229654
centr -4.050283
least -15.235255
piec 34.152798
covidcbcca 9.591919
sinai 15.828254
direct -1.7521113
introduc 19.7038
distributor -21.228828
foreign -5.485772
food 2.229073
surviv 30.724573
chuck -6.7475853
provis -10.015209
transmiss -26.797262
health 20.447643
democraticcontrol -24.001017
novel -2.789692
hospit 21.681131
prime -3.9378242
patent -10.645904
invinc -1.9859153
heather -5.82365
success 12.691189
signific -6.309644
pandemiccovid19 3.6164627
schumer 34.168423
metal 27.123547
touch 18.728933
underscor 8.796048
known -32.88112
sideeffect -5.6260023
hrefhttpstcogpehoheem6pictwittercomgpehoheem6amdashicahnmountsinai -8.377054
continu 3.2851818
evict 28.703934
look -13.939879
longterm 0.16669647
rush -12.778448
guidelin 2.6124458
develop -10.575764
care 2.903183
across -2.1147811
defer -0.49456954
vote 32.918488
anywher 0.4651918
880page -2.5608847
day 23.556643
mean -35.344666
seattl 20.575691
technolog -11.521113
stream -17.302238
offer 11.357911
two -17.67875
age 33.14762
run 32.636024
immun -14.33609
sell 22.773413
board 6.589963
attend -8.9534855
without 18.203278
rep 22.762814
passag -29.84073
<Figure size 640x480 with 1 Axes>
1
​


## Machine learning

The data we have is unsupervised.therefore, we employed the unsupervised Machine Learning Type. 
Deriving the structure from data where we dont know the effect of any of the variables. 

Based on the content of the text, grouping the texts together in distinct folders. 
* Like cbc news text : in one folder.
* Associated Press News text : in second folder
* freelancer news : In 3rd Folder.

##### Unsupervised Learning Technique on News.csv
If the model can identify that these 30 to 40 texts are in regard to symptoms of COVID
and other ones are in regard to economic effects of COVID in certain regions and then we can group them into certain
bundles.

For classification Problems: we use three main performance metrics.
    1. Accuracy = No.of predicted correctly/Total number of observations
    2. Precision = No. of predicted as cbc news that are actually cbc/total no. predicted as cbc
    3. Recall = No. of predicted as cbc news that are actually cbc/ total no. that are actually cbc 

## RandomForestClassifier Attributes & Hyperparameters

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0

### RandomForestClassifier through Cross-Validation

In [45]:
from sklearn.model_selection import KFold, cross_val_score

In [46]:
rf = RandomForestClassifier(n_jobs=-1)
# n_jobs=-1 : By building indiviual decision trees to run in parallel.

In [47]:
k_fold = KFold(n_splits=5)
# Hyperparameter n_splits = so there are 5 subsets. 
# In 1st itertion model will train on first 4 and evaluate the 5th subset.
# In 2nd itertion model will train on 2,3,4,5 and evaluate the 1st subset and so on. ....

In [48]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,..."
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove...","[scientist, around, world, race, find, novel, ..."
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo...","[, trudeau, say, rule, quarantin, act, enforc,..."


In [53]:
# https://www.kaggle.com/washingtongold/fake-vs-real-news/data
# https://www.kaggle.com/ryanxjhan/cbc-news-coronavirus-articles-march-26

true = pd.read_csv('True.csv')
false = pd.read_csv('Fake.csv')

In [52]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [54]:
false.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [55]:
true['true'] = 1
false['true'] = 0

In [56]:
true.head()

Unnamed: 0,title,text,subject,date,true
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [57]:
data = pd.concat([true,false])

In [60]:
import re
def preprocess_text(sentence):
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence.lower()

In [63]:
data['cleaned'] = data['text'].apply(preprocess_text)

In [65]:
corona_news = pd.read_csv('news.csv')

In [67]:
corona_news['cleaned'] = corona_news['text'].apply(preprocess_text)

In [69]:
list1 = ''.join(data['cleaned'].tolist()).split(' ')

In [1]:
#list1
# So this list has the text of true and false news i.e the real news and fake news.