### Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('news.csv', delimiter =',')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,authors,title,publish_date,description,text,url
0,0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,2020-03-27 08:00:00,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,https://www.cbc.ca/news/health/covid-19-drug-s...
1,1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,2020-03-27 01:45:00,The Yukon government has identified two places...,The Yukon government has identified two places...,https://www.cbc.ca/news/canada/north/yukon-cor...
2,2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,2020-03-26 05:13:00,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,https://www.cbc.ca/news/world/senate-coronavir...
3,3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,2020-03-27 00:36:00,Scientists around the world are racing to find...,Scientists around the world are racing to find...,https://www.cbc.ca/news/health/coronavirus-tre...
4,4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,2020-03-26 20:57:00,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,https://www.cbc.ca/news/the-latest-on-the-coro...


In [4]:
df.shape

(3566, 7)

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0
count,3566.0
mean,2455.649748
std,1298.52945
min,0.0
25%,1473.25
50%,2496.5
75%,3569.75
max,4608.0


#### Dropping unwanted columns

In [6]:
df.drop(["Unnamed: 0",'publish_date','url'], axis = 1, inplace = True)

In [7]:
df.head()

Unnamed: 0,authors,title,description,text
0,['Cbc News'],Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,['Cbc News'],Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,['The Associated Press'],U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,['Cbc News'],Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,['Cbc News'],The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


In [12]:
df['authors'] = df['authors'].str.strip('[]')
df['authors'] = df['authors'].str.strip('  ''')
df['authors'] = df.authors.str.replace("[({':]", "")
df['authors'] = df['authors'].str.lower()

In [13]:
df.head()

Unnamed: 0,authors,title,description,text
0,cbc news,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,cbc news,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,the associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,cbc news,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,cbc news,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


In [10]:
df['authors'].nunique()

261

In [11]:
df['authors'].unique()

array(['Cbc News', 'The Associated Press',
       'Mark Gollom Is A Toronto-Based Reporter With Cbc News. He Covers Canadian, U.S. Politics, Current Affairs.',
       'Thomson Reuters',
       'Leah Hendry Is A Tv, Radio, Online Journalist With Cbc Montreal Investigates. Contact Her Via Our Confidential Tipline, Or On Email At Montrealinvestigates Cbc.Ca., Follow Leah On Twitter',
       'Reporter, Web Writer For Cbc Saskatoon, Story Tips, Ideas Welcomed At Guy.Quenneville Cbc.Ca, Follow Guy On Twitter',
       'Jorge Barrera Is A Caracas-Born, Award-Winning Journalist Who Has Worked Across The Country, "Internationally. He Works For CbcS Indigenous Unit Based Out Of Ottawa. Follow Him On Twitter", Jorgebarrera Or Email Him Jorge.Barrera Cbc.Ca., Follow, Jorgebarrera On Twitter',
       'Colleen M. Flood Is Director Of The Centre For Health Law, Policy, Ethics, University Research Chair At The University Of Ottawa.',
       'Producer, Cbc News Business, James Dunne Researches, Produces

#### Cleaning Authors Column

In [33]:
import re

df['authors'].replace(to_replace = [r'cbcs?\b.*',r'.*\bcbcs?', r'.*cbcnews.*'], value='cbc', regex=True, inplace=True)
df['authors'].replace(to_replace = ['the associated press'], value='associated press', inplace=True)
df['authors'].replace(to_replace = [r'canadian?\b.*',r'.*\bcanadian?'], value='canadian', regex=True, inplace=True)
df['authors'].replace(to_replace = [r'freelancer?\b.*',r'.*\bfreelancer?'], value='freelancer', regex=True, inplace=True)


In [34]:
df['authors']

0                                                     cbc
1                                                     cbc
2                                        associated press
3                                                     cbc
4                                                     cbc
5                                                     cbc
6                                                     cbc
7                                        associated press
8                                         thomson reuters
9                                                     cbc
10                                                    cbc
11                                                    cbc
12      colleen m. flood is director of the centre for...
13                                                    cbc
14                                                    cbc
15                                                    cbc
16                                                       
17            

In [35]:
df['authors'].nunique()

36

## Explore the dataset

In [37]:
# What is the shape of the dataset?

print("Input data has {} rows and {} columns".format(len(df), len(df.columns)))

Input data has 3566 rows and 4 columns


In [38]:
# How many News Authors are there? Considering Authors as Label.

print("Out of {} rows, {} are CBC News, {} are Associated Press".format(len(df),
                                                       len(df[df['authors']=='Cbc News']),
                                                       len(df[df['authors']=='The Associated Press'])))

Out of 3566 rows, 0 are CBC News, 0 are Associated Press


In [39]:
# How much missing data is there?

print("Number of null in label: {}".format(df['authors'].isnull().sum()))
print("Number of null in text: {}".format(df['text'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


## NLP Basics: Implementing a pipeline to clean text

### Pre-processing text data

Cleaning up the text data is necessary to highlight attributes. These would be loaded to machine learning system to pick up on. Cleaning (or pre-processing) the data typically consists of a number of steps:
1. **Remove punctuation**
2. **Tokenization**
3. **Remove stopwords**
4. **Lemmatize/Stem**

In [40]:
df.head()

Unnamed: 0,authors,title,description,text
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...


### Remove punctuation

In [41]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [42]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

In [43]:
df['body_text_clean'] = df['text'].apply(lambda x: remove_punct(x))

In [44]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...


### Tokenization

In [45]:
import re

In [46]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [47]:
df['body_text_tokenized'] = df['body_text_clean'].apply(lambda x: tokenize(x.lower()))

In [48]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp..."
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ..."
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ..."


### Remove stopwords

In [49]:
import nltk

#from nltk.corpus import stopwords
#stopwords.words('english')

In [50]:
stopword = nltk.corpus.stopwords.words('english')

In [25]:
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [51]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

In [52]:
df['body_text_nostop'] = df['body_text_tokenized'].apply(lambda x: remove_stopwords(x))

In [53]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled..."
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove..."
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo..."


## Supplemental Data Cleaning: Using Stemming

In [54]:
import nltk

ps = nltk.PorterStemmer()   # Test out Porter stemmer

In [65]:
# dir(ps)

* Examples

In [66]:
#print(ps.stem('grows'))
#print(ps.stem('growing'))
#print(ps.stem('grow'))

In [67]:
#print(ps.stem('run'))
#print(ps.stem('running'))
#print(ps.stem('runner'))

### Stem text

In [58]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

In [59]:
df['body_text_stemmed'] = df['body_text_nostop'].apply(lambda x: stemming(x))

In [60]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,..."
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove...","[scientist, around, world, race, find, novel, ..."
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo...","[, trudeau, say, rule, quarantin, act, enforc,..."


In [63]:
#df.groupby(['authors']).sum()

### Supplemental Data Cleaning: Using a Lemmatizer

In [68]:
# nltk.download()
import nltk

wn = nltk.WordNetLemmatizer()   # https://wordnet.princeton.edu/
ps = nltk.PorterStemmer()

In [None]:
#dir(wn)

* Examples

In [None]:
#print(ps.stem('meanness'))
#print(ps.stem('meaning'))

In [None]:
#print(wn.lemmatize('meanness'))
#print(wn.lemmatize('meaning'))

In [70]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

In [71]:
df['body_text_lemmatized'] = df['body_text_nostop'].apply(lambda x: lemmatizing(x))

In [72]:
df.head(10)

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe...","[canadian, pharmacy, limiting, much, medicatio..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor...","[yukon, government, identified, two, place, wh..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,...","[senate, late, wednesday, passed, unparalleled..."
3,cbc,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove...","[scientist, around, world, race, find, novel, ...","[scientist, around, world, racing, find, novel..."
4,cbc,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo...","[, trudeau, say, rule, quarantin, act, enforc,...","[, trudeau, say, rule, quarantine, act, enforc..."
5,cbc,'Worse' pandemic on horizon unless world deals...,"The continued existence of wildlife markets, w...","The continued existence of wildlife markets, w...",The continued existence of wildlife markets wh...,"[the, continued, existence, of, wildlife, mark...","[continued, existence, wildlife, markets, cons...","[continu, exist, wildlif, market, consid, pote...","[continued, existence, wildlife, market, consi..."
6,cbc,What you need to know about COVID-19 in Ottawa...,CBC Ottawa's latest roundup of key points duri...,Recent developments: Two new cases in western...,Recent developments Two new cases in western ...,"[recent, developments, two, new, cases, in, we...","[recent, developments, two, new, cases, wester...","[recent, develop, two, new, case, western, que...","[recent, development, two, new, case, western,..."
7,associated press,Michigan hospitals jammed as coronavirus cases...,Michigan hospitals are bracing for a surge of ...,Michigan hospitals are bracing for a surge of ...,Michigan hospitals are bracing for a surge of ...,"[michigan, hospitals, are, bracing, for, a, su...","[michigan, hospitals, bracing, surge, coronavi...","[michigan, hospit, brace, surg, coronaviru, ca...","[michigan, hospital, bracing, surge, coronavir..."
8,thomson reuters,U.S. coronavirus cases now highest in the world,The number of confirmed COVID-19 cases in the ...,The number of confirmed COVID-19 cases in the ...,The number of confirmed COVID19 cases in the U...,"[the, number, of, confirmed, covid19, cases, i...","[number, confirmed, covid19, cases, us, rose, ...","[number, confirm, covid19, case, us, rose, 838...","[number, confirmed, covid19, case, u, rose, 83..."
9,cbc,"'Avoid the emergency' pleads Jewish General, a...","Montreal's Jewish General Hospital, one of the...",The Jewish General Hospital plans to set up tr...,The Jewish General Hospital plans to set up tr...,"[the, jewish, general, hospital, plans, to, se...","[jewish, general, hospital, plans, set, traile...","[jewish, gener, hospit, plan, set, trailer, ou...","[jewish, general, hospital, plan, set, trailer..."


### Applying punctuation, Tokenization, Remove stopwords, Lemmatization and Stemming

In [48]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopword]
    return text

In [49]:
df.head()

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,Cbc News,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe...","[canadian, pharmacy, limiting, much, medicatio..."
1,Cbc News,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor...","[yukon, government, identified, two, place, wh..."
2,The Associated Press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,...","[senate, late, wednesday, passed, unparalleled..."
3,Cbc News,Coronavirus: The latest in drug treatment and ...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,Scientists around the world are racing to find...,"[scientists, around, the, world, are, racing, ...","[scientists, around, world, racing, find, nove...","[scientist, around, world, race, find, novel, ...","[scientist, around, world, racing, find, novel..."
4,Cbc News,The latest on the coronavirus outbreak for Mar...,The latest on the coronavirus outbreak from CB...,Trudeau says rules of Quarantine Act will ...,Trudeau says rules of Quarantine Act will ...,"[, trudeau, says, rules, of, quarantine, act, ...","[, trudeau, says, rules, quarantine, act, enfo...","[, trudeau, say, rule, quarantin, act, enforc,...","[, trudeau, say, rule, quarantine, act, enforc..."


### Vectorizing Raw Data

Implemented 3 types of vectorization Models. They are:

1. count Vectorization
2. Bag-Of-words Model (TF-IDF)
3. N-gram Vectorization or Skip Gram

### Count vectorization 

Creates a document-term matrix where the entry of each cell will be a count of the number of times that word occurred in that document.

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)  # call the function clean_text. 
X_counts = count_vect.fit_transform(df['text'])

# X_counts is the vectorized version of the data.

print(X_counts.shape)
#print(count_vect.get_feature_names())  # get_feature_names() prints all the unique words that are found in all of the text messages.

# (3566, 31399) : 3566 text messages, across this 3566 text messages there are 31399 unique words.
# It means the document term matrix consists of 3566 rows and 31399 columns.

# Each row : Text message and the column will have unique words of this text message.

# get_feature_names() : means the numbers are names of the columns. We can remove the numbers and change it to names.





(3566, 31399)


In [56]:
# Apply CountVectorizer to smaller sample

data_sample = df[0:20]

count_vect_sample = CountVectorizer(analyzer=clean_text)
X_counts_sample = count_vect_sample.fit_transform(data_sample['text'])
print(X_counts_sample.shape)
#print(count_vect_sample.get_feature_names())

# (20, 2951) : 20 rows and 2951 columns.
# There are new feature names. 
# 
# 
print(count_vect_sample)


(20, 2951)
CountVectorizer(analyzer=<function clean_text at 0x1a229fe268>, binary=False,
                decode_error='strict', dtype=<class 'numpy.int64'>,
                encoding='utf-8', input='content', lowercase=True, max_df=1.0,
                max_features=None, min_df=1, ngram_range=(1, 1),
                preprocessor=None, stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None,
                vocabulary=None)


In [None]:
## The raw data output of the count vector is called as the Sparse Vector.


In [57]:
# Vectorizers output sparse matrices
#Sparse Matrix: A matrix in which most entries are 0. In the interest of efficient storage, 
#    a sparse matrix will be stored by only storing the locations of the non-zero elements.

X_counts_sample

<20x2951 sparse matrix of type '<class 'numpy.int64'>'
	with 7096 stored elements in Compressed Sparse Row format>

In [58]:
X_counts_df = pd.DataFrame(X_counts_sample.toarray())
X_counts_df

# This is the document term Matrix. It has 2951 columns and 20 rows. 
# Values are numbered from 0 to 191 in the columns. 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2941,2942,2943,2944,2945,2946,2947,2948,2949,2950
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,1,1,0,1,1,...,1,0,0,0,0,0,1,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
X_counts_df.columns = count_vect_sample.get_feature_names()  # Assigning the feature names to the columns
X_counts_df

Unnamed: 0,Unnamed: 1,000,025,026,042,049,057,1,10,100,...,your,youv,yuje,yukon,z,zarychanski,zibi,ziomek,zmiyiwski,zone
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,1,1,0,1,1,...,1,0,0,0,0,0,1,0,1,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Bag Of Words Model

Bag-of-words model:
1.	Count how many times does a word occur in each message (Known as term frequency) (TF)
2.	Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)- (IDF)
3.	Normalize the vectors to unit length, to abstract from the original text length (L2 norm)

In [60]:
len(df)

3566

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
# Might take awhile...
train_data = df[0:1000]
bow_transformer = CountVectorizer(analyzer=clean_text).fit(train_data['text'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

17331


In [65]:
newstest1 = train_data['text'][3]
#print(newstest1)

In [66]:
bowtest1 = bow_transformer.transform([newstest1])
print(bowtest1)
print(bowtest1.shape)

  (0, 172)	1
  (0, 223)	1
  (0, 332)	1
  (0, 368)	1
  (0, 434)	1
  (0, 645)	2
  (0, 706)	1
  (0, 739)	1
  (0, 777)	1
  (0, 1015)	1
  (0, 1263)	1
  (0, 1710)	1
  (0, 1713)	1
  (0, 1722)	1
  (0, 1751)	2
  (0, 1781)	1
  (0, 1793)	1
  (0, 1857)	1
  (0, 1864)	1
  (0, 1939)	1
  (0, 2026)	2
  (0, 2170)	1
  (0, 2190)	1
  (0, 2193)	1
  (0, 2213)	3
  :	:
  (0, 15848)	4
  (0, 15855)	1
  (0, 16075)	1
  (0, 16176)	4
  (0, 16300)	1
  (0, 16303)	1
  (0, 16319)	2
  (0, 16352)	10
  (0, 16433)	1
  (0, 16460)	1
  (0, 16558)	2
  (0, 16609)	1
  (0, 16700)	1
  (0, 16721)	1
  (0, 16743)	2
  (0, 16778)	1
  (0, 16804)	1
  (0, 16870)	2
  (0, 16910)	1
  (0, 16980)	1
  (0, 17011)	1
  (0, 17021)	2
  (0, 17036)	4
  (0, 17203)	1
  (0, 17269)	2
(1, 17331)


In [131]:

#print(bow_transformer.get_feature_names())
#print(bow_transformer.get_feature_names()[9570])

In [67]:
news_bows = bow_transformer.transform(train_data['text'])

In [68]:
print('Shape of Sparse Matrix: ', news_bows.shape)
print('Amount of Non-Zero occurences: ', news_bows.nnz)

Shape of Sparse Matrix:  (1000, 17331)
Amount of Non-Zero occurences:  260806


In [134]:
sparsity = (100.0 * news_bows.nnz / (news_bows.shape[0] * news_bows.shape[1]))
print('sparsity: {}'.format(round(sparsity)))

sparsity: 0


## TF-IDF

In [69]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(news_bows)
tfidf4 = tfidf_transformer.transform(bowtest1)
print(tfidf4)

  (0, 17269)	0.13058072787464675
  (0, 17203)	0.0296249945017847
  (0, 17036)	0.0608861449974719
  (0, 17021)	0.0276023897036252
  (0, 17011)	0.026468178204935924
  (0, 16980)	0.041739948472949276
  (0, 16910)	0.03634719746060934
  (0, 16870)	0.044399747248866445
  (0, 16804)	0.017694428664810868
  (0, 16778)	0.013397644457302273
  (0, 16743)	0.03890585323985886
  (0, 16721)	0.03249554585605943
  (0, 16700)	0.02251463549972901
  (0, 16609)	0.0349787853183564
  (0, 16558)	0.025227955942080483
  (0, 16460)	0.029983397450680554
  (0, 16433)	0.03530198008474523
  (0, 16352)	0.31799164530439683
  (0, 16319)	0.03615701436040493
  (0, 16303)	0.03451465838070459
  (0, 16300)	0.024674975953548815
  (0, 16176)	0.08833841863248833
  (0, 16075)	0.06162151635796847
  (0, 15855)	0.049864965335699736
  (0, 15848)	0.15756705451835676
  :	:
  (0, 2213)	0.16604873998444136
  (0, 2193)	0.020511552193603216
  (0, 2190)	0.017303454619042147
  (0, 2170)	0.033250023634791376
  (0, 2026)	0.024135565049104336


In [70]:
messages_tfidf = tfidf_transformer.transform(news_bows)
print(messages_tfidf.shape)

(1000, 17331)


## Training the model

In [71]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, train_data['authors'])

In [72]:
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', train_data.authors[3])

predicted: Cbc News
expected: Cbc News


Created a classification to predict CBC News

### Model Evaluation

In [73]:
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

['Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' '' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' ''
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' ''
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News' 'Cbc News'
 'Cbc News' 'Cbc News' '' 'Cbc News' 'Cbc News' '' '' '' 'Cbc News'
 'Cbc News' 'Cbc News' 'C

In [74]:
from sklearn.metrics import classification_report
print (classification_report(train_data['authors'], all_predictions))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                            precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 0.80      0.51      0.62       270

  'precision', 'predicted', average, warn_for)


In [75]:
test_data  = df[1001:]

In [76]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

## N-gram Vectorization

In [73]:
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import trigrams

In [74]:
df.head(3)

Unnamed: 0,authors,title,description,text,body_text_clean,body_text_tokenized,body_text_nostop,body_text_stemmed,body_text_lemmatized
0,cbc,Coronavirus a 'wake-up call' for Canada's pres...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,Canadian pharmacies are limiting how much medi...,"[canadian, pharmacies, are, limiting, how, muc...","[canadian, pharmacies, limiting, much, medicat...","[canadian, pharmaci, limit, much, medic, dispe...","[canadian, pharmacy, limiting, much, medicatio..."
1,cbc,Yukon gov't names 2 possible sources of corona...,The Yukon government has identified two places...,The Yukon government has identified two places...,The Yukon government has identified two places...,"[the, yukon, government, has, identified, two,...","[yukon, government, identified, two, places, w...","[yukon, govern, identifi, two, place, whitehor...","[yukon, government, identified, two, place, wh..."
2,associated press,U.S. Senate passes $2T coronavirus relief package,The Senate has passed an unparalleled $2.2 tri...,The Senate late Wednesday passed an unparallel...,The Senate late Wednesday passed an unparallel...,"[the, senate, late, wednesday, passed, an, unp...","[senate, late, wednesday, passed, unparalleled...","[senat, late, wednesday, pass, unparallel, 22,...","[senate, late, wednesday, passed, unparalleled..."


In [79]:
data = df['body_text_clean'].to_string()

In [80]:
no_specials_string = re.sub('[!#?,.:";]', '', data)

In [78]:
#no_specials_string

In [81]:
p=PunktLanguageVars()
tokens=p.word_tokenize(no_specials_string)

In [82]:
# See the last 10 pairs
for i in range(len(tokens)-30, len(tokens)-1):
    print(tokens[i], tokens[i+1])

in h
h 3563
3563 Its
Its a
a relic
relic of
of a
a bygone
bygone era
era when
when wealthy
wealthy Nova
Nova 3564
3564 Camels
Camels have
have been
been implicated
implicated in
in a
a Middle
Middle Easter
Easter 3565
3565 A
A World
World Health
Health Organizationled
Organizationled group
group of
of expert


In [57]:
word_pairs = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
print(len(word_pairs))

gram2 = set(word_pairs)
print(len(gram2))

# Print 20 elements from gram2
gram2_iter = iter(gram2)
print([next(gram2_iter) for i in range(50)])

31854
19823
[('Isobel', 'Mackenzie'), ('and', 'first'), ('d', '206'), ('more', 'p'), ('1472', 'It'), ('Friday', 'Kraft'), ('Windsor', 'announced'), ('diamond', 'mine'), ('toward', 'the'), ('a', 'hug'), ('3361', 'Canadians'), ('Algonquin', 'First'), ('of', 'phishing'), ('Rosa', 'desperately'), ('The', 'Brewer'), ('anchored', 'just'), ('on', '1588'), ('2333', 'Manitobans'), ('fourthquarter', '2764'), ('ongo', '495'), ('Follow', 'the'), ('manslaughter', 'and'), ('ma', '1970'), ('April', 'is'), ('has', 'added'), ('and', 'vaccines'), ('1882', 'Its'), ('pos', '182'), ('stood', 'shoulder'), ('As', 'country'), ('82', 'A'), ('coronavirus', 'pandemic'), ('call', 'centre'), ('declared', 'a'), ('are', 'working'), ('i', '2799'), ('su', '2128'), ('in', 'Greater'), ('provincial', 'telephone'), ('2894', 'Last'), ('its', 'final'), ('COVID', '3210'), ('The', 'death'), ('since', 'an'), ('Halton', 'Region'), ('1651', 'An'), ('Hes', 'a'), ('to', '1420'), ('Some', 'who'), ('PE', '337')]


In [58]:
gram2 = dict()

# Populate 2-gram dictionary
for i in range(len(tokens)-1):
    key = (tokens[i], tokens[i+1])
    if key in gram2:
        gram2[key] += 1
    else:
        gram2[key] = 1

# Turn into a list of (word, count) sorted by count from most to least

gram2 = sorted(gram2, key=lambda count: count)

# Print top 20 most frequent words
print(gram2[:50])

[('0', 'Canadian'), ('1', '983'), ('1', 'The'), ('1', 'a'), ('1', 'milli'), ('10', '870'), ('10', '874'), ('10', 'Read'), ('10', 'Three'), ('10', 'billion'), ('10', 'cities'), ('10', 'days'), ('10', 'metal'), ('10', 'people'), ('100', 'Calgarians'), ('100', 'Ford'), ('100', 'm'), ('100', 'people'), ('1000', 'In'), ('1000', 'a'), ('1000', '—'), ('100000', 'globa'), ('1001', 'Beverly'), ('1002', 'When'), ('1003', 'The'), ('1004', 'Getting'), ('1005', 'Murray'), ('1006', 'For'), ('1007', 'Prime'), ('1008', 'Youve'), ('1009', 'On'), ('101', 'The'), ('1010', 'Music'), ('1011', 'Youve'), ('1012', 'The'), ('1013', 'Ottawa'), ('1014', 'Alberta'), ('1015', 'The'), ('1016', 'The'), ('1017', 'As'), ('1018', 'Gunmen'), ('1019', 'Provincial'), ('102', 'Starbucks'), ('1020', 'BC'), ('1021', 'CALGARY'), ('1022', 'As'), ('1023', 'Premier'), ('1024', 'Starting'), ('1025', 'After'), ('1026', 'Some')]


In [59]:
b=bigrams(tokens)
[x for x in b]

[('0', 'Canadian'),
 ('Canadian', 'pharmacies'),
 ('pharmacies', 'are'),
 ('are', 'limiting'),
 ('limiting', 'how'),
 ('how', 'much'),
 ('much', 'medi'),
 ('medi', '1'),
 ('1', 'The'),
 ('The', 'Yukon'),
 ('Yukon', 'government'),
 ('government', 'has'),
 ('has', 'identified'),
 ('identified', 'two'),
 ('two', 'places'),
 ('places', '2'),
 ('2', 'The'),
 ('The', 'Senate'),
 ('Senate', 'late'),
 ('late', 'Wednesday'),
 ('Wednesday', 'passed'),
 ('passed', 'an'),
 ('an', 'unparallel'),
 ('unparallel', '3'),
 ('3', 'Scientists'),
 ('Scientists', 'around'),
 ('around', 'the'),
 ('the', 'world'),
 ('world', 'are'),
 ('are', 'racing'),
 ('racing', 'to'),
 ('to', 'find'),
 ('find', '4'),
 ('4', 'Trudeau'),
 ('Trudeau', 'says'),
 ('says', 'rules'),
 ('rules', 'of'),
 ('of', 'Quarantine'),
 ('Quarantine', 'Act'),
 ('Act', 'will'),
 ('will', '5'),
 ('5', 'The'),
 ('The', 'continued'),
 ('continued', 'existence'),
 ('existence', 'of'),
 ('of', 'wildlife'),
 ('wildlife', 'markets'),
 ('markets', 'w

In [60]:
t=trigrams(tokens)
[x for x in t]

[('0', 'Canadian', 'pharmacies'),
 ('Canadian', 'pharmacies', 'are'),
 ('pharmacies', 'are', 'limiting'),
 ('are', 'limiting', 'how'),
 ('limiting', 'how', 'much'),
 ('how', 'much', 'medi'),
 ('much', 'medi', '1'),
 ('medi', '1', 'The'),
 ('1', 'The', 'Yukon'),
 ('The', 'Yukon', 'government'),
 ('Yukon', 'government', 'has'),
 ('government', 'has', 'identified'),
 ('has', 'identified', 'two'),
 ('identified', 'two', 'places'),
 ('two', 'places', '2'),
 ('places', '2', 'The'),
 ('2', 'The', 'Senate'),
 ('The', 'Senate', 'late'),
 ('Senate', 'late', 'Wednesday'),
 ('late', 'Wednesday', 'passed'),
 ('Wednesday', 'passed', 'an'),
 ('passed', 'an', 'unparallel'),
 ('an', 'unparallel', '3'),
 ('unparallel', '3', 'Scientists'),
 ('3', 'Scientists', 'around'),
 ('Scientists', 'around', 'the'),
 ('around', 'the', 'world'),
 ('the', 'world', 'are'),
 ('world', 'are', 'racing'),
 ('are', 'racing', 'to'),
 ('racing', 'to', 'find'),
 ('to', 'find', '4'),
 ('find', '4', 'Trudeau'),
 ('4', 'Trudeau',

In [61]:
five_gram=ngrams(tokens,5)
[x for x in five_gram]

[('0', 'Canadian', 'pharmacies', 'are', 'limiting'),
 ('Canadian', 'pharmacies', 'are', 'limiting', 'how'),
 ('pharmacies', 'are', 'limiting', 'how', 'much'),
 ('are', 'limiting', 'how', 'much', 'medi'),
 ('limiting', 'how', 'much', 'medi', '1'),
 ('how', 'much', 'medi', '1', 'The'),
 ('much', 'medi', '1', 'The', 'Yukon'),
 ('medi', '1', 'The', 'Yukon', 'government'),
 ('1', 'The', 'Yukon', 'government', 'has'),
 ('The', 'Yukon', 'government', 'has', 'identified'),
 ('Yukon', 'government', 'has', 'identified', 'two'),
 ('government', 'has', 'identified', 'two', 'places'),
 ('has', 'identified', 'two', 'places', '2'),
 ('identified', 'two', 'places', '2', 'The'),
 ('two', 'places', '2', 'The', 'Senate'),
 ('places', '2', 'The', 'Senate', 'late'),
 ('2', 'The', 'Senate', 'late', 'Wednesday'),
 ('The', 'Senate', 'late', 'Wednesday', 'passed'),
 ('Senate', 'late', 'Wednesday', 'passed', 'an'),
 ('late', 'Wednesday', 'passed', 'an', 'unparallel'),
 ('Wednesday', 'passed', 'an', 'unparallel'

* 1. Prediction of all the wrods that will follow the certain word in search. 
    ** example : "Government has" is entered. then it would display the list of words that follow the co 

* 2. Plotter chart : 

## Install TensorFlow Library Work Module

In [143]:
import tensorflow as tf

In [144]:
var = 0
def countWords(text):
    v = 0
    for i in text:
        v = v + 1
    print(v)
print(var)

0


In [145]:
words = []

In [146]:
# Stemmed vectors are taken as input. 
# Article size: 5 articles are taken.

articles = df.body_text_stemmed.tolist()
for article in articles[0:5]:
    for word in article:
        words.append(word)
        
len(words)

59935

In [147]:
words[1]

'pharmaci'

In [148]:
words = set(words)

In [149]:
word2int = {}
int2word = {}
vocab_size = len(words) # gives the total number of unique words
for i,word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

In [150]:
print(word2int['pharmaci'])
len(words)

5523


6650

In [151]:
dataTF = []

WINDOW_SIZE = 2

# based on the window size, the neighbouring word or context words is considered for the current word.
# So if the window size is 2, the surrounding words which are at a distance of two words from the current word is processed.

for article in articles[0:5]:
    for word_index, word in enumerate(article):
        for nb_word in article[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(article)) + 1] : 
            if nb_word != word:
                dataTF.append([word, nb_word])

In [152]:
dataTF[0:5]

[['canadian', 'pharmaci'],
 ['canadian', 'limit'],
 ['pharmaci', 'canadian'],
 ['pharmaci', 'limit'],
 ['pharmaci', 'much']]

In [153]:
dataTF[0]

['canadian', 'pharmaci']

In [154]:
#word2int

In [155]:
word2int["canadian"]  # word2int[ data_word[0] ] = word2int["canadian"] = word2int[dataTF[0]]

5879

In [156]:
word2int["pharmaci"]

5523

In [157]:
word2int['covid19']

1490

In [158]:
vocab_size = len(words)
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)     # Initializing the temp array with Zeros till the vocab size. 
    temp[data_point_index] = 1    # assigning one value to the data point index. 
    return temp
x_train = [] # input word
y_train = [] # output word


In [159]:
vocab_size

6650

In [160]:
for data_word in dataTF:
    x_train.append(to_one_hot(word2int[ data_word[0] ], vocab_size))
    y_train.append(to_one_hot(word2int[ data_word[1] ], vocab_size))
# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

In [161]:
# Run this in order to make tensor flow placeholder run on the machine
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [162]:
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

In [163]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [164]:
print(x_train.shape, y_train.shape)

# meaning 9272 training points, where each point has 1101 dimensions
# or 9272 rows and each row has 1101 columns. 

(9272, 6650) (9272, 6650)


In [165]:
EMBEDDING_DIM = 5 # you can choose your own number
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias
hidden_representation = tf.add(tf.matmul(x,W1), b1)

In [166]:
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [None]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init) #make sure you do this!
# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))
# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)
n_iters = 10000
# train for n_iter iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

loss is :  13.643272
loss is :  13.299753
loss is :  13.015816
loss is :  12.780971
loss is :  12.586647
loss is :  12.4258175
loss is :  12.292683
loss is :  12.182464
loss is :  12.091197
loss is :  12.015605
loss is :  11.952971
loss is :  11.901046
loss is :  11.857963
loss is :  11.822178
loss is :  11.792411
loss is :  11.767607
loss is :  11.746886
loss is :  11.729528
loss is :  11.71494
loss is :  11.702629
loss is :  11.692192
loss is :  11.683293
loss is :  11.675658
loss is :  11.669065
loss is :  11.663329
loss is :  11.658294
loss is :  11.653835
loss is :  11.649851
loss is :  11.646258
loss is :  11.642989
loss is :  11.639983
loss is :  11.637195
loss is :  11.634585
loss is :  11.632122
loss is :  11.629783
loss is :  11.6275425
loss is :  11.625387
loss is :  11.6233015
loss is :  11.621271
loss is :  11.619291
loss is :  11.617351
loss is :  11.615442
loss is :  11.613567
loss is :  11.611709
loss is :  11.609874
loss is :  11.608054
loss is :  11.606253
loss is :  

loss is :  11.136707
loss is :  11.135594
loss is :  11.134484
loss is :  11.133375
loss is :  11.132265
loss is :  11.13116
loss is :  11.130053
loss is :  11.128946
loss is :  11.127843
loss is :  11.126739
loss is :  11.125637
loss is :  11.124534
loss is :  11.123433
loss is :  11.1223345
loss is :  11.121238
loss is :  11.120138
loss is :  11.11904
loss is :  11.117944
loss is :  11.116849
loss is :  11.115757
loss is :  11.114662
loss is :  11.11357
loss is :  11.112481
loss is :  11.111389
loss is :  11.1103
loss is :  11.109212
loss is :  11.108125
loss is :  11.107038
loss is :  11.10595
loss is :  11.104869
loss is :  11.103782
loss is :  11.1027
loss is :  11.101618
loss is :  11.100537
loss is :  11.099456
loss is :  11.098376
loss is :  11.097298
loss is :  11.09622
loss is :  11.095143
loss is :  11.094069
loss is :  11.092994
loss is :  11.091919
loss is :  11.090847
loss is :  11.089774
loss is :  11.088699
loss is :  11.087632
loss is :  11.086563
loss is :  11.085494


In [None]:
# For Articles = 100, Window Size = 5
# Started at 12.55 AM
# 

In [59]:
print(sess.run(W1))
print('----------')
print(sess.run(b1))
print('----------')

[[ 0.74582875  0.7793943   0.07386164  0.15888414 -1.0550942 ]
 [ 0.98273414  0.0811931   0.0202823   0.3914135  -0.6963007 ]
 [ 0.01152561 -0.315974   -0.21288878  0.15460855  0.8525657 ]
 ...
 [-0.5787782   2.55173    -0.3985736   0.58246446  0.26849777]
 [-0.8640642   0.09151337  0.37342748  0.8094642   0.5006471 ]
 [ 2.7362478  -0.53771824 -1.2462976   1.5780292  -0.5305438 ]]
----------
[0.2118817  0.09580352 0.28877154 0.22681047 0.16917028]
----------


In [60]:
vectors = sess.run(W1 + b1)

In [61]:
def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1-vec2)**2))
def find_closest(word_index, vectors):
    min_dist = 10000 # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index

In [92]:
# We will now query these vectors with ‘king’, ‘queen’ and ‘royal’
print(int2word[find_closest(word2int['canadian'], vectors)])
print(int2word[find_closest(word2int['pharmaci'], vectors)])
print(int2word[find_closest(word2int['covid19'], vectors)])

packag
disappoint
follow


In [62]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

In [63]:
from sklearn.manifold import TSNE
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

In [64]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
for word in words:
    print(word, vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0],vectors[word2int[word]][1] ))
plt.show()

 -3.8947728
face -15.572499
anim 1.0498935
analysi -27.109291
market 22.697393
ship 2.409625
1000 16.15856
could 9.423725
critic 33.66415
remot -1.252983
leader 4.5464296
tumult -13.851024
afford -8.124293
entertain -0.89187616
warn -9.086381
forc -7.2234974
cap -25.375395
respiratori -10.305653
toxic 9.459612
10000 33.244785
spong -28.22298
start 24.725822
justin -29.294842
anyth -17.52146
basebal 28.959248
initi 29.42309
addit -10.913372
brief 16.302504
option -16.439035
specialist 28.440205
coronaviru 16.817432
pressur 25.393217
view -2.5587611
ordersanyon -17.755936
six 30.785965
kanopi -1.4266641
dispens 21.57296
deputi 17.981155
equal -1.5556762
blast -10.843041
stress -16.035044
right -19.541338
spiral 18.063726
wish -11.98849
treatment -31.669596
biotech 26.972506
whose -35.80998
alcohol 6.4318314
settl 0.11116588
trick 19.133835
journal 22.785318
suppli 21.73817
work 26.32507
permanent 16.523457
confin 29.671194
break 27.993616
attent -20.050459
appear -5.8178678
600 -9.580401

150000 11.814204
begin -5.1323147
trade -9.847184
sick -3.5123818
cover 31.386969
track -11.5770645
blend 26.81797
mislead 4.4110937
here 22.17565
focus -20.241213
outbreak 3.22317
per 15.683399
base -18.966576
compli 19.06111
14 -16.657051
secur -4.1326256
rheumatoid -36.39707
ill -2.0389738
second 26.279608
cattl 29.944613
whitehors 21.54646
centuryold 8.104972
astronaut -31.792763
broad 28.730446
8 27.615168
4000 5.9088907
unwash -29.632307
includ 15.738359
burden -35.552666
quebec 28.910887
spokesperson 18.724125
prevent -17.98592
april -28.35252
fee 27.535082
deal 5.777824
especi 14.885817
old 29.056107
hurt 10.017166
unclear -1.3694985
request 23.889174
rich 7.2149005
back 33.600864
essenti 29.069126
misinform 10.14035
glove 32.008663
globe 34.785057
encourag -34.277042
parent -3.536217
watch 3.7644677
wasnt -29.145367
expertis 33.435978
prepar 10.283645
despit -0.68040794
herbal 16.936924
propos -13.624949
big -0.78120065
13 21.351345
sanit 2.8905737
famili 7.733819
anyon 31.139

<Figure size 640x480 with 1 Axes>