In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
import re
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.regularizers  import l2
from sklearn.model_selection import train_test_split
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9059173648484755474
]


In [34]:
news = pd.read_csv('Data/news.csv')
print(news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3566 entries, 0 to 3565
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    3566 non-null   int64 
 1   authors       3566 non-null   object
 2   title         3566 non-null   object
 3   publish_date  3566 non-null   object
 4   description   3566 non-null   object
 5   text          3566 non-null   object
 6   url           3566 non-null   object
dtypes: int64(1), object(6)
memory usage: 195.1+ KB
None


In [35]:
tokens = pd.DataFrame(news.text.apply(lambda x: pd.Series(x.split())).stack(), columns=['token'])
tokens.head()

Unnamed: 0,Unnamed: 1,token
0,0,Canadian
0,1,pharmacies
0,2,are
0,3,limiting
0,4,how


In [36]:
tokens['term'] = tokens.token.replace('\W+', '', regex=True).str.lower()
tokens.head()

Unnamed: 0,Unnamed: 1,token,term
0,0,Canadian,canadian
0,1,pharmacies,pharmacies
0,2,are,are
0,3,limiting,limiting
0,4,how,how


In [37]:
term_count = tokens.groupby('term').agg({'term':'count'}).rename(columns={'term':'frequency'}).sort_values(by='frequency',ascending=False)
token_count = tokens.groupby('token').agg({'token':'count'}).rename(columns={'token':'frequency'}).sort_values(by='frequency',ascending=False)

In [38]:
term_count.head()

Unnamed: 0_level_0,frequency
term,Unnamed: 1_level_1
the,134554
to,85549
of,61793
and,59511
in,53590


In [39]:
term_count['term_id'] = range(term_count.shape[0])
token_count['token_id'] = range(token_count.shape[0])

In [40]:
term_count.head()

Unnamed: 0_level_0,frequency,term_id
term,Unnamed: 1_level_1,Unnamed: 2_level_1
the,134554,0
to,85549,1
of,61793,2
and,59511,3
in,53590,4


In [41]:
token_count.head()

Unnamed: 0_level_0,frequency,token_id
token,Unnamed: 1_level_1,Unnamed: 2_level_1
the,117060,0
to,84761,1
of,61476,2
and,57923,3
in,49952,4


In [42]:
def split_tokens(x):
    return(pd.Series(x.split()))

In [43]:
# term_series = news.text.str.lower().to_frame()

In [44]:
# term_series = [x.split() for x in term_series['text']]

In [45]:
# term_series = [[re.sub(r'\W+','',x)for x in y] for y in term_series]

In [46]:
# numerized = [[term_count.loc[x]['term_id'] for x in y] for y in term_series]

In [47]:
# str(numerized[0])[1:-1]

In [48]:
# file = open('Data/news_numerized.txt','w+')
# numerized_str = []
# for i in range(len(numerized)):
#     numerized_str.append(str(numerized[i])[1:-1]+'\n')
# file.writelines(numerized_str)
# file.close()

In [49]:
file = open('Data/news_numerized.txt','r+')
lines = file.readlines()
lines = pd.Series(lines).apply(lambda x: x.split(','))
file.close()

In [50]:
numerized = [[int(x.strip()) for x in y] for y in lines]

In [51]:
# numerized[0]

In [52]:
def create_sequence(x,length):
    i=0
    sequences = []
    while i+length < len(x):
        sequences.append(x[i:i+length])
        i+=1
    return np.matrix(sequences)
        

In [53]:
seq_len = 11
lens = [len(x) for x in numerized]
np.sum(np.less(lens,seq_len))

0

In [54]:
less = np.greater(lens,seq_len)
numerized2 = []
for i in range(len(lens)):
    if less[i]:
        numerized2.append(numerized[i])

In [55]:
len(numerized2)

3566

In [56]:
all_seqs = np.concatenate([create_sequence(x,seq_len) for x in numerized2],axis = 0)

In [57]:
all_seqs.shape

(2528033, 11)

In [58]:
train,test = train_test_split(all_seqs,train_size=60000) 

In [59]:
train_in = train[:,:seq_len-1]
train_out = train[:,1:]
print(train_in.shape)
print(train_out.shape)

test_in = test[:,:seq_len-1]
test_out = test[:,1:]
print(test_in.shape)
print(test_out.shape)

(60000, 10)
(60000, 10)
(2468033, 10)
(2468033, 10)


In [60]:
max_features = term_count.shape[0]
batch_size = 200
text_model = models.Sequential() 
text_model.add(layers.Embedding(max_features,256,batch_input_shape=[batch_size, None]))
text_model.add(layers.GRU(256,return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'))
text_model.add(layers.Dense(max_features))
text_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (200, None, 256)          10911744  
_________________________________________________________________
gru_1 (GRU)                  (200, None, 256)          393984    
_________________________________________________________________
dense_1 (Dense)              (200, None, 42624)        10954368  
Total params: 22,260,096
Trainable params: 22,260,096
Non-trainable params: 0
_________________________________________________________________


In [61]:
batch_check_input = all_seqs[:batch_size,:seq_len-1]
batch_check_output = all_seqs[:batch_size,1:]
print(batch_check_input.shape)
print(batch_check_output.shape)

(200, 10)
(200, 10)


In [62]:
t = text_model(batch_check_input)

In [63]:
# sampled_indices = tf.random.categorical(t[0], num_samples=1)
# sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
# sampled_indices

In [64]:
def loss(labels,logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,logits,from_logits=True)

In [65]:
text_model.compile(optimizer='rmsprop', loss=loss)

In [66]:
text_model.load_weights('text_model2.h5')
text_model.reset_states()

In [68]:
history = text_model.fit(train_in, train_out,
                    epochs=10,
                    batch_size=batch_size)

SyntaxError: invalid syntax (<ipython-input-68-62f91e8c054f>, line 3)

In [69]:
text_model.save('text_model3.h5')

In [None]:
print(term_count.loc['trump'])
print(term_count.loc['has'])

In [None]:
in1 = np.ones((1,1))
in2 = np.concatenate([np.ones((200,1))*591,np.ones((200,1))*23,np.zeros((200,1))],axis=1)
in2[0]

In [None]:
# out = text_model(np.matrix([0]))
# preds = tf.squeeze(out,0)
# predicted_id = tf.random.categorical(preds, num_samples=1)[-1,0].numpy()
# term_count.index[predicted_id]

In [71]:
max_features = term_count.shape[0]
batch_size = 1
text_model = models.Sequential() 
text_model.add(layers.Embedding(max_features,256,batch_input_shape=[batch_size, None]))
text_model.add(layers.GRU(256,return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'))
text_model.add(layers.Dense(max_features))
text_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 256)            10911744  
_________________________________________________________________
gru_2 (GRU)                  (1, None, 256)            393984    
_________________________________________________________________
dense_2 (Dense)              (1, None, 42624)          10954368  
Total params: 22,260,096
Trainable params: 22,260,096
Non-trainable params: 0
_________________________________________________________________


In [72]:
def loss(labels,logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels,logits,from_logits=True)

In [73]:
text_model.compile(optimizer='rmsprop', loss=loss)

In [74]:
text_model.load_weights('text_model3.h5')
text_model.reset_states()

In [76]:
text_model.reset_states()
text = ['the']
for i in range(200):
    out = text_model(np.matrix([0]))
    preds = tf.squeeze(out,0)
    predicted_id = tf.random.categorical(preds, num_samples=1)[-1,0].numpy()
    text.append(term_count.index[predicted_id])
print(' '.join(text))

the last executive outbreak official latestcoronavirus jacks sneezes virus coming journal first disease virus calling downtown medicine us brought coast crews bringing recovering described 90s clorox downtown second court kids affected massive risk 55 federal man plant chartered province province today known pass virus 915 coronavirus government virus coast disease primary world chemical middle advice covid19 flu payroll markets first outbreak suspension activities names outbreak company viruss problem risk covid19 secondlargest spreading virus disease cbc spread new epidemic next cities fundraising common outbreak internal most justice continuing risk first fees free aramco severe notice amount peple spread arent causing city canadian virus country middle provinces spread japan purposes director territory company effects spread feeds december time drivers rebound infected country international needs ontario intended funding return outbreak province spread tedros depend virus employee 

In [None]:
sampled_indices = tf.random.categorical(out[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [None]:
# tf.math.argmax(out,axis=2)

In [None]:
for i in range(out.shape[0]):
    si = tf.random.categorical(out[0], num_samples=1)
    words = tf.squeeze(si,axis=-1).numpy()
    print('trump has the {0} {1} {2} \n'.format(term_count.index[words[0]],term_count.index[words[1]],term_count.index[words[2]]))

In [None]:
term_count.index[14229]