In [170]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [171]:
import pandas as pd
import numpy as np
import os
path=os.getcwd()

In [172]:
headlines=[]
for filename in os.listdir(path+"/drive/MyDrive"):
  if "Articles" in filename:
    article_df=pd.read_csv(path+'/drive/MyDrive/' +filename)
    headlines.extend(list(article_df['headline'].values))

In [173]:
len(headlines)

8699

In [174]:
headlines=[h for h in headlines if h!="Unknown"]
print("The number of headline is:",len(headlines))

The number of headline is: 8027


In [175]:
import string
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore',category=FutureWarning)

In [176]:
def clean_text(headline):
  text="".join(word for word in headline if word not in string.punctuation).lower()
  text=text.encode("utf8").decode("ascii","ignore")
  return text

corpus=[clean_text(headline) for headline in headlines]

In [177]:
corpus

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted',
 'the caged beast awakens',
 'an everunfolding story',
 'oreilly thrives as settlements add up',
 'mouse infestation',
 'divide in gop now threatens trump tax plan',
 'variety puzzle acrostic',
 'they can hit a ball 400 feet but play catch thats tricky',
 'in trump country shock at trump budget cuts',
 'why is this hate different from all other hate',
 'pick your favorite ethical offender',
 'my sons growing black pride',
 'jerks and the startups they ruin',
 'trump  needs  a brain',
 'manhood in the age of trump',
 'the value of a black college',
 'initial description',
 'rough estimates',
 'el pasatiempo nacional',
 'cooling off on a hot day at yankee stadium',
 'trumps staff mixed politics and paydays',
 'a virtuoso rebuilding act requires everyone in tune',
 'homeland seaso

In [178]:
vocab=[]
for line in corpus:
  words=line.split()
  for word in words:
    vocab.append(word)

vocabulary=set(vocab)

In [179]:
len(vocabulary)

10778

In [180]:
!pip install keras_preprocessing



In [181]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

tokenizer=Tokenizer()
tokenizer.fit_on_texts(corpus)
word2index=tokenizer.word_index
len(word2index)

10778

In [182]:
word2index

{'the': 1,
 'a': 2,
 'to': 3,
 'of': 4,
 'in': 5,
 'for': 6,
 'and': 7,
 'on': 8,
 'is': 9,
 'trump': 10,
 'with': 11,
 'new': 12,
 'at': 13,
 'how': 14,
 'what': 15,
 'you': 16,
 'an': 17,
 'from': 18,
 'trumps': 19,
 'as': 20,
 'it': 21,
 'its': 22,
 'are': 23,
 'your': 24,
 'not': 25,
 'be': 26,
 'us': 27,
 'season': 28,
 'that': 29,
 'by': 30,
 'but': 31,
 'about': 32,
 'can': 33,
 'episode': 34,
 'do': 35,
 'york': 36,
 'when': 37,
 'up': 38,
 'this': 39,
 'why': 40,
 'over': 41,
 'no': 42,
 'i': 43,
 'out': 44,
 'more': 45,
 'my': 46,
 'after': 47,
 'will': 48,
 'may': 49,
 'health': 50,
 'we': 51,
 'or': 52,
 'war': 53,
 'questions': 54,
 'his': 55,
 'who': 56,
 'today': 57,
 'now': 58,
 'president': 59,
 'house': 60,
 'should': 61,
 'teaching': 62,
 'have': 63,
 'get': 64,
 'one': 65,
 'was': 66,
 'into': 67,
 'all': 68,
 'dont': 69,
 'home': 70,
 'good': 71,
 'plan': 72,
 'life': 73,
 'our': 74,
 'gop': 75,
 'has': 76,
 'says': 77,
 'like': 78,
 'first': 79,
 'white': 80,
 'tr

In [183]:
dictionary={}
rev_dictionary={}
for word,idx in word2index.items():
  dictionary[word]=idx
  rev_dictionary[idx]=word

In [184]:
input_sequences=tokenizer.texts_to_sequences(corpus)

In [185]:
input_sequences

[[354, 17, 4909, 515, 4, 2, 2340, 126, 5, 1826],
 [7, 58, 1, 4910, 10, 4911],
 [3181, 4912, 67, 3182],
 [2341, 4913, 1827, 605, 405],
 [261, 355, 6, 2342],
 [1, 4914, 2343, 2344],
 [17, 4915, 187],
 [1267, 3183, 20, 3184, 743, 38],
 [4916, 4917],
 [841, 5, 75, 58, 1828, 10, 132, 72],
 [120, 356, 326],
 [104, 33, 842, 2, 2345, 4918, 1490, 31, 302, 2346, 481, 1829],
 [5, 10, 382, 1073, 13, 10, 303, 357],
 [40, 9, 39, 437, 606, 18, 68, 272, 437],
 [482, 24, 383, 2347, 1830],
 [46, 1268, 941, 105, 1269],
 [4919, 7, 1, 3185, 104, 1270],
 [10, 358, 2, 555],
 [3186, 5, 1, 238, 4, 10],
 [1, 1074, 4, 2, 105, 221],
 [4920, 3187],
 [1491, 3188],
 [1492, 4921, 4922],
 [3189, 135, 8, 2, 556, 127, 13, 4923, 3190],
 [19, 942, 1831, 203, 7, 3191],
 [2, 4924, 3192, 327, 4925, 673, 5, 843],
 [516, 28, 123, 34, 943, 9, 1832, 106, 2, 1493, 1833],
 [89, 164, 944, 7, 1, 239, 4, 1834],
 [2348, 2, 4926],
 [483, 1075, 7, 1271, 34, 149, 261, 1, 945],
 [359, 28, 90, 34, 222, 3193, 9, 71, 1835, 37, 22, 25],
 [124

In [186]:
input_data=[]
target=[]
for line in input_sequences:
  for i in range(1,len(line)-1):
    input_data.append(line[:i])
    target.append(line[i])

In [187]:
input_data

[[354],
 [354, 17],
 [354, 17, 4909],
 [354, 17, 4909, 515],
 [354, 17, 4909, 515, 4],
 [354, 17, 4909, 515, 4, 2],
 [354, 17, 4909, 515, 4, 2, 2340],
 [354, 17, 4909, 515, 4, 2, 2340, 126],
 [7],
 [7, 58],
 [7, 58, 1],
 [7, 58, 1, 4910],
 [3181],
 [3181, 4912],
 [2341],
 [2341, 4913],
 [2341, 4913, 1827],
 [261],
 [261, 355],
 [1],
 [1, 4914],
 [17],
 [1267],
 [1267, 3183],
 [1267, 3183, 20],
 [1267, 3183, 20, 3184],
 [841],
 [841, 5],
 [841, 5, 75],
 [841, 5, 75, 58],
 [841, 5, 75, 58, 1828],
 [841, 5, 75, 58, 1828, 10],
 [120],
 [104],
 [104, 33],
 [104, 33, 842],
 [104, 33, 842, 2],
 [104, 33, 842, 2, 2345],
 [104, 33, 842, 2, 2345, 4918],
 [104, 33, 842, 2, 2345, 4918, 1490],
 [104, 33, 842, 2, 2345, 4918, 1490, 31],
 [104, 33, 842, 2, 2345, 4918, 1490, 31, 302],
 [104, 33, 842, 2, 2345, 4918, 1490, 31, 302, 2346],
 [5],
 [5, 10],
 [5, 10, 382],
 [5, 10, 382, 1073],
 [5, 10, 382, 1073, 13],
 [5, 10, 382, 1073, 13, 10],
 [40],
 [40, 9],
 [40, 9, 39],
 [40, 9, 39, 437],
 [40, 9, 39,

In [188]:
input_data[1]

[354, 17]

In [189]:
target[1]

4909

In [190]:
len(input_data)

40113

In [191]:
MAX_LEN=-0
for seq in input_data:
  if len(seq)>MAX_LEN:
    MAX_LEN=len(seq)

MAX_LEN

22

In [192]:
input_data=pad_sequences(input_data,maxlen=MAX_LEN,padding='post')

In [193]:
input_data

array([[ 354,    0,    0, ...,    0,    0,    0],
       [ 354,   17,    0, ...,    0,    0,    0],
       [ 354,   17, 4909, ...,    0,    0,    0],
       ...,
       [1668,    4,    2, ...,    0,    0,    0],
       [  17,    0,    0, ...,    0,    0,    0],
       [  17, 4867,    0, ...,    0,    0,    0]], dtype=int32)

In [194]:
input_data.shape

(40113, 22)

In [195]:
from keras.utils import to_categorical

In [196]:
target=to_categorical(target)

In [197]:
target.shape

(40113, 10779)

In [198]:
target

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [199]:
from keras.models import Sequential
from keras.layers import Embedding,GRU,Dense,Dropout,LSTM
from keras.callbacks import EarlyStopping

In [200]:
model=Sequential()

In [201]:
model.add(Embedding(input_dim=len(vocabulary),output_dim=100,input_length=MAX_LEN))
model.add(LSTM(units=100))

model.add(Dense(units=target.shape[1],activation="softmax"))

In [202]:
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=['accuracy'])

In [203]:
model.fit(input_data,target,batch_size=5,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7bb93e369270>

In [204]:
new_sent='We need'

In [205]:
corpus=[clean_text(headline) for headline in new_sent]
corpus

['w', 'e', ' ', 'n', 'e', 'e', 'd']

In [206]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(corpus)
word2index=tokenizer.word_index
len(word2index)

4

In [207]:
input_sequences=tokenizer.texts_to_sequences(corpus)
input_sequences

[[2], [1], [], [3], [1], [1], [4]]

In [208]:
input_data=pad_sequences(input_sequences,maxlen=MAX_LEN,padding='post')
input_data

array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)

In [209]:
result=model.predict(input_data)
result



array([[1.2593668e-11, 4.2042825e-06, 5.3603839e-05, ..., 1.4011384e-04,
        4.2155442e-08, 6.1024180e-10],
       [2.0417442e-11, 2.9606765e-06, 1.6049686e-05, ..., 3.7305388e-05,
        2.5761404e-09, 2.1786470e-11],
       [1.6107909e-12, 2.6371719e-03, 2.7416495e-03, ..., 1.0866232e-07,
        4.3164105e-06, 1.1069592e-06],
       ...,
       [2.0417442e-11, 2.9606765e-06, 1.6049686e-05, ..., 3.7305388e-05,
        2.5761404e-09, 2.1786470e-11],
       [2.0417442e-11, 2.9606765e-06, 1.6049686e-05, ..., 3.7305388e-05,
        2.5761404e-09, 2.1786470e-11],
       [5.2899195e-11, 8.8500418e-03, 1.4767185e-01, ..., 4.4910749e-08,
        7.8824314e-07, 9.1060183e-06]], dtype=float32)

In [210]:
import numpy as np
answer=np.argmax(input_data)

In [211]:
rev_dictionary[answer]

'tax'