<a href="https://colab.research.google.com/github/Prathamesh-Chavan-98/-Next-Word-Prediction-Using-LSTM-on-News-Headlines/blob/main/Next_Word_Prediction_Using_LSTM_on_News_Headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries;
# each one is downloaded and unpacked into /kaggle/input/<directory>.
# NOTE(review): this is a signed URL whose query string carries
# X-Goog-Date=20240611T103807Z and X-Goog-Expires=259200, so it has presumably
# expired - re-export the notebook from Kaggle to obtain a fresh link.
DATA_SOURCE_MAPPING = 'us-crime-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2832232%2F4884532%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240611%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240611T103807Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D132d23869fe838e9ec8f57d66ebe2c6dfe99caf2d1f5e7a82424f01f6b666e1a3b5e8b3507f9327750f314037861d29d9456a795163ebf175dfa9f848e067f754ea7f05fadf6ecd630c66ca0a768374b01538b742b54a565c67fc66c3b484dac55ca8119acb22f3ef3890b6b064b4f343404df30600bbb8023587609be36a105e68346a3816a4ead645dfda63bf1fca30080d2e7853ebecd554bff6af350a0ffe8da8e50db90483503f934a13950f23b42ab6dcbbaa136ca97b4c796ee740eedc3b92c3f27b6a7fad2899c9fdadf7f0ae32e2e3c46818119c5b94521d3a9a32604ba2dd2a635e5837872d511a7a9396ede8e6bbcbf9c190e30be9f9c92c15f8e'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download each "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is percent-encoded.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; when it is missing the size is
            # unknown and the progress bar is skipped instead of crashing on
            # int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            # `chunk` (not `data`) so the notebook-level `data` name is not
            # clobbered by raw bytes.
            chunk = fileres.read(CHUNK_SIZE)
            while chunk:
                dl += len(chunk)
                tfile.write(chunk)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    bar = f"[{'=' * done}{' ' * (50 - done)}] "
                else:
                    bar = ''
                sys.stdout.write(f"\r{bar}{dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` would
                # replace the module and break the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path} ({e})')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re


In [None]:
# Load the CSV from the us-crime-data input directory, then display
# 10 randomly chosen rows as a quick sanity check.
data = pd.read_csv('/kaggle/input/us-crime-data/US_Crime_Data.csv')
data.sample(10)

In [None]:
# Number of missing values in each column.
data.isna().sum()

## For this model we just need headlines

In [None]:
# Keep only the headline column, discard rows that have no title and renumber
# the remaining rows from 0; then preview the first rows.
df = data[['Title']].dropna().reset_index(drop=True)
df.head()

In [None]:
# Show one example headline (index label 123 after the reset above).
df['Title'][123]

In [None]:
# (rows, columns) of the headline-only frame.
df.shape

# Removing Punctuation

In [None]:
def clean_text(df):
    """Return the lower-cased, punctuation-free headlines of *df*.

    Args:
        df: DataFrame with a string column named ``'Title'``.

    Returns:
        A Series aligned with ``df['Title']`` in which every title is
        lower-cased and every character of ``string.punctuation`` has been
        removed. *df* itself is not modified.
    """
    # regex=True must be explicit: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would leave all
    # punctuation in place. re.escape keeps ``]``, ``\`` and ``^`` from being
    # read as character-class syntax.
    punctuation_pattern = '[{}]'.format(re.escape(string.punctuation))
    return df['Title'].str.lower().str.replace(punctuation_pattern, '', regex=True)

In [None]:
# Cleaned headlines: one string per row of df (a Series, not word-level tokens).
tokens = clean_text(df)

In [None]:
# Display the cleaned headlines.
tokens

In [None]:
# Number of distinct cleaned headlines (whole strings, not distinct words).
len(set(tokens))

# Tokenizing

In [None]:
# Build the word -> integer-id vocabulary from the cleaned headlines, then
# encode every headline as a list of those ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tokens)
seq = tokenizer.texts_to_sequences(tokens)

In [None]:
# First 10 encoded headlines.
seq[:10]

# Creating input and output data lists

In [None]:
# Expand every encoded headline into (prefix, next-word) training pairs:
# a headline [a, b, c] yields ([a], b) and ([a, b], c).
x, y = [], []
total_words_drop = 0
for headline_ids in seq:
    if len(headline_ids) <= 1:
        # Too short to form a (prefix, next-word) pair; count it as dropped.
        total_words_drop += 1
        continue
    for split_at in range(1, len(headline_ids)):
        x.append(headline_ids[:split_at])
        y.append(headline_ids[split_at])
print('Total Words Dropped : {}'.format(total_words_drop))

In [None]:
# First 10 target word ids.
y[: 10]

# Padding sequences

In [None]:
# Turn the variable-length prefixes into one rectangular integer array.
# With the Keras defaults, zeros are added in front up to the longest prefix.
x = tf.keras.preprocessing.sequence.pad_sequences(x)

In [None]:
# (number of training pairs, padded prefix length)
x.shape

# One-hot encoding y (one row per sample in x)

In [None]:
# One-hot encode the target word ids. num_classes is given explicitly so the
# width always equals the vocabulary size (+1 for padding id 0); when inferred,
# it is max(y) + 1, which is too narrow for the model's output layer if the
# highest word id never occurs as a target.
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

In [None]:
# (number of training pairs, number of one-hot classes)
y.shape

# Vocab Size : total no. of unique words, plus 1 for the padding index 0

In [None]:
# Tokenizer word ids start at 1, so one extra slot is added for id 0,
# which is the value used for padding.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

# LSTM Model

In [None]:
# Next-word classifier: word ids -> 49-dimensional embeddings -> two stacked
# LSTM layers -> dense ReLU layer -> softmax over the whole vocabulary.
# The first LSTM returns its full output sequence so the second LSTM can
# consume it; the second returns only its final state.
model = tf.keras.Sequential([tf.keras.layers.Embedding(vocab_size,49 ),
                            tf.keras.layers.LSTM(100, return_sequences = True),
                            tf.keras.layers.LSTM(100),
                            tf.keras.layers.Dense(100, activation = 'relu'),
                            tf.keras.layers.Dense(vocab_size, activation = 'softmax')])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# categorical_crossentropy pairs with the one-hot encoded targets in y.
model.compile(loss  = 'categorical_crossentropy',
             optimizer = 'adam',
             metrics = ['accuracy'],
             )

In [None]:
# Train for at most 100 epochs, stopping once the validation loss has not
# improved for 5 epochs and rolling back to the best weights.
# validation_split holds out the last 10% of the samples so that the monitored
# 'val_loss' metric actually exists; without validation data Keras only warns
# that the metric is unavailable and early stopping never triggers.
history = model.fit(
    x, y,
    epochs=100,
    batch_size=256,
    validation_split=0.1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
        ),
    ],
)

# Saving model

In [None]:
# Persist the trained model to model.h5 in the current working directory.
model.save('model.h5')

# Vocab Array : list of all the unique words

In [None]:
# Words in word_index order, so position k holds the word whose tokenizer id
# is k + 1 (ids start at 1).
vocab_array = np.array(list(tokenizer.word_index))
vocab_array

# Final Function for Predictions

In [None]:
def make_predictions(text, n_words, max_len=49):
    """Extend *text* with *n_words* words predicted one at a time.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``vocab_array``.

    Args:
        text: Seed text; it is re-tokenized before every prediction.
        n_words: Number of words to append. Zero or less returns *text* as is.
        max_len: Length the encoded text is padded/truncated to before it is
            fed to the model. NOTE(review): this should match the padded
            length used for training (``x.shape[1]``) - confirm that 49 does.

    Returns:
        The seed text followed by the predicted words, separated by spaces.
    """
    for _ in range(n_words):
        token_ids = tokenizer.texts_to_sequences([text])
        padded = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=max_len)
        probabilities = model.predict(padded)[0]
        # Class 0 is the padding id and has no word. Skipping it also lines the
        # argmax up with vocab_array, whose position k holds word id k + 1;
        # previously a predicted 0 indexed vocab_array[-1] and appended an
        # unrelated word.
        best_word_position = int(np.argmax(probabilities[1:]))
        text += " " + str(vocab_array[best_word_position])
    return text

# Testing Model

In [None]:
# Extend the seed "california" by 5 predicted words.
make_predictions('california',5)

In [None]:
# Extend the seed "new york" by 8 predicted words.
make_predictions('new york',8)

In [None]:
# Extend the seed "highway" by 8 predicted words.
make_predictions('highway',8)

> It's so fun to get predictions!

# Thanks!