In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from tensorflow import keras
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.layers import Embedding
# Never truncate long text columns when displaying frames. None is the
# supported "no limit" value; the old -1 sentinel was deprecated in
# pandas 1.0 and is no longer accepted by later releases.
pd.set_option('display.max_colwidth', None)

<h1> Assessing Data Quality </h1>

In [None]:
# Read the line-delimited JSON dump, then discard the metadata columns
# (authors, link, date) before previewing the first rows.
dataset = pd.read_json('News_Category_Dataset_v2.json', lines=True)
dataset.drop(columns=['authors', 'link', 'date'], inplace=True)
dataset.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
dataset.info()

In [None]:
# Per-column summary statistics of the loaded frame.
dataset.describe()

In [None]:
# Number of distinct categories, followed by the row count per category.
# The format string needs a "{}" placeholder, otherwise the count passed to
# .format() is silently dropped from the printed message.
print("Unique Categories : {}".format(dataset['category'].nunique()))
dataset['category'].value_counts()

In [None]:
# Pie chart of each category's share of the rows.
raw_counts = dataset['category'].value_counts()
fig = plt.figure(figsize=(20, 20))
plt.pie(raw_counts.values, labels=raw_counts.index, autopct='%1.1f%%')

<h3> Grouping the Categories</h3>

In [None]:
categories = dataset['category'].value_counts().index

def groups(grouplist, name):
    """Relabel every row whose category is in ``grouplist`` as ``name``.

    Mutates the module-level ``dataset`` frame in place; nothing is returned.

    Parameters
    ----------
    grouplist : list of str
        Category labels to fold together.
    name : str
        Label written to the ``category`` column of the matching rows.
    """
    # One vectorised membership test instead of a separate full-column
    # comparison for every known category.
    dataset.loc[dataset['category'].isin(grouplist), 'category'] = name

In [None]:
# Coarse label -> the original categories folded into it.
# The groups are applied in the order they are listed here.
category_groups = {
    'LIFESTYLE AND WELLNESS': ['WELLNESS', 'HEALTHY LIVING', 'HOME & LIVING', 'STYLE & BEAUTY', 'STYLE'],
    'PARENTING AND EDUCATION': ['PARENTING', 'PARENTS', 'EDUCATION', 'COLLEGE'],
    'SPORTS AND ENTERTAINMENT': ['SPORTS', 'ENTERTAINMENT', 'COMEDY', 'WEIRD NEWS', 'ARTS'],
    'TRAVEL-TOURISM & ART-CULTURE': ['TRAVEL', 'ARTS & CULTURE', 'CULTURE & ARTS', 'FOOD & DRINK', 'TASTE'],
    'EMPOWERED VOICES': ['WOMEN', 'QUEER VOICES', 'LATINO VOICES', 'BLACK VOICES'],
    'BUSINESS-MONEY': ['BUSINESS', 'MONEY'],
    'WORLDNEWS': ['THE WORLDPOST', 'WORLDPOST', 'WORLD NEWS'],
    'ENVIRONMENT': ['ENVIRONMENT', 'GREEN'],
    'SCIENCE AND TECH': ['TECH', 'SCIENCE'],
    'GENERAL': ['FIFTY', 'IMPACT', 'GOOD NEWS', 'CRIME'],
    'MISC': ['WEDDINGS', 'DIVORCE', 'RELIGION', 'MEDIA'],
}

for group_name, members in category_groups.items():
    groups(grouplist=members, name=group_name)

In [None]:
# Number of categories left after grouping, followed by the row count per
# category. The "{}" placeholder is required for .format() to insert the
# count into the message.
print("The revised Categories are : {}".format(dataset['category'].nunique()))
dataset['category'].value_counts()

In [None]:
# Pie chart of each category's share of the rows, as currently labelled.
grouped_counts = dataset['category'].value_counts()
fig = plt.figure(figsize=(20, 20))
plt.pie(grouped_counts.values, labels=grouped_counts.index, autopct='%1.1f%%')

<h3> Removing empty values and duplicates </h3>

In [None]:
df = dataset.copy() # the cleaning steps below modify this copy, not `dataset`

In [None]:
df.duplicated().sum()  # number of rows that exactly repeat an earlier row

In [None]:
# Remove fully identical rows in place, keeping the last occurrence of each.
df.drop_duplicates(keep='last', inplace=True)

In [None]:
# Rows whose (short_description, headline) pair repeats an earlier row,
# regardless of the category they carry.
df.duplicated(subset=['short_description', 'headline']).sum()

In [None]:
# Remove rows with a repeated (short_description, headline) pair in place,
# keeping the last occurrence of each pair.
df.drop_duplicates(subset=['short_description', 'headline'], keep='last', inplace=True)

In [None]:
# How many rows have an empty-string headline?
empty_headline_rows = df[df['headline'] == ""]
print(len(empty_headline_rows))

In [None]:
# Turn empty-string headlines into NaN so dropna can remove those rows,
# then confirm that none are left.
headline_is_empty = df['headline'] == ""
df.loc[headline_is_empty, 'headline'] = np.nan
df.dropna(subset=['headline'], inplace=True)
remaining_empty_headlines = df[df['headline'] == ""]
print(len(remaining_empty_headlines))

In [None]:
# How many rows have an empty-string short description?
empty_description_rows = df[df['short_description'] == ""]
print(len(empty_description_rows))

In [None]:
# Turn empty-string descriptions into NaN so dropna can remove those rows,
# then confirm that none are left.
description_is_empty = df['short_description'] == ""
df.loc[description_is_empty, 'short_description'] = np.nan
df.dropna(subset=['short_description'], inplace=True)
remaining_empty_descriptions = df[df['short_description'] == ""]
print(len(remaining_empty_descriptions))

<h3> Data Tidying </h3>

In [None]:
from sklearn.utils import shuffle

# Shuffle with a fixed seed: the later train/validation/test split depends on
# the row order, so an unseeded shuffle would give different splits (and
# different results) on every run.
df = shuffle(df, random_state=42)
df.reset_index(inplace=True, drop=True)

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Join headline and short description into a single text field, then drop
# the two source columns.
df['desc'] = df['headline'].astype(str) + "-" + df['short_description'].astype(str)
df.drop(columns=['headline', 'short_description'], inplace=True)
# astype returns a new frame; without assigning it back the conversion was
# silently discarded.
df = df.astype(str)
df.head()

<h2> Tokenizing and Padding </h2>

In [None]:
X, Y = df['desc'], df['category']

# Two-stage split of the data:
#   Train data      : 80%
#   Validation data : 10%
#   Test data       : 10%
# First hold out 20% of the rows, then cut that hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, Y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=42)

In [None]:
# Tokenizer / padding settings used by the cells below.
vocab_size = 20000       # passed to the Tokenizer as num_words
max_length = 150         # every sequence is padded or cut to this length
trunc_type = 'post'      # cut over-long sequences at the end
padding_type = 'post'    # pad short sequences at the end
oov_tok = "<OOV>"        # token used for out-of-vocabulary words

In [None]:
# Fit the vocabulary on the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

# Fix one label order (taken from the training labels) and reuse it for every
# split. Calling pd.get_dummies independently per split derives the columns
# from whatever labels that split happens to contain, so a class missing from
# one split would shift its column indices relative to the others.
label_order = sorted(set(y_train))


def _one_hot(labels):
    """One-hot encode ``labels`` with one column per entry of ``label_order``."""
    return pd.get_dummies(pd.Categorical(np.asarray(labels), categories=label_order))


X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding=padding_type, truncating=trunc_type)
y_train = _one_hot(y_train)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=max_length, padding=padding_type, truncating=trunc_type)
y_val = _one_hot(y_val)

train_set = np.array(X_train)
val_set = np.array(X_val)

train_label = np.array(y_train)
val_label = np.array(y_val)

# Test labels are kept as integer class indices (positions in label_order),
# which is the form compared against the argmax of the model output.
# A label that never occurred in training gets the code -1.
y_test = pd.Categorical(np.asarray(y_test), categories=label_order).codes.astype(np.int64)

print(train_set.shape)
print(train_label.shape)
print(val_set.shape)
print(val_label.shape)


<h2> Embedding Matrix for our Model </h2>

In [None]:
# Size of the embedding table: every word the tokenizer knows plus two
# extra rows.
num_tokens = len(tokenizer.word_index) + 2
embedding_dim = 100
hits = 0
misses = 0

# Parse the GloVe text file: each line is a word followed by its
# space-separated vector components.
embeddings_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as glove_file:
    for row in glove_file:
        token, vector_text = row.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))


<h6> Preparing Embedding Matrix </h6>

In [None]:
# Reset the counters here so that re-running this cell reports the totals
# for this run only instead of adding to a previous run's counts.
hits = 0
misses = 0

# Row i holds the pre-trained vector of the word with tokenizer index i;
# words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

<h2> Training our Model </h2>

In [None]:
# Stop training once val_loss has failed to improve by at least 1e-4 for
# three consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
tf.keras.backend.clear_session()
embed_size = 100
# Size the output layer from the one-hot label matrix rather than a
# hard-coded 12, so it always matches the number of label columns.
num_classes = train_label.shape[1]
model = keras.models.Sequential([
    # Frozen embedding layer initialised from the pre-built embedding matrix;
    # index 0 is treated as padding and masked.
    Embedding(num_tokens, embedding_dim, embeddings_initializer=keras.initializers.Constant(embedding_matrix), mask_zero=True, input_shape=[None], trainable=False),
    keras.layers.Bidirectional(keras.layers.LSTM(256, dropout=0.4)),
    keras.layers.Dense(num_classes, activation="softmax")
])

model.summary()

In [None]:
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])

# Model.fit documents `callbacks` as a list of Callback instances, so the
# single early-stopping callback is wrapped in a list.
history = model.fit(
    train_set,
    train_label,
    batch_size=32,
    steps_per_epoch=len(X_train) // 32,
    validation_data=(val_set, val_label),
    validation_steps=len(val_set) // 32,
    epochs=20,
    callbacks=[early_stop],
)

<h2> Evaluating and Making Predictions </h2>

In [None]:
# Category names in sorted order. pd.get_dummies lays out its one-hot columns
# in sorted label order, so classes[k] names the k-th model output (assuming
# every category survived the cleaning steps). The previous frequency
# ordering from value_counts() did not line up with the output indices.
classes = dataset['category'].value_counts().sort_index().index

def prediction(inference_data) :
    """Return the predicted class index for each input text.

    Parameters
    ----------
    inference_data : str or iterable of str
        Raw text(s) to classify. A single string is treated as one document
        rather than being iterated character by character.

    Returns
    -------
    numpy.ndarray
        One integer class index per input text (argmax over the model's
        output for that text).
    """
    if isinstance(inference_data, str):
        inference_data = [inference_data]
    # Same tokenizer and padding settings as used for the training data.
    X = tokenizer.texts_to_sequences(inference_data)
    X = pad_sequences(X, maxlen = max_length, padding = padding_type, truncating = trunc_type)
    pred = model.predict(X)
    pred_value = tf.argmax(pred, axis = 1).numpy()
    return pred_value

In [None]:
# Score the model on the test texts: print the per-class report first,
# then the confusion matrix.
y_pred = prediction(X_test)
test_report = classification_report(np.asarray(y_test), np.asarray(y_pred))
print(test_report)

test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)