In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [117]:
from nltk.corpus import stopwords


##### Data load and cleaning

In [2]:
df = pd.read_csv('../Data/archive/NewsCategorizer.csv')

In [3]:
df.head()

Unnamed: 0,category,headline,links,short_description,keywords
0,WELLNESS,143 Miles in 35 Days: Lessons Learned,https://www.huffingtonpost.com/entry/running-l...,Resting is part of training. I've confirmed wh...,running-lessons
1,WELLNESS,Talking to Yourself: Crazy or Crazy Helpful?,https://www.huffingtonpost.com/entry/talking-t...,Think of talking to yourself as a tool to coac...,talking-to-yourself-crazy
2,WELLNESS,Crenezumab: Trial Will Gauge Whether Alzheimer...,https://www.huffingtonpost.com/entry/crenezuma...,The clock is ticking for the United States to ...,crenezumab-alzheimers-disease-drug
3,WELLNESS,"Oh, What a Difference She Made",https://www.huffingtonpost.com/entry/meaningfu...,"If you want to be busy, keep trying to be perf...",meaningful-life
4,WELLNESS,Green Superfoods,https://www.huffingtonpost.com/entry/green-sup...,"First, the bad news: Soda bread, corned beef a...",green-superfoods


In [4]:
len(df['category'].unique())

10

In [5]:
Y=df[['category']]

In [6]:
Y.value_counts()

category      
BUSINESS          5000
ENTERTAINMENT     5000
FOOD & DRINK      5000
PARENTING         5000
POLITICS          5000
SPORTS            5000
STYLE & BEAUTY    5000
TRAVEL            5000
WELLNESS          5000
WORLD NEWS        5000
dtype: int64

In [7]:
X=df.loc[:,['headline','short_description','keywords']]

In [8]:
X_headline=df[['headline']]
X_description=df[['short_description']]
X_keywords=df[['keywords']]

In [9]:
X_headline.head(1)

Unnamed: 0,headline
0,143 Miles in 35 Days: Lessons Learned


In [10]:
X_description.iloc[0]

short_description    Resting is part of training. I've confirmed wh...
Name: 0, dtype: object

In [11]:
X_keywords.iloc[100]

keywords    shift-perspective
Name: 100, dtype: object

#### use only headline to detect News Category

##### clean and preprocessing the text

In [118]:
import re
from nltk.stem import WordNetLemmatizer

In [45]:
def cleanText(corpus):
    finalCorpus=[]
    lemmatizer = WordNetLemmatizer()
    sWords=list(set(stopwords.words('english')))
    for sen in corpus:
        word=[]
        sen=re.sub(r'[^a-zA-Z\s]', "", sen)
        wordList=sen.split()
        for w in wordList:
            if w not in sWords:
                word.append(lemmatizer.lemmatize(w.lower()))
        finalCorpus.append(" ".join(word))
    return finalCorpus
                
        

In [46]:
cleanedHeadText=cleanText(list(X_headline['headline']))

In [47]:
len(cleanedHeadText)

50000

##### create the vector and deepLearning model

In [49]:
tokenizer=tf.keras.preprocessing.text.Tokenizer(oov_token="OOV")

In [50]:
tokenizer.fit_on_texts(cleanedHeadText)

In [51]:
cleanedHeadSequence=tokenizer.texts_to_sequences(cleanedHeadText)

In [57]:
cleanedHeadPadSequence=tf.keras.preprocessing.sequence.pad_sequences(cleanedHeadSequence,padding="post")

In [58]:
cleanedHeadPadSequence

array([[ 1467,    23,   256, ...,     0,     0,     0],
       [  821,   716,   749, ...,     0,     0,     0],
       [16473,  1126,    42, ...,     0,     0,     0],
       ...,
       [12301, 12302,  7458, ...,     0,     0,     0],
       [   27,  3067,  8096, ...,     0,     0,     0],
       [ 2169,  2694,   369, ...,     0,     0,     0]])

In [70]:
output=LabelEncoder().fit_transform(Y.values.flatten())

In [74]:
Y['category'].unique()

array(['WELLNESS', 'POLITICS', 'ENTERTAINMENT', 'TRAVEL',
       'STYLE & BEAUTY', 'PARENTING', 'FOOD & DRINK', 'WORLD NEWS',
       'BUSINESS', 'SPORTS'], dtype=object)

In [71]:
output[1]

8

In [72]:
train_data, test_data, train_labels, test_labels = train_test_split(cleanedHeadPadSequence,output, test_size=0.2, random_state=0)

In [76]:
vSize=len(tokenizer.word_index)+1

In [79]:
inputL=len(cleanedHeadPadSequence[0])

In [105]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vSize, output_dim=50, input_length=40),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [106]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [107]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [108]:
model.fit(train_data, train_labels, epochs=20, validation_data=(test_data,test_labels),batch_size=128,callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20


<keras.callbacks.History at 0x1a599783580>

In [109]:
model.evaluate(test_data,test_labels)



[0.9566392302513123, 0.7297999858856201]

#### use only short discription to detect News Category

In [110]:
cleanedDisText=cleanText(list(X_description['short_description']))

In [112]:
cleanedDisText[89]

'out participant answered question which following trigger cause sleeplessness'

In [113]:
tokenizerD=tf.keras.preprocessing.text.Tokenizer(oov_token="OOV")

In [114]:
tokenizerD.fit_on_texts(cleanedDisText)

In [115]:
cleanedDisSequence=tokenizerD.texts_to_sequences(cleanedDisText)

In [119]:
cleanedDisPadSequence=tf.keras.preprocessing.sequence.pad_sequences(cleanedDisSequence,padding="post")

In [123]:
train_data, test_data, train_labels, test_labels = train_test_split(cleanedDisPadSequence,output, test_size=0.2, random_state=0)

In [125]:
train_data[0].shape[0]

134

In [126]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizerD.word_index)+1, output_dim=50,input_length=train_data[0].shape[0]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [127]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [128]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [129]:
model.fit(train_data, train_labels, epochs=20, validation_data=(test_data,test_labels),batch_size=128,callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.callbacks.History at 0x1a5c912ac50>

In [131]:
Y.nunique()

category    10
dtype: int64

#### use head , short discription , keywords

In [132]:
combinedData=df['headline'].astype('str')+" "+df['short_description'].astype('str')+" "+df['keywords'].astype('str')

In [135]:
type(combinedData)

pandas.core.series.Series

In [140]:
cleanedcombinedData=cleanText(combinedData)

In [141]:
tokenizerCombine=tf.keras.preprocessing.text.Tokenizer(oov_token="OOV")

In [142]:
tokenizerCombine.fit_on_texts(cleanedcombinedData)

In [144]:
cleanedcombinedDataSeq=tokenizerCombine.texts_to_sequences(cleanedcombinedData)

In [145]:
cleanedcombinedDataPadSeq=tf.keras.preprocessing.sequence.pad_sequences(cleanedcombinedDataSeq,padding="post")

In [146]:
train_data, test_data, train_labels, test_labels = train_test_split(cleanedcombinedDataPadSeq,output, test_size=0.2, random_state=0)

In [147]:
vSize=len(tokenizerCombine.word_index)+1

In [159]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vSize, output_dim=50,input_length=train_data[0].shape[0]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(10, activation='softmax')
])

In [160]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [161]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [162]:
model.fit(train_data, train_labels, epochs=20, validation_data=(test_data,test_labels),batch_size=128,callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


<keras.callbacks.History at 0x1a5d1c02260>

In [163]:
model.save("NewsClassification.h5")

In [None]:
##