In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.linear_model import SGDClassifier
import logging
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Import Dataset/Downloaded from Kaggle

In [2]:
df_news = pd.read_json("/Users/orah82/Downloads/News_Category_Dataset_v2.json", lines = True)
df_news.columns

Index(['authors', 'category', 'date', 'headline', 'link', 'short_description'], dtype='object')

In [3]:
df_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 6 columns):
authors              200853 non-null object
category             200853 non-null object
date                 200853 non-null datetime64[ns]
headline             200853 non-null object
link                 200853 non-null object
short_description    200853 non-null object
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


### First Rows of Data

In [4]:
df_news.head(10)

Unnamed: 0,authors,category,date,headline,link,short_description
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ..."
5,Ron Dicker,ENTERTAINMENT,2018-05-26,Morgan Freeman 'Devastated' That Sexual Harass...,https://www.huffingtonpost.com/entry/morgan-fr...,"""It is not right to equate horrific incidents ..."
6,Ron Dicker,ENTERTAINMENT,2018-05-26,Donald Trump Is Lovin' New McDonald's Jingle I...,https://www.huffingtonpost.com/entry/donald-tr...,"It's catchy, all right."
7,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Amazon Prime That’s New This ...,https://www.huffingtonpost.com/entry/amazon-pr...,There's a great mini-series joining this week.
8,Andy McDonald,ENTERTAINMENT,2018-05-26,Mike Myers Reveals He'd 'Like To' Do A Fourth ...,https://www.huffingtonpost.com/entry/mike-myer...,"Myer's kids may be pushing for a new ""Powers"" ..."
9,Todd Van Luling,ENTERTAINMENT,2018-05-26,What To Watch On Hulu That’s New This Week,https://www.huffingtonpost.com/entry/hulu-what...,You're getting a recent Academy Award-winning ...


### List all Categories 

In [5]:
print(df_news['category'].unique())

['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'THE WORLDPOST' 'GOOD NEWS' 'WORLDPOST' 'FIFTY' 'ARTS'
 'WELLNESS' 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE'
 'WEDDINGS' 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


In [6]:
len(df_news['category'].unique())

41

In [7]:
df_news['category'].value_counts().plot(kind='bar', figsize=(18,12))


<matplotlib.axes._subplots.AxesSubplot at 0x10b95bf60>

### Merge WorldPost and The WorldPost

In [8]:
#Bascially the same categories 
df_news.category = df_news.category.map(lambda x: "WORLDPOST" if x == "THEWORLDPOST" else x)

### Merge Headline and Short Description into new column named "Text"

In [9]:
df_news['text'] = df_news['headline'] +" "+ df_news['short_description']

### Data pre-processing

load in NLTK utilities 

In [10]:
import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/orah82/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/orah82/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/orah82/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/orah82/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### text cleaning function, removes stop words and non alphanumeric for data

In [11]:
stop_words = set(stopwords.words('english'))
wn = WordNetLemmatizer()

In [12]:
my_sw = ['make', 'amp', 'news', 'new', 'time', 'u', 's', 'photos', 'get', 'say',]
def black_txt(token):
    return token not in stop_words and token not in list(string.punctuation) and len(token)>2 and token not in my_sw

In [13]:
def clean_text(text):
    clean_text = []
    clean_text2 = []
    text = re.sub("'", "", text)
    text=re.sub("\\d|\\W+"," ",text)
    clean_text =[wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if black_txt(word)]
    clean_text2 = [word for word in clean_text if black_txt(word)]
    return " ".join(clean_text2)

In [14]:
df_news.short_description[0]

'She left her husband. He killed their children. Just another day in America.'

### Processing the data and TF-IDF

Example Output

In [15]:
df_news.text[1]

"Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song Of course it has a song."

In [16]:
clean_text(df_news.text[1])

'smith join diplo nicky jam world cup official song course song'

In [17]:
clean_text(df_news.text[8])

'mike myers reveal hed like fourth austin power film myers kid may push power film anyone'

### Next we are going to create some news variables columns(like metadata) to improve the quality of our classifier with the help if textblob package. We will make:

- Polarity: to check the sentiment of the text
- Subjectivity: to check if the text is objective or subjective
- Len: The number of word in the text 

In [18]:
blob = TextBlob((df_news.text[2]))
str(blob.correct())

'Hugh Grant Carries For The First Time It Age 57 The actor and his longtime girlfriend Anna Eberstein tied the knot in a civil ceremony.'

In [19]:
def polarity_txt(text):
    return TextBlob(text).sentiment[0]

In [20]:
def subj_txt(text):
    return TextBlob(text).sentiment[1]

In [21]:
def len_text(text):
    if len(text.split())>0:
        return len(set(clean_text(text).split()))/ len(text.split())
    else:
        return 0

In [22]:
df_news['polarity'] = df_news['text'].apply(polarity_txt)
df_news.head(2)

Unnamed: 0,authors,category,date,headline,link,short_description,text,polarity
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,-0.05
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0


In [23]:
df_news['subjectivity'] = df_news['text'].apply(subj_txt)
df_news.head(2)

Unnamed: 0,authors,category,date,headline,link,short_description,text,polarity,subjectivity
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,-0.05,0.266667
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0,0.0


In [24]:
df_news['len'] = df_news['text'].apply(len_text)
df_news.head()

Unnamed: 0,authors,category,date,headline,link,short_description,text,polarity,subjectivity,len
0,Melissa Jeltsen,CRIME,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,-0.05,0.266667,0.444444
1,Andy McDonald,ENTERTAINMENT,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,0.0,0.0,0.5
2,Ron Dicker,ENTERTAINMENT,2018-05-26,Hugh Grant Marries For The First Time At Age 57,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 5...,0.25,0.333333,0.56
3,Ron Dicker,ENTERTAINMENT,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,-0.051768,0.498737,0.72
4,Ron Dicker,ENTERTAINMENT,2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,0.2,0.2,0.576923


### Make the custom class for feature union transformer of sklearn

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]
    

# Extracts features from each document for DictVectorizer
class TextStats(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    
    def transform(self, data):
        return [{'pos': row['polarity'], 'sub': row['subjectivity'], 'len':row['len']} for _,row in data.iterrows()]
    

### Make Pipeline

In [27]:
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
        #Pipeline to pull features from text
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer(min_df =3, max_df=0.2, max_features=None,
                  strip_accents='unicode', analyzer = 'word', token_pattern=r'\w{1,}',
                  ngram_range=(1,10), use_idf=1, smooth_idf=1,sublinear_tf=1,
                  stop_words= None, preprocessor=clean_text)),
            ])),
        #Pipeline to pull metadata features
            ('stats', Pipeline([
                ('selector', ItemSelector(key=['polarity', 'subjectivity', 'len'])),
                ('stats', TextStats()), #returns a list of dicts
                ('vect', DictVectorizer()), #puts dicts in feature matrix
            ])),
    
        ],
        
        
        #weight components in FeatureUnion
        transformer_weights={
            'text': 0.9,
            'stats': 1.5,
        },
    
    ))
])

### Build pipeline

In [28]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
seed = 42
X = df_news[['text', 'polarity', 'subjectivity', 'len']]
y = df_news['category']
encoder = LabelEncoder()
y= encoder.fit_transform(y)
x_train, x_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state= seed, stratify=y)

In [29]:
pipeline.fit(x_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='text')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='conte...'=', sort=True,
        sparse=True))]))],
       transformer_weights={'text': 0.9, 'stats': 1.5}))])

### Transform and train the Machine Learning Models

In [30]:
%%time 
train_vec = pipeline.transform(x_train)
test_vec = pipeline.transform(x_test)
print("Checking that the number of features in train and test correspond: %s - %s" % (train_vec.shape, test_vec.shape))

Checking that the number of features in train and test correspond: (160682, 190148) - (40171, 190148)
CPU times: user 1min 31s, sys: 576 ms, total: 1min 32s
Wall time: 1min 31s


In [31]:
clf_sv  = LinearSVC(C=1, class_weight='balanced', multi_class='ovr', random_state=40, max_iter=10000)#Support Vector Machines
clf_sgd = SGDClassifier(max_iter=200) #Stochastic Gradient Classifier

In [34]:
from sklearn.model_selection import cross_val_score

clfs = [clf_sv, clf_sgd]
cv =3
for clf in clfs:
    scores = cross_val_score(clf, pipeline.transform(x_train), y_train, cv=cv, scoring ="accuracy")
    print(scores)
    print(("Mean score: {0:3f} (+/-{1:3f})").format(
    np.mean(scores), np.std(scores)))

[0.59964908 0.60032112 0.60045568]
Mean score: 0.600142 (+/-0.000353)




[0.57607377 0.5796912  0.58009935]
Mean score: 0.578621 (+/-0.001809)


In [35]:
from sklearn.metrics import  classification_report
clf_sv.fit(pipeline.transform(x_train), y_train)
y_pred = clf_sv.predict(pipeline.transform(x_test))
list_result = []
list_result.append(('SVC',accuracy_score(y_test, y_pred)))
clf_sgd.fit(pipeline.transform(x_train), y_train)
y_pred = clf_sgd.predict(pipeline.transform(x_test))
list_result.append(("SGD", accuracy_score(y_test, y_pred)))

In [36]:
import spacy
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz#egg=en_core_web_lg==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    100% |████████████████████████████████| 852.3MB 18.9MB/s ta 0:00:011    46% |███████████████                 | 398.5MB 45.8MB/s eta 0:00:10    58% |██████████████████▉             | 500.5MB 37.9MB/s eta 0:00:10    72% |███████████████████████         | 615.2MB 59.4MB/s eta 0:00:04    76% |████████████████████████▋       | 655.7MB 41.8MB/s eta 0:00:05    82% |██████████████████████████▌     | 705.3MB 37.7MB/s eta 0:00:04    96% |██████████████████████████████▊ | 819.4MB 36.1MB/s eta 0:00:01
[?25hInstalling collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... [?25ldone
[?25hSuccessfully installed en-core-web-lg-2.0.0

[93m    Linking succe

### Neural Net and Spacy Models 

In [37]:
nlp =spacy.load('en_core_web_lg')

In [99]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, LSTM, Embedding
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

from tensorflow.keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, Add, Flatten, SpatialDropout1D
from tensorflow.keras.layers import GlobalAveragePooling1D, BatchNormalization, concatenate, GRU
from keras.layers import Reshape, merge, Concatenate, Lambda, Average
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
from keras import initializers, regularizers, constraints

from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional, Activation, Conv1D, GRU, TimeDistributed
from tensorflow.python.framework import ops
ops.reset_default_graph()


In [41]:
from sklearn.model_selection import train_test_split
import time
X = df_news['text']
y = df_news['category']
encoder = LabelEncoder()
y= encoder.fit_transform(y)
Y = np_utils.to_categorical(y)
vectorizer = TfidfVectorizer(min_df=3, max_df=0.2, max_features=None,
                            strip_accents= 'unicode', analyzer ='word', token_pattern=r'\w{1,}',
                            use_idf=1, smooth_idf=1, sublinear_tf=1,
                            stop_words=None, preprocessor=clean_text)

In [42]:
seed =42 
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed,
                                                   stratify = y)
vectorizer.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function clean_text at 0x1a340a12f0>, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [44]:
word2idx ={word: idx for idx, word in enumerate(vectorizer.get_feature_names())}
tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()

def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes

X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in x_train]
print(X_train_sequences[0])

[9816, 20426, 10559, 25144, 4603, 15407, 20097, 19671, 8892, 19671, 25144, 4603, 26387, 11170, 3994, 12035, 20252, 19001, 9816, 19001]


In [49]:
# Calulate the max length of a text

MAX_SEQ_LENGTH = 60

N_FEATURES = len(vectorizer.get_feature_names())
X_train_sequences = pad_sequences(X_train_sequences, maxlen= MAX_SEQ_LENGTH, value=N_FEATURES)

print(X_train_sequences[0])

[30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022
 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022
 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022 30022
 30022 30022 30022 30022  9816 20426 10559 25144  4603 15407 20097 19671
  8892 19671 25144  4603 26387 11170  3994 12035 20252 19001  9816 19001]


In [52]:
X_test_sequences = [to_sequence(tokenize, preprocess,word2idx,x) for x in x_test]
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGTH, value=N_FEATURES)

### Making the spacy embeding

In [53]:
EMBEDDING_LEN = 300

embedding_index =np.zeros((len(vectorizer.get_feature_names()) + 1, EMBEDDING_LEN))
for word, idx in word2idx.items():
    try: 
        embedding = nlp.vocab[word].vector
        embedding_index[idx] = embedding
    except:
        pass
    
print("EMBEDDING_LEN=", EMBEDDING_LEN)        

EMBEDDING_LEN= 300


### LSTM Model

In [67]:
model = Sequential()
model.add(Embedding(len(vectorizer.get_feature_names())+1,
                   EMBEDDING_LEN, #Embedding Size
                   weights= [embedding_index],
                   input_length= MAX_SEQ_LENGTH,
                   trainable=False))
model.add(LSTM(300, dropout = 0.2))
model.add(Dense(len(set(y)), activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 300)           9006900   
_________________________________________________________________
unified_lstm_1 (UnifiedLSTM) (None, 300)               721200    
_________________________________________________________________
dense_1 (Dense)              (None, 41)                12341     
Total params: 9,740,441
Trainable params: 733,541
Non-trainable params: 9,006,900
_________________________________________________________________
None


In [69]:
model.fit(X_train_sequences,y_train,
          epochs=15, batch_size=150, verbose=1,
          validation_split=0.1)

scores = model.evaluate(X_test_sequences,y_test, verbose =1)
print("Accuracy:", scores[1]) #model score
list_result.append(("LSTM", scores[1]))

Train on 144613 samples, validate on 16069 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy: 0.53919494


In [70]:
len(x_train.keys())

160682

### Model LSTM and concatenate new columns

In [78]:
import tensorflow as tf
from tensorflow.keras.models import Model


In [80]:
text_data = Input(shape=(MAX_SEQ_LENGTH,), name='text')
meta_data = Input(shape=(3,),name ='meta')
x=(Embedding(len(vectorizer.get_feature_names())+1,
                    EMBEDDING_LEN,
                    weights=[embedding_index],
                    input_length=MAX_SEQ_LENGTH,
                    trainable= False))(text_data)
x2= ((LSTM(300, dropout=0.2, recurrent_dropout=0.2)))(x)
x4= concatenate([x2, meta_data])
x5= Dense(150, activation='relu')(x4)
x6= Dropout(0.25)(x5)
x7=BatchNormalization()(x6)
out=(Dense(len(set(y)), activation='softmax'))(x7)
model= Model(inputs=[text_data, meta_data], outputs=out)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 300)      9006900     text[0][0]                       
__________________________________________________________________________________________________
unified_lstm_2 (UnifiedLSTM)    (None, 300)          721200      embedding_2[0][0]                
__________________________________________________________________________________________________
meta (InputLayer)               [(None, 3)]          0                                            
____________________________________________________________________________________________

In [81]:
df_cat_train = df_news.iloc[x_train.index][['polarity', 'subjectivity', 'len']]
df_cat_test = df_news.iloc[x_test.index][['polarity', 'subjectivity', 'len']]

In [83]:
model.fit([X_train_sequences, df_cat_train], y_train,
         epochs=12, batch_size=128, verbose=1,
         validation_split=0.1)

scores= model.evaluate([X_test_sequences, df_cat_test], y_test, verbose=1)
print('Accuracy:', scores[1])
list_result.append(("LSTM with Multi-Input", scores[1]))

Train on 144613 samples, validate on 16069 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12
Accuracy: 0.5470613


### LSTM with Attention

In [101]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
    

x=(Embedding(len(vectorizer.get_feature_names())+1,
                    EMBEDDING_LEN,
                    weights=[embedding_index],
                    input_length=MAX_SEQ_LENGTH,
                    trainable= False))(text_data)
x2= ((LSTM(300, dropout=0.2, recurrent_dropout=0.2)))(x)
x4= concatenate([x2, meta_data])
x5= Dense(150, activation='relu')(x4)
x6= Dropout(0.25)(x5)
x7=BatchNormalization()(x6)
out=(Dense(len(set(y)), activation='softmax'))(x7)

AttentionLSTM = Model(inputs=[text_data, meta_data ], outputs=out)
AttentionLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

AttentionLSTM.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, 60)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 60, 300)      9006900     text[0][0]                       
__________________________________________________________________________________________________
unified_lstm_1 (UnifiedLSTM)    (None, 300)          721200      embedding_1[0][0]                
__________________________________________________________________________________________________
meta (InputLayer)               [(None, 3)]          0                                            
______________________________________________________________________________________________

In [102]:
AttentionLSTM.fit([X_train_sequences, df_cat_train], y_train, 
          epochs=13, batch_size=128, verbose=1, 
          validation_split=0.1)
 
scores = AttentionLSTM.evaluate([X_test_sequences, df_cat_test],y_test, verbose=1)
print("Accuracy:", scores[1])  # 
list_result.append(("LSTM with Attention", scores[1]))

Train on 144613 samples, validate on 16069 samples
Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13
Accuracy: 0.53844815


### Model Comparison

In [103]:
pd.DataFrame(list_result, columns=['model', 'accuracy'])

Unnamed: 0,model,accuracy
0,SVC,0.604939
1,SGD,0.581638
2,LSTM,0.539195
3,LSTM with Multi-Input,0.547061
4,LSTM with Attention,0.538448


### Do news articles from different categories have different writing styles?

#### Top Words by Category

In [120]:
vectorizer = TfidfVectorizer( min_df =3, max_df=0.2, max_features=None, 
                    strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                    ngram_range=(1, 1), use_idf=1,smooth_idf=1,sublinear_tf=1,
                    stop_words = None, preprocessor=clean_text)
vectorizer.fit(df_news.category)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.2, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function clean_text at 0x1a340a12f0>, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [121]:
def create_tf_matrix(category):
    return vectorizer.transform(df_news[df_news.category == category].text)

def create_term_freq(matrix, cat):
  category_words = matrix.sum(axis=0)
  category_words_freq = [(word, category_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
  return pd.DataFrame(list(sorted(category_words_freq, key = lambda x: x[1], reverse=True)),columns=['Terms', cat])

for cat in df_news.category.unique():
  print("Top 10 terms for: ", cat)
  df_right = create_term_freq(create_tf_matrix(cat), cat).head(10)
  print(df_right)
  print("###############")
  if cat != 'CRIME':
    df_top5_words = df_top5_words.merge(df_right, how='outer')
  else:
    df_top5_words = df_right.copy()
  print(df_top5_words.shape )

Top 10 terms for:  CRIME
     Terms       CRIME
0     home  122.780746
1    crime   61.279350
2    black   54.603386
3    drink   50.149849
4     live   47.139003
5   parent   42.693406
6    women   42.479414
7  college   26.158697
8     good   24.768079
9    money   22.500460
###############
(10, 2)
Top 10 terms for:  ENTERTAINMENT
    Terms  ENTERTAINMENT
0    live     356.901616
1    good     336.142522
2   world     331.985059
3   women     239.054035
4   black     204.679678
5    home     153.132509
6  comedy     145.462310
7   media     114.164769
8   voice     109.243520
9  parent      79.957844
###############
(14, 3)
Top 10 terms for:  WORLD NEWS
      Terms  WORLD NEWS
0     world  107.477013
1     women   57.656851
2     media   30.497552
3      live   27.490777
4      home   26.794834
5      good   15.763163
6     black   13.506478
7    travel   12.679764
8      food   12.658104
9  politics   10.013183
###############
(17, 4)
Top 10 terms for:  IMPACT
       Terms      IMPA

    Terms  THE WORLDPOST
0   world     173.278804
1   women      80.096571
2    live      73.967054
3   media      64.619845
4    home      61.625699
5  travel      20.917176
6    good      19.760364
7   crime      19.265028
8    food      18.494040
9   voice      14.081013
###############
(35, 28)
Top 10 terms for:  GOOD NEWS
     Terms  GOOD NEWS
0     home  66.795463
1    world  52.287612
2     live  47.401297
3     good  36.989055
4     food  21.952309
5    money  18.645634
6  college  13.307826
7   parent   8.693309
8    style   6.000000
9    media   5.820593
###############
(35, 29)
Top 10 terms for:  WORLDPOST
      Terms   WORLDPOST
0     world  212.983439
1      live   55.195174
2     women   48.587433
3      home   28.514555
4     media   25.608106
5      good   21.343159
6  politics   18.544320
7  business   17.713685
8   culture   17.280580
9     voice   14.729446
###############
(35, 30)
Top 10 terms for:  FIFTY
     Terms       FIFTY
0     live  106.491242
1    women   63