In [1]:

from keras.datasets import imdb
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input, MaxPooling1D, Conv1D, GlobalMaxPooling1D
from keras.layers import Embedding
from keras.models import Model
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras_preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

In [2]:
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following packages will be REMOVED:
  libcudnn8-dev
The following held packages will be changed:
  libcudnn8
The following packages will be DOWNGRADED:
  libcudnn8
0 upgraded, 0 newly installed, 1 downgraded, 1 to remove and 3 not upgraded.
Need to get 430 MB of archives.
After this operation, 1,392 MB disk space will be freed.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  libcudnn8 8.1.0.77-1+cuda11.2 [430 MB]
Fetched 430 MB in 9s (48.2 MB/s)
(Reading database ... 123991 files and directories currently installed.)
Removing libcudnn8-dev (8.1.1.33-1+cuda11.2) ...
update-alternatives: removing manually selected alternative - switching libcudnn to auto mode
(Reading database ... 123968 files and directories currently inst

In [3]:
data = pd.read_csv('refined_ds_news2.csv')
df = data[['cleaned_data']].copy()

In [4]:
cleaned_data = []
for i in range(df.shape[0]):
  cleaned_data.append(df.cleaned_data.values[0])
 
sentiment = data['Category']

In [5]:
y = np.array(list(map(lambda x: 1 if x=="news liberal" else 0, sentiment)))

X_train, X_test,Y_train, Y_test = train_test_split(cleaned_data, y, test_size=0.2, random_state = 45)

In [6]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

words_to_index = tokenizer.word_index

In [7]:
print(words_to_index)

{'trump': 1, 'view': 2, 'follow': 3, 'political': 4, 'story': 5, 'flynn': 6, 'gop': 7, 'right': 8, 'late': 9, 'perspective': 10, 'email': 11, 'american': 12, 'hay': 13, 'editor': 14, 'george': 15, 'president': 16, 'spectacle': 17, 'spectator': 18, 'news': 19, 'contact': 20, 'blog': 21, 'letter': 22, 'files': 23, 'neumayr': 24, 'prescription': 25, 'read': 26, 'receive': 27, 'foundation': 28, 'midterm': 29, 'david': 30, 'life': 31, 'time': 32, 'go': 33, 'thanksgiving': 34, 'scott': 35, 'mckay': 36, 'hither': 37, 'yon': 38, 'run': 39, 'party': 40, 'report': 41, 'culture': 42, 'democrats': 43, 'daily': 44, 'folios': 45, 'consecutive': 46, 'policy': 47, 'government': 48, 'special': 49, 'authors': 50, 'submissions': 51, 'magazine': 52, 'itxu': 53, 'díaz': 54, 'memoriam': 55, 'night': 56, 'daniel': 57, 'john': 58, 'support': 59, 'election': 60, 'editors': 61, 'obituary': 62, 'premature': 63, 'thankful': 64, 'god': 65, 'turkeys': 66, 'true': 67, 'indians': 68, 'paul': 69, 'kengor': 70, 'franci

In [8]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map

In [9]:
word_to_vec_map = read_glove_vector('glove.6B.50d.txt')

maxLen = 10000

In [10]:
vocab_len = len(words_to_index)+1
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))
print(emb_matrix.shape)
for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)


(509, 50)


In [11]:
emb_matrix.shape

(509, 50)

In [12]:
def news_classify(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [13]:
def conv1d_model(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = Conv1D(512,3,activation='relu')(embeddings)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  X = Dropout(0.8)(X)
  X = MaxPooling1D(3)(X)

  X = GlobalMaxPooling1D()(X)

  X = Dense(256, activation='relu')(X)
  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [18]:
model = news_classify((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10000)]           0         
                                                                 
 embedding (Embedding)       (None, 10000, 50)         25450     
                                                                 
 lstm (LSTM)                 (None, 10000, 128)        91648     
                                                                 
 dropout (Dropout)           (None, 10000, 128)        0         
                                                                 
 lstm_1 (LSTM)               (None, 10000, 128)        131584    
                                                                 
 dropout_1 (Dropout)         (None, 10000, 128)        0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584

In [14]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

In [15]:
X_train_indices.shape

(80, 10000)

In [19]:
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
epochs = 10
batch_size = 64
history = model.fit(X_train_indices, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/10
Epoch 2/10
