In [1]:
#import libraries
import sklearn
import numpy as np
import pandas as pd
import sklearn.metrics
import random

from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups


# Importing Data

### 20 newsgroups

In [2]:
# Restrict the corpus to the two newsgroups used for binary classification
categories = ['alt.atheism', 'soc.religion.christian']

# Fetch the predefined 'train' and 'test' subsets for those categories
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories)
    for subset_name in ('train', 'test')
)


In [3]:
# Put the raw posts and their integer labels into one DataFrame per subset
newsgroups_train_df = pd.DataFrame({
    'Text': newsgroups_train.data,
    'Target': newsgroups_train.target,
})

newsgroups_bulk_df = pd.DataFrame({
    'Text': newsgroups_test.data,
    'Target': newsgroups_test.target,
})

# Merge both subsets into a single frame; it is re-split into
# train/val/test after tokenization.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in
# 2.0); ignore_index=True gives the combined frame unique row labels instead
# of repeating 0..n-1 from each subset.
news_df = pd.concat([newsgroups_train_df, newsgroups_bulk_df], ignore_index=True)
## Target indexes the category names, which scikit-learn keeps sorted:
## 0 - alt.atheism, 1 - soc.religion.christian
news_df.head()

Unnamed: 0,Text,Target
0,From: nigel.allen@canrem.com (Nigel Allen)\nSu...,1
1,From: marshall@csugrad.cs.vt.edu (Kevin Marsha...,0
2,From: tedr@athena.cs.uga.edu (Ted Kalivoda)\nS...,1
3,From: keith@cco.caltech.edu (Keith Allan Schne...,0
4,From: mayne@ds3.scri.fsu.edu (Bill Mayne)\nSub...,1


### IMDB

In [4]:
# Read only the review text and its sentiment label from the CSV
imdb_df = pd.read_csv(
    "/content/IMDB Dataset.csv",
    usecols=["review", "sentiment"],
    encoding='latin-1',
)
# Encode the label as an integer: 1 - positive, 0 - negative
imdb_df["sentiment"] = imdb_df["sentiment"].eq("positive").astype("int")
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Data Cleaning/Transformation

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
from string import punctuation
import re

#cleaning data method
def clean(text_list):
  """Normalise raw documents before tokenization.

  Each text is lower-cased and every run of non-word characters (anything
  other than letters, digits and underscore) is replaced by a single space.
  Leading/trailing runs also become a single space; nothing is stripped.

  Args:
    text_list: iterable of strings.

  Returns:
    A list with one cleaned string per input text, in input order.
  """
  # Raw string: '\W' inside a plain literal is an invalid escape sequence
  # (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
  # Compiling once also hoists the pattern lookup out of the loop.
  non_word_run = re.compile(r'\W+')
  return [non_word_run.sub(' ', text.lower()) for text in text_list]

### 20 newsgroups

In [7]:
# Store a cleaned copy of every post next to the raw text
news_df = news_df.assign(Clean=clean(news_df["Text"]))

# Spot-check one randomly drawn cleaned post
news_df["Clean"].sample(n=1)

# y is one-hot encoded in the tokenization cell that follows

909    from madhaus netcom com maddi hausmann subject...
Name: Clean, dtype: object

In [8]:
#tokenization

# Vocabulary cap handed to the tokenizer (and reused as the embedding size)
max_features = 2000
news_tokenizer = Tokenizer(num_words=max_features, split=' ')

# Build the word index. fit_on_texts() updates the tokenizer in place and
# returns None, so its result is deliberately not assigned.
# NOTE(review): the vocabulary is fitted on every row, including the rows
# that are later held out for validation and test.
news_tokenizer.fit_on_texts(news_df["Clean"])
# Map every post to a list of integer word indices
X_news_sequence = news_tokenizer.texts_to_sequences(news_df["Clean"])
# Pad to a common length; no maxlen is given, so the longest post sets it
news_X = pad_sequences(X_news_sequence)

# One-hot encode the labels: one column per distinct Target value
news_y = pd.get_dummies(news_df['Target']).values

In [9]:
#train_test_val_split
#train 0.6, val 0.2, test 0.2
# random_state pins the shuffle so the same rows land in each partition on
# every run; without it the split changes whenever the cell is re-executed.
X_bulk_news, X_test_news, y_bulk_news, y_test_news = train_test_split(
    news_X, news_y, test_size=0.2, random_state=42)
# 0.25 of the remaining 80% is 20% of the full data set
X_train_news, X_val_news, y_train_news, y_val_news = train_test_split(
    X_bulk_news, y_bulk_news, test_size=0.25, random_state=42)

### IMDB

In [10]:
# Store a cleaned copy of every review next to the raw text
imdb_df = imdb_df.assign(Clean=clean(imdb_df["review"]))

# Spot-check one randomly drawn cleaned review
imdb_df["Clean"].sample(n=1)


289    i would like to know if anyone know how i can ...
Name: Clean, dtype: object

In [11]:
#tokenization

# Vocabulary cap handed to the tokenizer (and reused as the embedding size)
max_features = 2000
imdb_tokenizer = Tokenizer(num_words=max_features, split=' ')

# Build the word index. fit_on_texts() updates the tokenizer in place and
# returns None, so its result is deliberately not assigned.
# NOTE(review): the vocabulary is fitted on every row, including the rows
# that are later held out for validation and test.
imdb_tokenizer.fit_on_texts(imdb_df["Clean"])
# Map every review to a list of integer word indices
X_imdb_sequence = imdb_tokenizer.texts_to_sequences(imdb_df["Clean"])
# Pad to a common length; no maxlen is given, so the longest review sets it
imdb_X = pad_sequences(X_imdb_sequence)

# One-hot encode the labels: one column per distinct sentiment value
imdb_y = pd.get_dummies(imdb_df['sentiment']).values

In [12]:
#train_test_val_split
#train 0.6, val 0.2, test 0.2
# random_state pins the shuffle so the same rows land in each partition on
# every run; without it the split changes whenever the cell is re-executed.
X_bulk_imdb, X_test_imdb, y_bulk_imdb, y_test_imdb = train_test_split(
    imdb_X, imdb_y, test_size=0.2, random_state=42)
# 0.25 of the remaining 80% is 20% of the full data set
X_train_imdb, X_val_imdb, y_train_imdb, y_val_imdb = train_test_split(
    X_bulk_imdb, y_bulk_imdb, test_size=0.25, random_state=42)

# Model Creation

In [14]:
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate,AveragePooling2D
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras import regularizers

### 20 newsgroups

In [16]:
def simple_news(input_length=None, vocab_size=None, embed_dim=128,
                filter_size=3, num_filters=100, drop=0.5, num_classes=2):
  """Build the (uncompiled) single-filter-size text CNN for the newsgroups data.

  Layers: Embedding -> Conv2D whose kernel covers `filter_size` tokens and the
  full embedding width -> max-pool over every window position ->
  Dense(84, tanh) -> Dropout -> softmax. The convolution and both dense
  layers carry an L2(0.01) kernel regulariser.

  Called without arguments it builds the same network as before; the
  keyword arguments expose the values that used to be hard-coded.

  Args:
    input_length: number of tokens per padded sequence. Defaults to
      news_X.shape[1], read at call time.
    vocab_size: input dimension of the embedding. Defaults to the global
      max_features, read at call time.
    embed_dim: size of each word embedding.
    filter_size: number of consecutive tokens covered by one filter.
    num_filters: number of convolution filters.
    drop: dropout rate applied after the hidden dense layer.
    num_classes: number of softmax output units.

  Returns:
    A keras Model mapping (batch, input_length) integer sequences to
    (batch, num_classes) class probabilities.

  Raises:
    ValueError: if input_length is smaller than filter_size.
  """
  if input_length is None:
    input_length = news_X.shape[1]
  if vocab_size is None:
    vocab_size = max_features
  if input_length < filter_size:
    # Fail early with a readable message instead of a negative-dimension
    # error from inside the convolution/pooling layers.
    raise ValueError(
        'input_length (%d) must be at least filter_size (%d)'
        % (input_length, filter_size))

  inputs = Input(shape=(input_length,))
  embedding = Embedding(vocab_size, embed_dim)(inputs)
  # Conv2D needs a channel axis: (tokens, embed_dim) -> (tokens, embed_dim, 1)
  reshape = Reshape((input_length, embed_dim, 1))(embedding)

  # The kernel spans the whole embedding width, so it only slides along tokens
  conv_0 = Conv2D(
      num_filters,
      (filter_size, embed_dim),
      activation='tanh',
      kernel_regularizer=regularizers.l2(0.01),
  )(reshape)
  # Pool window covers all input_length - filter_size + 1 positions,
  # leaving a single maximum per filter
  maxpool_0 = MaxPooling2D(
      (input_length - filter_size + 1, 1), strides=(1,1))(conv_0)

  flatten = Flatten()(maxpool_0)

  fc2 = Dense(
      units=84,
      activation='tanh',
      kernel_regularizer=regularizers.l2(0.01),
  )(flatten)
  dropout = Dropout(drop)(fc2)

  output = Dense(
      units=num_classes,
      activation='softmax',
      kernel_regularizer=regularizers.l2(0.01),
  )(dropout)

  return Model(inputs, output)

# Instantiate the newsgroups CNN and print its layer-by-layer summary
news_model = simple_news()
news_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 7472)]            0         
                                                                 
 embedding (Embedding)       (None, 7472, 128)         256000    
                                                                 
 reshape (Reshape)           (None, 7472, 128, 1)      0         
                                                                 
 conv2d (Conv2D)             (None, 7470, 1, 100)      38500     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 1, 1, 100)        0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 100)               0         
                                                             

In [17]:
# Configure training: categorical cross-entropy loss, Adam optimizer,
# accuracy reported as the metric
news_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


### IMDB

In [18]:
def simple_imdb(input_length=None, vocab_size=None, embed_dim=128,
                filter_size=3, num_filters=100, drop=0.5, num_classes=2):
  """Build the (uncompiled) single-filter-size text CNN for the IMDB data.

  Layers: Embedding -> Conv2D whose kernel covers `filter_size` tokens and the
  full embedding width -> max-pool over every window position ->
  Dense(84, tanh) -> Dropout -> softmax. The convolution and both dense
  layers carry an L2(0.01) kernel regulariser.

  Called without arguments it builds the same network as before; the
  keyword arguments expose the values that used to be hard-coded.

  Args:
    input_length: number of tokens per padded sequence. Defaults to
      imdb_X.shape[1], read at call time.
    vocab_size: input dimension of the embedding. Defaults to the global
      max_features, read at call time.
    embed_dim: size of each word embedding.
    filter_size: number of consecutive tokens covered by one filter.
    num_filters: number of convolution filters.
    drop: dropout rate applied after the hidden dense layer.
    num_classes: number of softmax output units.

  Returns:
    A keras Model mapping (batch, input_length) integer sequences to
    (batch, num_classes) class probabilities.

  Raises:
    ValueError: if input_length is smaller than filter_size.
  """
  if input_length is None:
    input_length = imdb_X.shape[1]
  if vocab_size is None:
    vocab_size = max_features
  if input_length < filter_size:
    # Fail early with a readable message instead of a negative-dimension
    # error from inside the convolution/pooling layers.
    raise ValueError(
        'input_length (%d) must be at least filter_size (%d)'
        % (input_length, filter_size))

  inputs = Input(shape=(input_length,))
  embedding = Embedding(vocab_size, embed_dim)(inputs)
  # Conv2D needs a channel axis: (tokens, embed_dim) -> (tokens, embed_dim, 1)
  reshape = Reshape((input_length, embed_dim, 1))(embedding)

  # The kernel spans the whole embedding width, so it only slides along tokens
  conv_0 = Conv2D(
      num_filters,
      (filter_size, embed_dim),
      activation='tanh',
      kernel_regularizer=regularizers.l2(0.01),
  )(reshape)
  # Pool window covers all input_length - filter_size + 1 positions,
  # leaving a single maximum per filter
  maxpool_0 = MaxPooling2D(
      (input_length - filter_size + 1, 1), strides=(1,1))(conv_0)

  flatten = Flatten()(maxpool_0)

  fc2 = Dense(
      units=84,
      activation='tanh',
      kernel_regularizer=regularizers.l2(0.01),
  )(flatten)
  dropout = Dropout(drop)(fc2)

  output = Dense(
      units=num_classes,
      activation='softmax',
      kernel_regularizer=regularizers.l2(0.01),
  )(dropout)

  return Model(inputs, output)

# Instantiate the IMDB CNN and print its layer-by-layer summary
imdb_model = simple_imdb()
imdb_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 2017)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 2017, 128)         256000    
                                                                 
 reshape_1 (Reshape)         (None, 2017, 128, 1)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 2015, 1, 100)      38500     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 1, 1, 100)        0         
 2D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 100)               0         
                                                           

In [22]:
# Configure training: categorical cross-entropy loss, Adam optimizer,
# accuracy reported as the metric
imdb_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


# Training

### 20 newsgroups

In [20]:
# Train for 5 epochs in mini-batches of 10 posts, passing the validation
# split so it is scored alongside training
news_fit = news_model.fit(
    X_train_news,
    y_train_news,
    validation_data=(X_val_news, y_val_news),
    batch_size=10,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### IMDB

In [24]:
# Train for 5 epochs in mini-batches of 256 reviews, passing the validation
# split so it is scored alongside training
imdb_fit = imdb_model.fit(
    X_train_imdb,
    y_train_imdb,
    validation_data=(X_val_imdb, y_val_imdb),
    batch_size=256,
    epochs=5,
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Results

### 20 newsgroups

In [25]:
# Score the held-out test split; the result unpacks into the loss and the
# accuracy metric the model was compiled with
news_loss, news_acc = news_model.evaluate(
    X_test_news,
    y_test_news,
    verbose=0,
)
print('Test loss:', news_loss)
print('Test accuracy:', news_acc)

Test loss: 0.18993638455867767
Test accuracy: 0.9722222089767456


### IMDB

In [26]:
# Score the held-out test split; the result unpacks into the loss and the
# accuracy metric the model was compiled with
imdb_loss, imdb_acc = imdb_model.evaluate(
    X_test_imdb,
    y_test_imdb,
    verbose=0,
)
print('Test loss:', imdb_loss)
print('Test accuracy:', imdb_acc)

Test loss: 0.4509839713573456
Test accuracy: 0.8513000011444092
