In [1]:
#import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, LSTM, Embedding
from tensorflow.keras.preprocessing import sequence 
from tensorflow.keras.utils import to_categorical

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Data Assignment

In [3]:
# load the 20 newsgroups dataset and list the fields of the returned object
data = fetch_20newsgroups()
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
# show the names of all available newsgroups (the class labels)
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# the two newsgroups that are told apart in this notebook (binary classification)
categoriesReligion = ['alt.atheism', 'soc.religion.christian']

In [6]:
# load only the e-mails that belong to the selected categories
data_toBeAnalyzed = fetch_20newsgroups(categories = categoriesReligion)

In [7]:
# data assignment: X holds the raw e-mail texts, y the integer class labels
X = data_toBeAnalyzed['data']
y = data_toBeAnalyzed['target']     # 0: atheist, 1: christian

In [8]:
# number of e-mails in X
len(X)

1079

## Data Preprocessing

### Clean Text: vectorize, replace

Inspect the default settings of `CountVectorizer` to read off its tokenisation regex (called `token_pattern`); the same pattern is used below to extract the word tokens from every e-mail.

In [9]:
# instantiate a CountVectorizer with default arguments and display it,
# so that its default 'token_pattern' can be read from the repr
c = CountVectorizer()
c

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [None]:
#X[0]
#X[0].replace('\n','')

In [None]:
# tokenise every e-mail with the default 'token_pattern' of CountVectorizer
import re

def text_cleaning(text):
    """Split every e-mail into its list of word tokens.

    Parameters
    ----------
    text : iterable of str
        The raw e-mail texts.

    Returns
    -------
    list of list of str
        One token list per e-mail, in input order. A token is a
        non-overlapping match of the pattern below, i.e. a run of at least
        two word characters delimited by word boundaries.
    """
    return [re.findall('(?u)\\b\\w\\w+\\b', document) for document in text]

# one token list per e-mail in X
cleaned_X = text_cleaning(X)

### Determine words for multi-categorical output

In [None]:
# Build the vocabulary: every distinct token that occurs in any e-mail.
# The set removes duplicates; sorting it gives a stable order. A plain
# `list(set(...))` of strings can come out in a different order after each
# kernel restart (string hashing is randomised per process), which would
# change every word index between runs.
word_list = sorted({word for email in cleaned_X for word in email})

# might be further tokenized:
#import spacy
#model_spacy = spacy.load('en_core_web_md')

### Forming word vectors

In [None]:
# Look-up tables between words and their integer indices:
#   word_to_number: word  -> index
#   number_to_word: index -> word
# Indices start at 1 so that 0 stays free for the padding value which
# `pad_sequences` inserts by default.
word_to_number = {}
number_to_word = {}

for i, word in enumerate(word_list, start=1):
    word_to_number[word] = i
    number_to_word[i] = word

In [None]:
# encode every e-mail as the list of `word_to_number` look-ups of its tokens
# (intended mapping: word -> integer index)
word_vectors = [[word_to_number[word] for word in email] for email in cleaned_X]

In [None]:
#np.array(word_vectors[0])

#### Check if it worked (cross-check)

In [None]:
# first encoded token of the first e-mail next to the original token
word_vectors[0][0], cleaned_X[0][0]

In [None]:
# look the same pair up in both dictionaries
# NOTE(review): 20496 is hard-coded, presumably the index displayed by the
# previous cell -- confirm; it changes whenever the vocabulary order changes
number_to_word[20496], word_to_number['From']

### Find the longest text, then pad all sequences to the length of that longest mail --> the input shape of the NN has to be consistent!

In [None]:
# max_len = w_list.sort(key=len)

### Sequence padding: adapt sizes of word vectors

In [None]:
# length (in tokens) of the longest e-mail; every sequence is padded to it
max_len = max(len(vector) for vector in word_vectors)

# padding='pre' puts the fill values in front of the shorter sequences
padding_word_vectors = sequence.pad_sequences(word_vectors, maxlen=max_len, padding='pre')

### Check for size of word_list

In [None]:
# number of distinct words plus one
# NOTE(review): the +1 presumably reserves an extra index (e.g. 0 for the
# padding value) and the result is meant as the Embedding input size -- confirm
word_size = len(word_list) + 1

### Train, test, split

In [None]:
# Fixed seed -> the same split after every "Restart & Run All";
# stratify=y keeps the class ratio identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    padding_word_vectors, y,
    random_state=42,
    stratify=y,
)

## Model definition

In [None]:
network = Sequential()

#input layer: one integer word index per time step; the sequence length is
#the width of the padded training matrix
network.add(Input(shape=(X_train.shape[1],)))

#embedding layer: maps each of the `word_size` possible indices to a
#64-dimensional vector
network.add(Embedding(word_size, 64))

#hidden layer
network.add(LSTM(512))

#output layer: a single sigmoid unit for the binary decision
network.add(Dense(1, activation='sigmoid'))

In [None]:
# print an overview of the layers of the network
network.summary()

## Model compilation

In [None]:
network.compile(optimizer='rmsprop',
               loss='categorical_crossentropy',
               metrics=['accuracy']
               )

## Model fit

In [None]:
network.fit(X_train, to_categorical(y_train),
           epochs=5,
           batch_size=64,
           validation_split=0.2)

In [None]:
# evaluate the fitted network on the training data and print the result
score_train = network.evaluate(X_train, y_train, batch_size=4)
print('score_train=', score_train)

## Predict

In [None]:
# Predict the class of the first test e-mail. `predict` expects a batch axis,
# so the single sequence is reshaped to (1, sequence_length); rounding turns
# the sigmoid output into a hard 0/1 decision.
np.round(network.predict(X_test[0].reshape(1, -1)))

In [None]:
# evaluate the network on the held-out test data
score_test = network.evaluate(X_test, y_test, batch_size=4)
print('score_test=', score_test)

# hard 0/1 predictions for the whole test set, flattened to shape (n,)
y_pred = np.round(network.predict(X_test)).astype(int).ravel()

# sanity check: evaluate the network against its own rounded predictions
score_pred = network.evaluate(X_test, y_pred, batch_size=4)
print('score_pred=', score_pred)

## Save model

In [None]:
# store the trained network in the file 'model.h5'
# NOTE(review): newer Keras releases recommend the native `.keras` format --
# confirm before changing the file name
network.save('model.h5')