In [1]:
import random
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

from tensorflow import keras

### Data import - Heart Failure

In [2]:
df = pd.read_csv("heart_failure.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [3]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


## Classification with multilayred NN

In [4]:
cols = ['age','anaemia','creatinine_phosphokinase','diabetes','ejection_fraction','high_blood_pressure','platelets','serum_creatinine','serum_sodium','sex','smoking','time','DEATH_EVENT']

### Normalisation

In [5]:
for column in cols[:-1]:
    df[column] = (df[column] - df[column].mean())/df[column].std()

### Data preparation

In [6]:
Y = df.iloc[:, -1].values.reshape(-1, 1)
Y = OneHotEncoder().fit_transform(Y).toarray()
X = df.iloc[:, :-1].values

num_samples = X.shape[0]
train_samples = int(0.75*num_samples)

indexes = np.arange(num_samples)
random.shuffle(indexes)

X, Y = X[indexes], Y[indexes]
X_train, Y_train = X[:train_samples, :], Y[:train_samples]
X_test, Y_test = X[train_samples:, :], Y[train_samples:]

### Model initialising

In [7]:
inputs = keras.Input(shape=(12,))
x = keras.layers.Dense(64, activation="relu")(inputs)

outputs = keras.layers.Dense(2, activation="softmax")(x)

fc_model = keras.Model(inputs=inputs, outputs=outputs, name="fc_model")
fc_model.summary()

Model: "fc_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 12)]              0         
_________________________________________________________________
dense (Dense)                (None, 64)                832       
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 962
Trainable params: 962
Non-trainable params: 0
_________________________________________________________________


In [8]:
fc_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
# training
fc_model.fit(X_train, Y_train, batch_size=4, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a2306ab220>

In [9]:
# evaluation
test_scores = fc_model.evaluate(X_test, Y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

3/3 - 0s - loss: 0.4285 - accuracy: 0.8133
Test loss: 0.4285261034965515
Test accuracy: 0.8133333325386047


### CNN

In [10]:
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

### Model initialising

In [11]:
inputs = keras.Input(shape=(28, 28, 1))

x = keras.layers.Conv2D(64, 3, activation="relu")(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Flatten()(x)

outputs = keras.layers.Dense(10)(x)

cnn_model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_model")
cnn_model.summary()

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 26, 64)        640       
_________________________________________________________________
batch_normalization (BatchNo (None, 26, 26, 64)        256       
_________________________________________________________________
flatten (Flatten)            (None, 43264)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                432650    
Total params: 433,546
Trainable params: 433,418
Non-trainable params: 128
_________________________________________________________________


### Training

In [12]:
cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)

cnn_model.fit(x_train, y_train, batch_size=256, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a2383688e0>

### Eval

In [13]:
test_scores = cnn_model.evaluate(x_test, y_test, verbose=2)

print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

313/313 - 1s - loss: 0.6693 - accuracy: 0.8743
Test loss: 0.669260561466217
Test accuracy: 0.8743000030517578


### RNN

In [14]:
df = pd.read_csv("./twits_classification.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [15]:
def delete_stopwords(str_x):
    words = str_x.split(' ')
    neutral_words = ['people', 'wikipedia', 'one', 'say', 'page', 'know', 'go', 'back', 'take', 'see', 'look', 'article',
                     'edit', 'got', 'thing', 'want', 'make']
    new_words = list()
    for word in words:
        if word not in stopwords.words('english') or word not in neutral_words:
            new_words.append(word)
    return ' '.join(new_words)

In [16]:
from nltk import download
download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ocean\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# delete all unnecessary symbols
df["comment_text"] = df["comment_text"].map(lambda x: re.sub(r'[^\w]', ' ', x))
# lower all words
df["comment_text"] = df["comment_text"].map(lambda x: x.lower())
# delete all stopwords
df["comment_text"] = df["comment_text"].map(delete_stopwords)

In [18]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["comment_text"].values).toarray()
Y = df.iloc[:, 3:].values
num_samples = X.shape[0]
train_samples = int(0.75*num_samples)
indexes = np.arange(num_samples)
random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
x_train, y_train = X[:train_samples, :], Y[:train_samples]
x_test, y_test = X[train_samples:, :], Y[train_samples:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(750, 9374) (750, 6)
(250, 9374) (250, 6)


In [19]:
# define model
inputs = keras.Input(shape=(9374, 1))
x = keras.layers.LSTM(128)(inputs)
outputs = keras.layers.Dense(6, activation="softmax")(x)
rnn_model = keras.Model(inputs=inputs, outputs=outputs, name="rnn_model")
rnn_model.summary()

Model: "rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 9374, 1)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 774       
Total params: 67,334
Trainable params: 67,334
Non-trainable params: 0
_________________________________________________________________


In [None]:
rnn_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)

### Training

In [20]:
rnn_model.fit(x_train, y_train, batch_size=4, epochs=3, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1a231eaee50>

### Eval

In [21]:
test_scores = rnn_model.evaluate(x_test, y_test, verbose=2)

print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

8/8 - 9s - loss: 0.2515 - accuracy: 0.9960
Test loss: 0.2515318989753723
Test accuracy: 0.9959999918937683
