In [16]:
import os
import numpy as np
import pandas as pd
import time
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from urllib.parse import urlparse 
from keras.models import Sequential

In [17]:
data = pd.read_csv("../input/train.csv", encoding = "ISO-8859-1")
df2 = data.assign(Data = data.Title.astype(str) + ' ' + data.TRANS_CONV_TEXT.astype(str) + ' ' + data.Source.astype(str))
df2 = df2.drop(df2.columns[[1,2,3,4,5,6]], axis = 1)


In [18]:
df2.head()

Unnamed: 0,Source,Patient_Tag,Data
0,FORUMS,0,Epstein I don't disagree with you in principle...
1,FORUMS,1,Enlarged Heart.Thread Enlarged Heart I am alwa...
2,BLOG,0,Queen Latifah Joins American Heart Association...
3,FORUMS,1,Bulaemia I am 17 and I have been throwing up f...
4,FORUMS,0,DIY Silver interconnects and RCAs??? Quote: Or...


In [19]:
def remove_punctuation(text):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
def remove_url(data):
    if data.startswith('www.'):
        data = re.sub(r'www.', '', data)
    if data.startswith('http.'):
        data = re.sub(r'http.', '', data)
    domain = data.split("//")[-1].split("/")[0]
    return domain


In [20]:
# df2['Data'] = df2['Data'].apply(remove_url)
df2['Data'] = df2['Data'].apply(remove_punctuation)

In [21]:
df2.head()

Unnamed: 0,Source,Patient_Tag,Data
0,FORUMS,0,Epstein I dont disagree with you in principle ...
1,FORUMS,1,Enlarged HeartThread Enlarged Heart I am alway...
2,BLOG,0,Queen Latifah Joins American Heart Association...
3,FORUMS,1,Bulaemia I am 17 and I have been throwing up f...
4,FORUMS,0,DIY Silver interconnects and RCAs Quote Origin...


In [22]:
df2 = df2.drop(df2.columns[[0]], axis = 1)

sw = stopwords.words('english')
print(sw)

def stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)
df2['Data'] = df2['Data'].apply(stopwords)
df2.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Unnamed: 0,Patient_Tag,Data
0,0,epstein dont disagree principle im saying entr...
1,1,enlarged heartthread enlarged heart always diz...
2,0,queen latifah joins american heart association...
3,1,bulaemia 17 throwing year nowalmost everyday t...
4,0,diy silver interconnects rcas quote originally...


In [23]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 
y = df2['Patient_Tag']
df2['Data'] = df2['Data'].apply(stemming)



In [24]:
df2.head()

Unnamed: 0,Patient_Tag,Data
0,0,epstein dont disagre principl im say entresto ...
1,1,enlarg heartthread enlarg heart alway dizzi ge...
2,0,queen latifah join american heart associ red s...
3,1,bulaemia 17 throw year nowalmost everyday thro...
4,0,diy silver interconnect rcas quot origin post ...


In [25]:
train_data, val_data = train_test_split(df2, test_size = 0.1, random_state = 2020)

#Text preprocessing
train_y = train_data['Patient_Tag'].values
val_y = val_data['Patient_Tag'].values
train_data = train_data.drop('Patient_Tag', axis = 1)
val_data = val_data.drop('Patient_Tag', axis = 1)
train_x = train_data["Data"].fillna("_na_").values
val_x = val_data["Data"].fillna("_na_").values


embed_size = 500
max_features = 95000
max_len = 1200


In [26]:
length = []
for x in train_data["Data"]:
    length.append(len(x.split()))
print("Maximum length of question text is: ",max(length))


Maximum length of question text is:  1561


In [27]:
tokenizer = Tokenizer(num_words = max_features, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~...=', lower=True) #Taken care of filters and lower case
tokenizer.fit_on_texts(train_x)
print(tokenizer.word_counts) 

OrderedDict([('stem', 113), ('cell', 324), ('therapi', 246), ('halv', 9), ('death', 345), ('heart', 2732), ('failur', 1240), ('im', 188), ('begin', 58), ('suspect', 25), ('even', 311), ('adult', 121), ('ive', 101), ('got', 81), ('read', 175), ('articl', 108), ('line', 69), ('appear', 90), ('add', 44), ('5', 141), ('year', 543), ('someon', 75), ('life', 271), ('see', 247), ('make', 330), ('big', 60), ('diff', 3), ('long', 145), ('run', 55), ('put', 111), ('inevit', 6), ('cure', 29), ('delay', 22), ('bit', 40), ('wont', 31), ('complain', 15), ('situationmayb', 2), ('could', 318), ('actual', 112), ('come', 156), ('someth', 106), ('work', 283), ('unlik', 22), ('forum', 507), ('nebraska', 1), ('beyond', 23), ('sunday', 47), ('58', 5), ('happi', 50), ('mother', 78), ('day', 358), ('enjoy', 34), ('sure', 76), ('nasti', 3), ('outsid', 23), ('today', 138), ('friend', 79), ('karen', 3), ('hard', 55), ('time', 535), ('breath', 156), ('warn', 41), ('go', 320), ('asthma', 37), ('get', 475), ('conge

In [28]:
train_x = tokenizer.texts_to_sequences(train_x)
val_x = tokenizer.texts_to_sequences(val_x)
train_x = pad_sequences(train_x, maxlen = max_len)
val_x = pad_sequences(val_x, maxlen = max_len)

In [29]:
mod = Sequential()
mod.add(layers.Embedding((max_features), embed_size))
mod.add(layers.Bidirectional(CuDNNGRU(64,return_sequences = True)))
mod.add(layers.Dense(64, activation='relu'))
mod.add(layers.GlobalMaxPool1D())
mod.add(layers.Dense(16, activation='relu'))
mod.add(layers.Dropout(0.1))
mod.add(layers.Dense(1, activation='sigmoid'))
mod.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
mod.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 500)         47500000  
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         217344    
_________________________________________________________________
dense_4 (Dense)              (None, None, 64)          8256      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 16)                1040      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total para

In [30]:
mod.fit(train_x, train_y, batch_size = 512, epochs = 20, validation_data = (val_x, val_y))

Train on 833 samples, validate on 93 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fcf8e13ac88>