In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from sklearn import model_selection, metrics, preprocessing, ensemble, model_selection, metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Dropout, Input, SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sonali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [4]:
print("No. of missing data for column keyword: ", train['keyword'].isna().sum())
print("No. of missing data for column location: ", train['location'].isna().sum())
print("No. of missing data for column text: ", train['text'].isna().sum())
print("No. of missing data for column target: ", train['target'].isna().sum())

No. of missing data for column keyword:  61
No. of missing data for column location:  2533
No. of missing data for column text:  0
No. of missing data for column target:  0


In [6]:
# Clean text columns
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')


def clean_text(each_text):

    # remove URL from text
    each_text_no_url = re.sub(r"http\S+", "", each_text)
    
    # remove numbers from text
    text_no_num = re.sub(r'\d+', '', each_text_no_url)

    # tokenize each text
    word_tokens = word_tokenize(text_no_num)
    
    # remove sptial character
    clean_text = []
    for word in word_tokens:
        clean_text.append("".join([e for e in word if e.isalnum()]))

    # remove stop words and lower
    text_with_no_stop_word = [w.lower() for w in clean_text if not w in stop_words]  

    # do stemming
    stemmed_text = [stemmer.stem(w) for w in text_with_no_stop_word]
    
    return " ".join(" ".join(stemmed_text).split())


train['clean_text'] = train['text'].apply(lambda x: clean_text(x) )
train['keyword'] = train['keyword'].fillna("none")
train['clean_keyword'] = train['keyword'].apply(lambda x: clean_text(x) )

# Combining 'clean_keyword' and 'clean_text' into one column
train['keyword_text'] = train['clean_keyword'] + " " + train["clean_text"]

In [7]:
feature = 'keyword_text'
label = "target"

# split train and test
X_train, X_test,y_train, y_test = model_selection.train_test_split(train[feature],
                                                                   train[label],
                                                                   test_size=0.3,
                                                                   random_state=0, 
                                                                   shuffle=True)

In [8]:
X_train_GBC = X_train.values.reshape(-1)
x_test_GBC = X_test.values.reshape(-1)

# Vectorizing text
vectorizer = CountVectorizer()
X_train_GBC = vectorizer.fit_transform(X_train_GBC)
x_test_GBC = vectorizer.transform(x_test_GBC)

In [9]:
# Train the model
model = ensemble.GradientBoostingClassifier(learning_rate=0.1,                                            
                                            n_estimators=2000,
                                            max_depth=9,
                                            min_samples_split=6,
                                            min_samples_leaf=2,
                                            max_features=8,
                                            subsample=0.9)
model.fit(X_train_GBC, y_train)

GradientBoostingClassifier(max_depth=9, max_features=8, min_samples_leaf=2,
                           min_samples_split=6, n_estimators=2000,
                           subsample=0.9)

In [10]:
# Evaluate the model
predicted_prob = model.predict_proba(x_test_GBC)[:,1]
predicted = model.predict(x_test_GBC)

accuracy = metrics.accuracy_score(predicted, y_test)
print("Test accuracy: ", accuracy)
print(metrics.classification_report(y_test, predicted, target_names=["0", "1"]))
print("Test F-scoare: ", metrics.f1_score(y_test, predicted))

Test accuracy:  0.8025394045534151
              precision    recall  f1-score   support

           0       0.79      0.89      0.84      1338
           1       0.82      0.67      0.74       946

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.80      0.80      0.80      2284

Test F-scoare:  0.7385507246376811


In [11]:
test['clean_text'] = test['text'].apply(lambda x: clean_text(x) )
test['keyword'] = test['keyword'].fillna("none")
test['clean_keyword'] = test['keyword'].apply(lambda x: clean_text(x) )

# Combining 'clean_keyword' and 'clean_text' into one column
test['keyword_text'] = test['clean_keyword'] + " " + test["clean_text"]

In [12]:
train_GBC = train[feature].values.reshape(-1)
test_GBC = test[feature].values.reshape(-1)

# Vectorizing text
vectorizer = CountVectorizer()
train_GBC = vectorizer.fit_transform(train_GBC)
test_GBC = vectorizer.transform(test_GBC)

In [14]:
# Train the model
modelGb = ensemble.GradientBoostingClassifier(learning_rate=0.1,                                            
                                            n_estimators=2000,
                                            max_depth=9,
                                            min_samples_split=6,
                                            min_samples_leaf=2,
                                            max_features=8,
                                            subsample=0.9)
modelGb.fit(train_GBC, train[label])

GradientBoostingClassifier(max_depth=9, max_features=8, min_samples_leaf=2,
                           min_samples_split=6, n_estimators=2000,
                           subsample=0.9)

In [15]:
predictedGb = modelGb.predict(test_GBC)

sub_sample = pd.read_csv('sample_submission.csv')
submit = sub_sample.copy()
submit.target = predictedGb
submit.to_csv('sample_submission.csv',index=False)

### LSTM-RNN

In [51]:
# Some hyperparameters
path_to_glove_file = './glove.6B/glove.6B.300d.txt' #link: http://nlp.stanford.edu/data/glove.6B.zip
embedding_dim = 300
learning_rate = 1e-3
batch_size = 1024
epochs = 20
sequence_len = 100

In [52]:
# Define train and test labels
# y_train_LSTM = y_train.values.reshape(-1,1)
# y_test_LSTM = y_test.values.reshape(-1,1)

# print("Training Y shape:", y_train_LSTM.shape)
# print("Testing Y shape:", y_test_LSTM.shape)

train_LSTM = train[label].values.reshape(-1,1)

print("Training Y shape:", train_LSTM.shape)

Training Y shape: (7613, 1)


In [53]:
# Tokenize train data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print("Vocabulary Size: ", vocab_size)

Vocabulary Size:  7


In [54]:
# Pad train and test 
X_trainl = pad_sequences(tokenizer.texts_to_sequences(train[feature]), maxlen=sequence_len)
X_testl = pad_sequences(tokenizer.texts_to_sequences(test[feature]), maxlen=sequence_len)

print("Training X shape: ", X_trainl.shape)
print("Testing X shape: ", X_testl.shape)

Training X shape:  (7613, 100)
Testing X shape:  (3263, 100)


In [55]:
# Read word embeddings
embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [57]:
# Define embedding layer in Keras
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                            embedding_dim,
                                            weights=[embedding_matrix],
                                            input_length=sequence_len,
                                            trainable=False)

In [58]:
# Define model architecture
sequence_input = Input(shape=(sequence_len, ), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

x = Conv1D(128, 5, activation='relu')(embedding_sequences)
x = Bidirectional(LSTM(128, dropout=0.5, recurrent_dropout=0.2))(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)
modelLSTM = Model(sequence_input, outputs)
modelLSTM.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 100, 300)          2100      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 96, 128)           192128    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_6 (Dense)              (None, 512)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               2626

In [59]:
# Optimize the model
modelLSTM.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

In [60]:
# Train the LSTM Model
history = modelLSTM.fit(X_trainl,
                    train[label],
                    batch_size=batch_size,
                    epochs=epochs)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [61]:
# Evaluate the model
predictedl = modelLSTM.predict(X_testl, verbose=1, batch_size=10000)

y_predictedl = [1 if each > 0.5 else 0 for each in predictedl]

submit = sub_sample.copy()
submit.target = y_predictedl
submit.to_csv('sample_submission.csv',index=False)

