In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import time
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import classification_report
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

In [2]:
# Read the competition's train / test / sample-submission CSVs in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nlp-getting-started/train.csv",
        "/kaggle/input/nlp-getting-started/test.csv",
        "/kaggle/input/nlp-getting-started/sample_submission.csv",
    )
)

In [3]:
# Pull the `text` and `target` columns out as plain NumPy arrays
# (the test frame only contributes its `text` column here).
train_data = train["text"].values
train_labels = train["target"].values
test_data = test["text"].values

In [4]:
# Token-count features with scikit-learn's default CountVectorizer settings.
count_vectorizer = feature_extraction.text.CountVectorizer()

# The vocabulary is learned from the training text only; the test text is
# then encoded against that same vocabulary so both matrices share columns.
train_vectors = count_vectorizer.fit_transform(train_data)
test_vectors = count_vectorizer.transform(test_data)

In [5]:
# Hold-out split of the count features; random_state fixes the shuffle.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for fitting the
# scikit-learn models below -- confirm this is intended and not a swapped 0.3.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    train_vectors,
    train_labels,
    test_size=0.7,
    random_state=5,
)

In [6]:
%%time
#Logistic regression

params = {'C': [0.72],}

lr = LogisticRegression()
lrgrid = model_selection.GridSearchCV(lr, param_grid=params, cv=3, verbose=1, n_jobs=-1)
lrgrid.fit(X_train, y_train)

print(lrgrid.best_params_)
print(lrgrid.best_score_)

lr_pred = lrgrid.predict(X_test)
print(classification_report(y_test, lr_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


{'C': 0.72}
0.7555847568988173
              precision    recall  f1-score   support

           0       0.78      0.87      0.82      3024
           1       0.80      0.68      0.74      2306

    accuracy                           0.79      5330
   macro avg       0.79      0.78      0.78      5330
weighted avg       0.79      0.79      0.79      5330

CPU times: user 222 ms, sys: 239 ms, total: 461 ms
Wall time: 2.64 s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.5s finished


In [7]:
%%time
#Bagging

params = {'n_estimators': [100],
         'max_features': [1.0],
         'max_samples': [1.0],}

blr = BaggingClassifier(base_estimator=LogisticRegression(C=0.72))
blrgrid = model_selection.GridSearchCV(blr, param_grid=params, cv=3, verbose=1, n_jobs=-1)
blrgrid.fit(X_train, y_train)

print(blrgrid.best_params_)
print(blrgrid.best_score_)

blr_pred = blrgrid.predict(X_test)
print(classification_report(y_test, blr_pred))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.8s finished


{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
0.7529566360052562
              precision    recall  f1-score   support

           0       0.77      0.88      0.83      3024
           1       0.81      0.66      0.73      2306

    accuracy                           0.79      5330
   macro avg       0.79      0.77      0.78      5330
weighted avg       0.79      0.79      0.78      5330

CPU times: user 11.5 s, sys: 11.4 s, total: 22.9 s
Wall time: 10.6 s


In [8]:
%%time
#encoder
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/4'
embed = hub.KerasLayer(module_url, trainable=True, name='USE_embedding')

CPU times: user 1min, sys: 8.04 s, total: 1min 8s
Wall time: 1min 11s


In [9]:
#Embedded model
def build_model(embed, learning_rate=2e-5):
    """Build and compile a binary classifier on top of a text-embedding layer.

    The network takes one scalar string per example, passes it through
    ``embed`` and maps the result to a single sigmoid unit.

    Args:
        embed: Keras-compatible layer that accepts a batch of strings (the
            model input is declared with ``shape=[]`` and ``dtype=tf.string``).
        learning_rate: Step size for the Adam optimizer. Defaults to 2e-5,
            the value this notebook has always used.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss and
        reporting accuracy.
    """
    model = Sequential([
        Input(shape=[], dtype=tf.string),
        embed,
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [10]:
#Non-Embedded Model
def build_model2(input_dim=21637, learning_rate=2e-5):
    """Build and compile a single-layer sigmoid classifier on dense features.

    Args:
        input_dim: Number of input features per example. The default, 21637,
            is the width this notebook was written for; pass
            ``train_vectors.shape[1]`` if the vectorizer's vocabulary changes,
            otherwise ``fit`` will reject the input shape.
        learning_rate: Step size for the Adam optimizer. Defaults to 2e-5,
            the value this notebook has always used.

    Returns:
        A compiled ``Sequential`` model (one ``Dense(1, sigmoid)`` layer)
        using binary cross-entropy loss and reporting accuracy.
    """
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [11]:
# Instantiate both networks first (same construction order as before, so the
# auto-generated layer/model names are unchanged), then print their summaries.
model = build_model(embed)
model2 = build_model2()

model.summary()
model2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE_embedding (KerasLayer)   {'outputs': (None, 512)}  147354880 
_________________________________________________________________
dense (Dense)                (None, 1)                 513       
Total params: 147,355,393
Trainable params: 147,355,393
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 21638     
Total params: 21,638
Trainable params: 21,638
Non-trainable params: 0
_________________________________________________________________


In [12]:
%%time
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    train_data, train_labels,
    validation_split=0.25,
    epochs=4,
    callbacks=[checkpoint],
    batch_size=32
)

Train on 5709 samples, validate on 1904 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 53min 3s, sys: 2min 14s, total: 55min 18s
Wall time: 16min 48s


In [13]:
%%time
checkpoint = ModelCheckpoint('model2.h5', monitor='val_loss', save_best_only=True)

train_history = model2.fit(
    train_vectors.toarray(), train_labels,
    validation_split=0.25,
    epochs=4,
    callbacks=[checkpoint],
    batch_size=32
)

Train on 5709 samples, validate on 1904 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
CPU times: user 8.92 s, sys: 2.11 s, total: 11 s
Wall time: 6.4 s


In [14]:
#Preds
# Restore the checkpointed (lowest validation loss) weights before predicting.
model.load_weights('model.h5')
model2.load_weights('model2.h5')

# Keras models output sigmoid probabilities; the scikit-learn searches
# output class labels directly.
test_pred = model.predict(test_data)
# Cast the sparse counts to float32 before densifying to halve the size of
# the temporary dense array (the default count dtype is int64).
test_pred2 = model2.predict(test_vectors.astype(np.float32).toarray())
test_pred_lr = lrgrid.predict(test_vectors)
test_pred_blr = blrgrid.predict(test_vectors)

In [15]:
#Submits
# One submission file per model, written in the same order as before. Each
# pass overwrites submission['target'], so the frame ends up holding the
# bagging predictions. Rounding turns sigmoid probabilities into 0/1 labels
# and is a no-op for the scikit-learn label predictions.
submission_runs = (
    (test_pred, 'submission.csv'),          #F-score 0.81595
    (test_pred2, 'submission2.csv'),        #F-score 0.63394
    (test_pred_lr, 'submissionlr.csv'),     #F-score 0.77811
    (test_pred_blr, 'submissionblr.csv'),   #F-score 0.77402
)
for predictions, csv_name in submission_runs:
    submission['target'] = predictions.round().astype(int)
    submission.to_csv(csv_name, index=False)