In [67]:
import pandas as pd
import nltk
import torch
import tensorflow as tf
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn import utils
import simpletransformers
import seaborn as sns
import wandb
from simpletransformers.classification import ClassificationModel



In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

## EDA

In [None]:
df_train.head(2)

In [None]:
df_train.isnull().sum()

In [None]:
df_train.dtypes

In [None]:
df_train['target'].value_counts()

In [None]:
df_train.drop(['id', 'keyword', 'location'], axis=1, inplace=True)
df_test.drop(['id', 'keyword', 'location'], axis=1, inplace=True)

In [None]:
t_len = list(df_train['text'].apply(lambda x: len(x)))
print(max(t_len), min(t_len))

In [None]:
sns.countplot(df_train['target'])

## BERT

In [None]:
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True, manual_seed=42)
model = ClassificationModel(model_type='distilbert', model_name='distilbert-base-cased', 
                            use_cuda=False, num_labels=2, args=model_args)

In [None]:
%%time
model.train_model(train_df)

## Sequence model

In [3]:
labels = df_train['target']

In [59]:
leng = [len(x) for x in df_train['text']]
max(leng)

157

In [18]:
def create_tokenizer(df):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100000, 
                                                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                                                      lower=True,
                                                      split=' ', 
                                                      char_level=False, 
                                                      oov_token=None, 
                                                      document_count=0)

    tokenizer.fit_on_texts(df['text'])
    return tokenizer

def create_tokens(tokenizer, df):
    
    seq = np.array(tokenizer.texts_to_sequences(df['text']))
    vector = tf.keras.preprocessing.sequence.pad_sequences(seq, padding='post')
    vector = np.reshape(vector, (vector.shape[0], 1, vector.shape[1])).astype(float)
    
    return vector

In [19]:
tokenizer = create_tokenizer(df_train)
train_vector = create_tokens(tokenizer, df_train)
test_vector = create_tokens(tokenizer, df_test)
train_y = np.array(labels).astype(int)

In [20]:
test_vector = np.resize(test_vector,(test_vector.shape[0], 1, train_vector.shape[2]))

In [22]:
class_weights = utils.compute_class_weight('balanced', np.unique(train_y), train_y)
class_weights = dict(enumerate(class_weights))

In [44]:
tr_x, va_x, tr_y, va_y = train_test_split(train_vector, train_y, stratify=train_y, random_state=342)

In [24]:
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape = tr_x.shape[1:]),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.l2(0.00001))
])

In [25]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.RMSprop(1e-4, momentum=0.8),
              metrics=['accuracy'])

In [26]:
history = model.fit(tr_x, 
                    tr_y, 
                    epochs=100, 
                    class_weight=class_weights,
                    validation_data=(va_x, va_y), verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [36]:
test_preds = model.predict_classes(test_vector)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [39]:
test_preds_df = pd.DataFrame(test_preds)

In [52]:
test_preds_df.columns = ['Values']
test_preds_df['Values'].value_counts()

0    2037
1    1226
Name: Values, dtype: int64

In [60]:
#test_preds_df

In [57]:
df_test['target']=test_preds_df['Values']
df_save = df_test[['id', 'target']]
df_save.to_csv('result.csv',index=None)

## Zero shot classifier