In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training split and the unlabelled test split of the competition data.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Quick look at the first rows of the training data.
print(train.head(5))

In [None]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Print pandas' summary of the training frame to spot columns with missing values.
train.info()

In [None]:
# Show how often each distinct value occurs in the two categorical columns.
for column in ('location', 'keyword'):
    print(train[column].value_counts())

In [None]:
# Patterns are compiled once at definition time rather than on every call.
_URL_PATTERN = re.compile(r'https?://t\.co/[A-Za-z0-9]+')
_EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

# Lazily filled cache so the stop-word corpus is loaded once, not per tweet.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess(text):
    """Clean a tweet and split it into tokens.

    Steps, in order: lowercase, drop t.co links, drop characters in the
    emoji ranges, drop punctuation, drop digits, tokenize with NLTK and
    remove English stop words.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        A list of lowercase tokens with stop words removed. An empty or
        fully-stripped tweet yields an empty list.
    """
    text = text.lower()
    # Links must go first: once punctuation and digits are stripped,
    # "https://t.co/ab1" has become "httpstcoab" and can no longer be matched.
    text = _URL_PATTERN.sub('', text)
    text = _EMOJI_PATTERN.sub('', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    stop_words = _english_stop_words()
    return [token for token in word_tokenize(text) if token not in stop_words]

In [None]:
# Add a 'tokenized' column to both splits using the same cleaning routine.
for frame in (train, test):
    frame['tokenized'] = frame['text'].apply(preprocess)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

In [None]:
# Labels come straight from the training frame.
train_labels = train['target']

# Build the vocabulary from the training tokens only, then map both splits
# to integer id sequences with that same vocabulary.
tk = Tokenizer()
tk.fit_on_texts(train['tokenized'])
train_text = tk.texts_to_sequences(train['tokenized'])
test_text = tk.texts_to_sequences(test['tokenized'])

In [None]:
# Pad (and, for the test split, truncate) every sequence at the end so that
# all rows have the length of the longest training sequence.
maxlen = np.max([len(seq) for seq in train_text])
pad_options = dict(padding='post', truncating='post', maxlen=maxlen)

train_text = tf.keras.preprocessing.sequence.pad_sequences(train_text, **pad_options)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, **pad_options)

In [None]:
# Wrap the padded id matrices in DataFrames so further feature columns can be
# attached next to the token ids.
# The former fillna({'keyword': ''}) calls were removed: these frames only have
# positional integer columns, so there was no 'keyword' column to fill and the
# calls silently did nothing.
train_text = pd.DataFrame(train_text)
test_text = pd.DataFrame(test_text)

In [None]:
# Attach the keyword of each tweet; missing keywords become '' so they get
# their own indicator column instead of an all-zero row.
train_text['keyword'] = train['keyword'].fillna('')
test_text['keyword'] = test['keyword'].fillna('')

# One-hot encode the keyword. The dtype is pinned to a numeric type because
# pandas 2.x defaults to bool, which does not mix with the integer token ids
# into a single numeric array.
train_text = pd.get_dummies(train_text, columns=['keyword'], dtype='uint8')
test_text = pd.get_dummies(test_text, columns=['keyword'], dtype='uint8')

# The two splits are encoded independently, so force the test frame onto the
# training columns (same set, same order); keywords unseen in test become 0.
test_text = test_text.reindex(columns=train_text.columns, fill_value=0)


In [None]:
from sklearn.model_selection import train_test_split
# from sklearn.tree import DecisionTreeClassifier as RFC
from sklearn.metrics import f1_score
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

In [None]:
# Hold out 20% of the labelled rows; y_test keeps the TRUE labels so the
# holdout can be scored after training.
X_train, X_test, y_train, y_test = train_test_split(
    train_text, train_labels, test_size=0.2, random_state=20)

EPOCHS = 100
BATCH_SIZE = 512
UNITS = 64  # not used by the dense-only model below
PRED_THRESHOLD = 0.5  # sigmoid output above this is labelled 1

# Plain feed-forward binary classifier over the feature columns.
model = models.Sequential([
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# restore_best_weights: when training stops early, predict with the best
# validation epoch rather than the weights from `patience` epochs later.
early_stopping = EarlyStopping(patience=13, restore_best_weights=True, verbose=1)
checkpoint = ModelCheckpoint('model.h5', save_best_only=True, verbose=1)
lr_reduce = ReduceLROnPlateau(patience=5, verbose=1)

model.compile(
    optimizer=tf.keras.optimizers.Adam(amsgrad=True),
    loss='binary_crossentropy',
    metrics=['acc'],
)

history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping, checkpoint, lr_reduce],
    verbose=0,
)

# Sequential.predict_classes no longer exists in current Keras; threshold the
# sigmoid probabilities instead. ravel() gives 1-D label vectors.
holdout_pred = (model.predict(X_test) > PRED_THRESHOLD).astype('int32').ravel()
print('holdout F1:', f1_score(y_test, holdout_pred))

# Labels for the competition test set.
y = (model.predict(test_text) > PRED_THRESHOLD).astype('int32').ravel()


In [None]:
# Sequential.predict_classes no longer exists in current Keras; threshold the
# sigmoid probabilities instead. The holdout predictions get their own name
# so the true labels in y_test are not overwritten.
y_test_pred = (model.predict(X_test) > 0.5).astype('int32').ravel()
y = (model.predict(test_text) > 0.5).astype('int32').ravel()
print(y_test_pred.shape, y.shape)

In [None]:
# Class balance of the test-set predictions. `y` is used because no `pred`
# variable is defined, and the result is printed because a bare expression
# that is not the last line of a cell is not displayed.
print(np.unique(y, return_counts=True))
# The prediction count must match the number of test ids for the submission.
print(y.shape, test['id'].shape)


In [None]:
# Build the submission table. The predictions are flattened to 1-D because a
# DataFrame column cannot be built from an (n, 1) array.
submission = pd.DataFrame({
    'id': test['id'],
    'target': np.ravel(y).astype(int),
})
submission.to_csv("./out.csv", index=False)