In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split

In [5]:
# Read CSV
df_tweets = pd.read_csv('C:/Users/VovAz/OneDrive/Desktop/Проект/Disaster/train.csv')

In [6]:
# Downloading BERT
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [7]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])#reduce overfitting
l = tf.keras.layers.Dense(8, activation='LeakyReLU')(l)
l = tf.keras.layers.Dense(3, activation='softmax')(l)
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)


# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [8]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [9]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [10]:
def clean_text(text):
    text=str(text).lower() #Converts text to lowercase
    text=re.sub('\d+', '', text) #removes numbers
    text=re.sub('\[.*?\]', '', text) #removes HTML tags
    text=re.sub('https?://\S+|www\.\S+', '', text) #removes url
    text=re.sub(r"["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", "", text) #removes emojis
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text
df_tweets['clean_text']=df_tweets['text'].apply(clean_text)

X_train, X_test, y_train, y_test = train_test_split(df_tweets['clean_text'],df_tweets['target'],test_size= 0.2, stratify=df_tweets['target'])

In [11]:
model.fit(X_train, y_train, epochs=5) # due low perfmance leave 5 epochs

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x21d7a02be80>

In [12]:
model.evaluate(X_test, y_test)



[0.5108765959739685, 0.7780696153640747]

In [13]:
df_test=pd.read_csv('C:/Users/VovAz/OneDrive/Desktop/Проект/Disaster/test.csv')
df_test['clean_text']=df_test['text'].apply(clean_text)
pred = model.predict(df_test['clean_text'])



In [14]:
pred = pd.DataFrame(pred) #concat submission
submission = df_test[['id']].reset_index(drop=True)
submission['target'] = pred[0].apply(lambda x: 1 if x >= 0.5 else 0)# transform model proba to 1 and 0
submission['target']

0       1
1       0
2       1
3       1
4       1
       ..
3258    0
3259    1
3260    1
3261    1
3262    1
Name: target, Length: 3263, dtype: int64

In [15]:
submission.to_csv('C:/Users/VovAz/OneDrive/Desktop/Проект/Disaster/submission.csv', index=False)#save to CSV