In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Disaster tweets : 

## Description du concours

Twitter est devenu un canal de communication important en cas d'urgence.
L'omniprésence des smartphones permet aux gens de signaler en temps réel une urgence qu'ils observent. Pour cette raison, de plus en plus d'organismes (par exemple les organisations de secours en cas de catastrophe et les agences de presse) s'intéressent à la surveillance programmatique de Twitter.

Cependant, il n'est pas toujours évident de savoir si les mots d'une personne annoncent réellement une catastrophe.

**Prenons cet exemple** : 
 
L'auteur utilise explicitement le mot « ABLAZE », mais il s'agit d'une métaphore dans ce contexte. C'est immédiatement évident pour un être humain, surtout avec l'aide visuelle, mais beaucoup moins pour une machine.

Dans ce concours, vous êtes mis au défi de créer un modèle d'apprentissage automatique qui prédit quels tweets concernent de véritables catastrophes et lesquels n'en concernent pas.


## 1) Présentation du dataset "train" et récupération du modèle BERT et de ses modules

In [None]:
import pandas as pd

# Load the training set from Drive and preview its first rows.
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv")
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Count missing values per column. isnull().sum() counts the nulls directly,
# without building a filtered copy of the frame for every column and without
# the redundant `== True` comparison.
dico_isnull = {c: int(train[c].isnull().sum()) for c in train.columns}
dico_isnull

{'id': 0, 'keyword': 61, 'location': 2533, 'text': 0, 'target': 0}

In [None]:
import tensorflow_hub as hub

# TF Hub handle of the BERT module loaded below.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"


# Wrap the hub module as a Keras layer; trainable=True allows its weights to be
# updated when the model is fitted.
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
# Clone a helper repository; all command output is discarded.
# NOTE(review): nothing from this clone is referenced in the visible cells — confirm it is still needed.
!git clone https://github.com/tkeldenich/BERT_Easy_Implementation &> /dev/null

In [None]:
# Download tokenization.py from the tensorflow/models repository (quiet mode).
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
# Install sentencepiece; all command output is discarded.
!pip install sentencepiece &> /dev/null

In [None]:
# NOTE(review): a tokenization.py file is also downloaded with wget in another cell;
# presumably only one source of the `tokenization` module is needed — confirm which
# one `import tokenization` is meant to pick up.
!pip install tokenization

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# NOTE(review): the later `from bert import bert_tokenization` presumably relies on
# bert-for-tf2 (installed in a later cell) rather than on this package — confirm
# that this separate `bert` install is required.
!pip install bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2) Preprocessing BERT


In [None]:
import tensorflow as tf

In [None]:
import tokenization
import numpy as np



def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into the three fixed-length integer arrays fed to BERT.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded sequence, special tokens included.
            Must be at least 2 so that ``[CLS]`` and ``[SEP]`` always fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens and
        0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With max_len < 2 the truncation slice and the pad length go negative,
        # which would silently yield rows of inconsistent length.
        raise ValueError("max_len must be >= 2 to hold the [CLS] and [SEP] tokens")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Keep room for the two special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit dtype and shape so that an empty `texts` still yields a
        # (0, max_len) integer array rather than a 1-D float array.
        return np.array(rows, dtype=np.int_).reshape(len(rows), max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [None]:
# Install bert-for-tf2; presumably the provider of the `bert.bert_tokenization`
# module imported in the next cell — confirm.
!pip install bert-for-tf2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from bert import bert_tokenization

In [None]:
# Read the vocabulary file path and the lower-casing flag exposed by the loaded
# hub module, then build the tokenizer from them.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode every training tweet with max_len=100; train_input is the
# (token ids, masks, segment ids) tuple returned by bert_encode.
train_input = bert_encode(train.text.values, tokenizer, max_len=100)

# Values of the `target` column, used as training labels.
train_labels = train.target.values

## 3) Création du modèle de prédiction associé à BERT


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint


def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    The model takes the three BERT inputs (word ids, mask, segment ids), keeps
    the sequence output at position 0, applies dropout and ends with a single
    sigmoid unit.

    Args:
        bert_layer: Callable Keras layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Length of the input sequences; must match the ``max_len``
            used when encoding the texts.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` with binary cross-entropy loss and an
        accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep only the vector at sequence position 0 as the sentence representation.
    clf_output = sequence_output[:, 0, :]
    drop1 = Dropout(0.1)(clf_output)
    out = Dense(1, activation='sigmoid')(drop1)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `lr` is a deprecated alias; `learning_rate` is the supported keyword.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Build the classifier with the same max_len (100) that was used to encode the inputs.
model = build_model(bert_layer, max_len=100)


The `lr` argument is deprecated, use `learning_rate` instead.



## 4) Entraînement du modèle de prédiction sur les données train

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Path template for the checkpoint files; `{epoch}` is filled in with the epoch number.
# NOTE(review): the `00d` format spec adds no zero padding (files are named
# cp-1.ckpt, cp-2.ckpt, ...) — confirm that this is the intended naming.
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-{epoch:00d}.ckpt"
# Callback that saves only the model's weights. With an integer save_freq the
# save is triggered every 30 batches, i.e. several times per epoch to the same
# per-epoch file name, not once at the end of each epoch.
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path, 
    verbose=1,
    save_weights_only=True,
   save_freq=30)


# Write the model's current weights to the checkpoint file named for epoch 4.
# NOTE(review): in notebook order this precedes the fit() cells, so the file holds
# whatever weights the model has at this point — confirm this is intended.
model.save_weights(checkpoint_path.format(epoch=4))



In [None]:
# Restore weights from a checkpoint in the training_1_tweet folder.
# NOTE(review): checkpoint_path in this notebook points to training_2_tweet, so this
# file presumably comes from an earlier run and must already exist on Drive — confirm.
model.load_weights("/content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f0e84f51150>

In [None]:
# Fine-tune for 4 epochs with batches of 32, holding out 20% of the samples for
# validation and saving checkpoints through cp_callback.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=4, 
    batch_size= 32,callbacks=[cp_callback])

Epoch 1/4
 29/191 [===>..........................] - ETA: 1:39 - loss: 0.6571 - accuracy: 0.6099
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-1.ckpt
Epoch 2/4
 18/191 [=>............................] - ETA: 1:53 - loss: 0.4018 - accuracy: 0.8281
Epoch 2: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-2.ckpt
Epoch 2: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_tweet/cp-2.ckpt
Epoch 2: saving model to /content/drive/MyDrive/Colab Notebooks/training_1_twe

In [None]:
# Train for one more epoch with the same settings.
# NOTE(review): this overwrites `train_history` from the previous fit() cell.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=1, 
    batch_size= 32,callbacks=[cp_callback])


 29/191 [===>..........................] - ETA: 1:41 - loss: 0.4240 - accuracy: 0.8157
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt
Epoch 1: saving model to /content/drive/MyDrive/Colab Notebooks/training_2_tweet/cp-1.ckpt


## 5) Evaluation du modèle de prédiction


In [None]:
# Compute the loss and accuracy of the model on the training data.
# NOTE(review): train_input is the same data passed to fit(), so this is not a held-out score.
model.evaluate(train_input, train_labels)



[0.3554011583328247, 0.8495993614196777]

## 6) Application du modèle aux données de test



In [None]:
# Load the test set from Drive and preview its first rows.
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Count missing values per column of the test set. isnull().sum() counts the
# nulls directly, without building a filtered copy of the frame for every
# column and without the redundant `== True` comparison.
dico_isnull_test = {c: int(test[c].isnull().sum()) for c in test.columns}
dico_isnull_test

{'id': 0, 'keyword': 26, 'location': 1105, 'text': 0}

###  a) Prétraitement du texte des données de test

In [None]:

# Encode the test tweets with the same tokenizer and max_len (100) as the training data.
test_input = bert_encode(test.text.values, tokenizer, max_len=100)


###  b) Prédiction : le tweet annonce-t-il une catastrophe (label : 1) ou non (label : 0) ?

In [None]:
 # Run the model on the encoded test tweets; the sigmoid output is a value between 0 and 1 per tweet.
 pred = model.predict(test_input)



In [None]:
pred

array([[0.8287145 ],
       [0.49941424],
       [0.8506349 ],
       ...,
       [0.94241273],
       [0.8660333 ],
       [0.87788415]], dtype=float32)

In [None]:
# Store the model outputs in a new `bert_prediction` column of the test frame.
test["bert_prediction"]= pred

In [None]:
test

Unnamed: 0,id,keyword,location,text,bert_prediction
0,0,,,Just happened a terrible car crash,0.828714
1,2,,,"Heard about #earthquake is different cities, s...",0.499414
2,3,,,"there is a forest fire at spot pond, geese are...",0.850635
3,9,,,Apocalypse lighting. #Spokane #wildfires,0.966417
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0.881805
...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0.227135
3259,10865,,,Storm in RI worse than last hurricane. My city...,0.975199
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,0.942413
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,0.866033


#### Conversion des probabilités en labels de prédiction

In [None]:
def prediction(x, threshold=0.50):
  """Convert a predicted probability into a binary label.

  Args:
    x: Predicted probability.
    threshold: Decision cut-off; defaults to 0.50.

  Returns:
    1 if ``x`` is strictly greater than ``threshold``, otherwise 0.
  """
  return 1 if x > threshold else 0

In [None]:
# Turn each probability into a 0/1 label; the helper is passed directly to
# apply() instead of through a redundant lambda wrapper.
pred_labels_bert = test["bert_prediction"].apply(prediction)
pred_labels_bert

0       1
1       0
2       1
3       1
4       1
       ..
3258    0
3259    1
3260    1
3261    1
3262    1
Name: bert_prediction, Length: 3263, dtype: int64

In [None]:
# Store the 0/1 labels in a `target` column of the test frame.
test["target"] = pred_labels_bert 

In [None]:
test

Unnamed: 0,id,keyword,location,text,bert_prediction,target
0,0,,,Just happened a terrible car crash,0.828714,1
1,2,,,"Heard about #earthquake is different cities, s...",0.499414,0
2,3,,,"there is a forest fire at spot pond, geese are...",0.850635,1
3,9,,,Apocalypse lighting. #Spokane #wildfires,0.966417,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0.881805,1
...,...,...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0.227135,0
3259,10865,,,Storm in RI worse than last hurricane. My city...,0.975199,1
3260,10868,,,Green Line derailment in Chicago http://t.co/U...,0.942413,1
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,0.866033,1


## 7) Soumission des résultats sous forme csv

In [None]:
# Keep only the two columns written to the submission file: the tweet id and its predicted label.
sample_submission_bert = test[["id","target"]]
sample_submission_bert

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
# Write the submission to Drive; index=False keeps the DataFrame index out of the file.
sample_submission_bert.to_csv('/content/drive/MyDrive/Colab Notebooks/sample_submission_bert.csv',index = False) 

In [None]:
# Read the file back to check what was written.
pd.read_csv("/content/drive/MyDrive/Colab Notebooks/sample_submission_bert.csv")

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


## 8) Sauvegarde du modèle

In [None]:
# Save the full model to Drive.
# NOTE(review): the ".h2" extension looks like a typo for ".h5"; the on-disk format
# Keras picks may depend on the extension — confirm the intended file name and format.
model.save("/content/drive/MyDrive/Colab Notebooks/disaster_tweets.h2")

