# Détectez les bad buzz grâce au Deep Learning

In [1]:
import warnings
warnings.simplefilter(action='ignore')
# basic libs
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import unidecode
import gc
from io import StringIO
from timeit import default_timer as timer
# text preprocessing
import nltk
from sklearn.feature_extraction.text import CountVectorizer
# deeplearning
import tensorflow as tf
import gensim
from gensim.models import Word2Vec

# Display DataFrames in full: no row/column limit, auto-detected width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "never truncate cell text". The legacy -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Run-mode flag read by later cells: a falsy value (0) makes the
# `if not trained:` cells encode the data and train the model, while 1 makes
# the history cell re-plot the saved learning curves.
trained = 0

# Truthy skips the preprocessing cell; later cells then read the CSV files that
# cell would have written.
done_preprocessing = 1

# When preprocessing has to run, `trained` becomes 2: truthy, so the training
# cells are skipped, and different from 1, so the saved history is not plotted.
if not done_preprocessing:
    trained = 2

### Text preprocessing

On commence par tokéniser le texte (on ne garde que les mots, en supprimant la ponctuation, les liens, les nombres, etc.).
On retire également les "stop-words", c'est-à-dire tous les articles, déterminants, pronoms et mots de liaison.

On trie le vocabulaire en utilisant comme référence les mots de l'embedding GloVe entraîné par l'Université de Stanford sur un dataset également issu de Twitter. Ce traitement permet d'éliminer efficacement les tweets pauvres, en se passant du moins possible de mots porteurs de sens.

On applique ensuite deux traitements différents séparément pour comparer leurs performances: la lemmatisation (qui garde seulement la forme canonique des mots, par exemple le féminin singulier) et le stemming (qui garde uniquement le radical des mots). 

In [23]:
# Number of rows held out for the test split and for the validation split.
test_size = 20000
val_size = 20000

# Maximum number of tokens kept per cleaned text; longer texts are truncated.
max_words = 25

# Word indices at or above this bound are skipped when the embedding matrix is
# filled (their rows stay at zero).
NUM_WORDS = 71243

In [3]:
# The pretrained vectors are used both to filter the vocabulary during
# preprocessing and to build the embedding matrix before training, so they are
# loaded whenever either of those stages is going to run.
if not trained or not done_preprocessing:
    # Binary word2vec-format file (the filename indicates the GoogleNews 300-d vectors).
    word_vectors = gensim.models.KeyedVectors.load_word2vec_format('./models/GoogleNews-vectors-negative300.bin', binary=True)
    # Width of each embedding row; presumably matches the loaded vectors — TODO confirm.
    EMBEDDING_DIM=300

In [4]:
def update_target(tgt):
    """Collapse a raw label to a binary class: 1 for any truthy value, else 0."""
    return int(bool(tgt))

def clean_up(text):
    """Normalise one raw text into a space-separated string of kept tokens.

    Relies on the module-level ``tokenizer``, ``stop_words``, ``word_vectors``
    and ``max_words`` being defined before it is called.

    Steps: lowercase, blank out the ``&quot;`` / ``&amp;`` entities, delete
    hyphens, drop every whitespace-separated word containing ``@``, ``/`` or
    ``www``, transliterate to ASCII, tokenise, keep only tokens that are not
    stop words and are present in ``word_vectors``, then truncate to
    ``max_words`` tokens.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when fewer than two
        tokens survive.
    """
    lowered = text.lower()
    for entity in ('&quot;', '&amp;'):
        lowered = lowered.replace(entity, ' ')
    lowered = lowered.replace('-', '')

    # Discard mentions, links and anything that looks like a web address.
    banned_markers = ('@', '/', 'www')
    kept_words = [
        word for word in lowered.split()
        if not any(marker in word for marker in banned_markers)
    ]

    ascii_text = unidecode.unidecode(' '.join(kept_words))
    kept_tokens = [
        token for token in tokenizer.tokenize(ascii_text)
        if token not in stop_words and token in word_vectors
    ]
    kept_tokens = kept_tokens[:max_words]

    if len(kept_tokens) <= 1:
        return ''
    return ' '.join(kept_tokens)
    
# Full preprocessing pass: clean the raw texts, fit and save a CountVectorizer
# on them, binarise the target, shuffle, and write train/val/test CSV files.
# Skipped when `done_preprocessing` is truthy.
if not done_preprocessing:
    # The raw CSV is read with explicit column names; only label and text are kept.
    train_df = pd.read_csv('./data/dataset.csv', names=['target', 'id', 'date', 'flag', 'user', 'text'], encoding='latin-1')
    train_df = train_df[['target', 'text']]
    
    # Globals consumed by clean_up(): word tokenizer and English stop-word list.
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    stop_words = list(nltk.corpus.stopwords.words('english'))
    
    train_df["text"] = train_df["text"].apply(clean_up)
    
    # clean_up() returns '' for texts left with fewer than two tokens; drop those rows.
    train_df = train_df[train_df['text'] != ""]
    
    # Fit a bag-of-words vectoriser on the cleaned texts and persist it;
    # save_vocabulary() reloads this file later.
    vanilla_vectoriser = CountVectorizer()
    vanilla_vectoriser.fit(train_df["text"].to_list())
    joblib.dump(vanilla_vectoriser, "./models/w2v_vectoriser.joblib")
    
    # Map every non-zero label to 1 and zero to 0.
    train_df["target"] = train_df["target"].apply(update_target)
    
    # Shuffle all rows. No random_state is given, so the split differs between runs.
    train_df = train_df.sample(frac=1)
    
    # First `test_size` rows -> test, next `val_size` rows -> validation, rest -> train.
    test_df = train_df.head(test_size)
    train_df = train_df.tail(train_df.shape[0] - test_size)
    val_df = train_df.head(val_size)
    train_df = train_df.tail(train_df.shape[0] - val_size)
    test_df.to_csv('./data/w2v_text_test.csv', index=False)
    val_df.to_csv('./data/w2v_text_val.csv', index=False)
    train_df.to_csv('./data/w2v_text_train.csv', index=False)
    
    # summary
    
    print(f"Found {len(vanilla_vectoriser.vocabulary_)} unique tokens in vanilla text")
    print(f"Final main dataframe size: {train_df.shape[0]}")
else:
    # Hard-coded copy of the summary; presumably recorded from an earlier
    # preprocessing run — it is not recomputed here.
    print("Found 71243 unique tokens in vanilla text\nFinal main dataframe size: 1476873")

Found 71243 unique tokens in vanilla text
Final main dataframe size: 1476873


In [5]:
def save_vocabulary():
    """Export the fitted vectoriser's vocabulary, one token per line.

    Reads the CountVectorizer saved at ``./models/w2v_vectoriser.joblib`` and
    writes each key of its ``vocabulary_`` mapping, followed by a newline, to
    ``./azure/w2v_vocabulary.txt`` (overwriting any previous content).

    Raises:
        Whatever ``joblib.load`` or ``open`` raise when a path is missing or
        unreadable.
    """
    # Load before opening the output: opening in write mode truncates the file,
    # so a failed load must not be allowed to wipe an existing vocabulary.
    vanilla_vectoriser = joblib.load("./models/w2v_vectoriser.joblib")
    # Explicit encoding keeps the output independent of the platform default.
    with open("./azure/w2v_vocabulary.txt", 'w', encoding='utf-8') as f:
        f.writelines(f"{w}\n" for w in vanilla_vectoriser.vocabulary_)

# Only re-export the vocabulary when the vectoriser has just been refitted.
if not done_preprocessing:            
    save_vocabulary()

In [6]:
# Load the cleaned train / test / validation splits written by the preprocessing cell.
train_df = pd.read_csv('./data/w2v_text_train.csv')
test_df = pd.read_csv('./data/w2v_text_test.csv')
val_df = pd.read_csv('./data/w2v_text_val.csv')

# Preview the first training rows.
train_df.head()

Unnamed: 0,target,text
0,1,laying dark thinking
1,1,got watch thts sooooo sweet
2,1,outside working garden office
3,0,one worse days ive wanna go home sleep
4,1,watching enough pretty intense haha


In [7]:
# CSV file that accumulates one row of metrics per trained model.
results_file = f'./results.csv'

# Create the file with its header on first use so add_result()/get_results()
# can always read it.
if not os.path.isfile(results_file):
    results = pd.DataFrame(columns=['model_name', 'accuracy', 'time', 'loss'])
    results.to_csv(results_file, index=False)

def add_result(name, accuracy, time, loss=None):
    """Insert or replace the metrics row for one model in the results CSV.

    Any existing row whose ``model_name`` equals ``name`` is dropped, the new
    row is appended at the end, and the file at ``results_file`` is rewritten.

    Args:
        name: Model identifier, stored in the ``model_name`` column.
        accuracy: Accuracy value to record.
        time: Training duration to record.
        loss: Loss value to record; ``None`` is written as an empty cell.
    """
    result_df = pd.read_csv(results_file)
    result_df = result_df[result_df["model_name"] != name]
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    # and works on older pandas versions too.
    new_row = pd.DataFrame([{"model_name": name, "accuracy": accuracy, "time": time, "loss": loss}])
    result_df = pd.concat([result_df, new_row], ignore_index=True)
    result_df.to_csv(results_file, index=False)
    
def get_results():
    """Print up to the first 20 rows of the results CSV."""
    print(pd.read_csv(results_file).head(20))

### LSTM

In [17]:
# Size of the shuffle buffer used when building the tf.data pipelines.
BUFFER_SIZE = 1000

# Let tf.data choose the prefetch buffer size dynamically.
AUTOTUNE = tf.data.AUTOTUNE

def process_model(model, hist, name, test_dataset, time):
    """Evaluate a trained model, persist it, and record its metrics.

    Args:
        model: Trained model exposing ``evaluate``, ``save`` and ``save_weights``.
        hist: Object whose ``history`` attribute maps metric names to per-epoch values.
        name: Base name for the files written under ``./models/`` and for the results row.
        test_dataset: Dataset passed to ``model.evaluate``.
        time: Training duration stored alongside the metrics.

    Side effects:
        Writes ``./models/{name}.h5``, ``./models/{name}_history.csv`` and
        ``./models/{name}_weights.h5``, and updates the results CSV via
        ``add_result``.
    """
    # get results
    history = pd.DataFrame(data=hist.history)
    # Index 0 is recorded as the loss and index 1 as the accuracy, which matches
    # a model compiled with a single 'accuracy' metric.
    # (The former model.predict() call was dropped: its output was never used,
    # so it only cost an extra inference pass over the test set.)
    model_performance = model.evaluate(test_dataset)
    # save results
    model.save(f"./models/{name}.h5")
    history.to_csv(f'./models/{name}_history.csv', index=False)
    model.save_weights(f'./models/{name}_weights.h5')
    add_result(name, model_performance[1], time, model_performance[0])


def display_learning_curves(hst):
    """Plot training vs. validation curves for accuracy and loss, side by side.

    Args:
        hst: Mapping-like history (e.g. a DataFrame) indexable by 'accuracy',
            'loss', 'val_accuracy' and 'val_loss'.
    """
    _, axes = plt.subplots(1, 2, figsize=(20, 3))

    # One panel per metric: accuracy on the left, loss on the right.
    for axis, metric in zip(axes.ravel(), ('accuracy', 'loss')):
        axis.plot(hst[metric])
        axis.plot(hst['val_' + metric])
        axis.set_title(f'Model {metric}')
        axis.set_xlabel('epochs')
        axis.set_ylabel(metric)
        axis.legend(['train', 'val'])

    plt.show()

In [18]:
# Number of training epochs.
n_epochs = 20

# Number of samples per training batch.
batch_size = 128

# How many times the whole training set should be consumed over the full run.
data_passes = 2

# Batches per epoch, chosen so that n_epochs * steps_per_epoch * batch_size
# covers roughly `data_passes` passes over the training rows.
steps_per_epoch = (train_df.shape[0] * data_passes) // (batch_size * n_epochs)

In [19]:
# get targets
# Label arrays for each split, taken from the `target` column of the loaded CSVs.
y_train = np.asarray(train_df["target"].to_list())
y_test = np.asarray(test_df["target"].to_list())
y_val = np.asarray(val_df["target"].to_list())

In [20]:
# Encode the texts as padded integer sequences and wrap them in tf.data
# pipelines. Only runs when a model is going to be trained.
if not trained:
    X_train = train_df["text"].to_list()
    X_test = test_df["text"].to_list()
    X_val = val_df["text"].to_list()

    def encoder(data_train, data_test, data_val):
        """Turn three lists of texts into padded integer-sequence arrays.

        The Keras tokenizer is fitted on ``data_train`` only and then applied
        to all three splits. Sequences are padded at the end ('post') to the
        largest whitespace-separated word count found in ``data_train``.

        Returns:
            (train_data, test_data, val_data, max_len, tokenizer)
        """
        tokenizer = tf.keras.preprocessing.text.Tokenizer()
        tokenizer.fit_on_texts(data_train)
        encoded_train = tokenizer.texts_to_sequences(data_train)
        encoded_test = tokenizer.texts_to_sequences(data_test)
        encoded_val = tokenizer.texts_to_sequences(data_val)
        # Padding length = longest training text, counted in whitespace-separated words.
        max_len = max([len(s.split()) for s in data_train])
        train_data = np.asarray(tf.keras.preprocessing.sequence.pad_sequences(encoded_train, maxlen=max_len, padding='post'))
        test_data = np.asarray(tf.keras.preprocessing.sequence.pad_sequences(encoded_test, maxlen=max_len, padding='post'))
        val_data = np.asarray(tf.keras.preprocessing.sequence.pad_sequences(encoded_val, maxlen=max_len, padding='post'))

        return train_data, test_data, val_data, max_len, tokenizer

    # The text lists are replaced by their encoded arrays from here on.
    X_train, X_test, X_val, vanilla_input_dim, vanilla_tokenizer = encoder(X_train, X_test, X_val)
    
    # Persist the fitted tokenizer so the same word indices can be reused elsewhere.
    joblib.dump(vanilla_tokenizer, "./azure/w2v_tokenizer.joblib")
      
    # make tf datasets
    train_ds = tf.data.Dataset.from_tensor_slices((X_train, tf.cast(y_train, tf.int32))).shuffle(BUFFER_SIZE)
    test_ds = tf.data.Dataset.from_tensor_slices((X_test, tf.cast(y_test, tf.int32)))
    val_ds = tf.data.Dataset.from_tensor_slices((X_val, tf.cast(y_val, tf.int32))).shuffle(BUFFER_SIZE)
    # preprocess datasets
    # The training stream repeats indefinitely; the epoch length is therefore
    # set by `steps_per_epoch` at fit time.
    train_ds = train_ds.repeat()
    train_ds = train_ds.batch(batch_size)
    train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
    test_ds = test_ds.batch(batch_size)
    val_ds = val_ds.batch(batch_size)
    val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)
    
    print(f"Sequences encoded on {vanilla_input_dim} words")
    

Sequences encoded on 25 words


In [24]:
# Build the embedding matrix from the pretrained word vectors and wrap it in a
# Keras Embedding layer. Only runs when a model is going to be trained.
if not trained:
    word_index = vanilla_tokenizer.word_index
    # One extra row beyond the tokenizer's indices; presumably because index 0
    # is reserved for padding — TODO confirm.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        # Indices at or above NUM_WORDS keep their all-zero row.
        if i>=NUM_WORDS:
            continue
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            # Word has no pretrained vector: use a random normal row
            # (mean 0, std sqrt(0.25) = 0.5). Unseeded, so it varies per run.
            embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

    # Initialised from the matrix above and left trainable, so the vectors are
    # updated during training.
    embedding_layer = tf.keras.layers.Embedding(vocab_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=True)

In [25]:
model_name = "w2v_lstm_nn"

def train_w2v_lstm_model():
    """Build, train, evaluate and persist the word2vec + LSTM classifier.

    Architecture: the shared ``embedding_layer``, an LSTM(128) returning the
    full sequence (dropout 0.2), global max pooling over time, Dense(64, relu)
    and a single sigmoid output unit.

    Uses the module-level ``train_ds``, ``val_ds``, ``test_ds``, ``n_epochs``
    and ``steps_per_epoch``. Results and files are written by ``process_model``
    under the name ``model_name``.
    """
    w2v_model = tf.keras.models.Sequential()
    w2v_model.add(embedding_layer)
    w2v_model.add(tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2))
    w2v_model.add(tf.keras.layers.GlobalMaxPooling1D())
    w2v_model.add(tf.keras.layers.Dense(64, activation='relu'))
    w2v_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    w2v_model.summary()

    w2v_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    t0 = timer()
    # train_ds is already batched, so batch_size is not passed to fit() (Keras
    # expects it to be omitted for dataset inputs). The dataset repeats
    # forever, so steps_per_epoch defines the length of an epoch.
    hist = w2v_model.fit(train_ds,
                         epochs=n_epochs,
                         verbose=2,
                         validation_data=val_ds,
                         steps_per_epoch=steps_per_epoch)
    t1 = timer() - t0
    process_model(w2v_model, hist, model_name, test_ds, t1)


if not trained:
    train_w2v_lstm_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         21233100  
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         219648    
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 21,461,069
Trainable params: 21,461,069
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
1153/1153 - 33s - loss: 0.4943 - accuracy: 0.7574 - val_loss: 0.4733 - val_accuracy: 0.7736
Ep

In [26]:
# With an already-trained model (trained == 1), reload the per-epoch history
# saved by process_model() and plot the learning curves from it.
if trained==1:
    history = pd.read_csv(f'./models/{model_name}_history.csv')
    display_learning_curves(history)

### Display results

In [27]:
get_results()

            model_name  accuracy         time      loss
0   random_forest       0.74330   4673.442075 NaN      
1   random_forest_stem  0.74460   3145.852605 NaN      
2   random_forest_lem   0.74370   4217.279929 NaN      
3   vanilla_nn          0.76795   4371.061097  0.484366
4   stemming_nn         0.76415   3354.503793  0.491544
5   lemmatization_nn    0.76780   3995.933181  0.485960
6   glove_cnn           0.78720   652.948430   0.451748
7   glove_lstm_nn       0.79640   772.612858   0.444426
8   glove_lstm_cnn      0.78160   814.337397   0.482108
9   ASTA                0.72600  NaN          NaN      
10  w2v_lstm_nn         0.79690   611.238391   0.434156
