# Détectez les bad buzz grace au Deeplearning

In [1]:
import warnings
warnings.simplefilter(action='ignore')

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import tensorflow as tf
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import joblib

pyLDAvis.enable_notebook()

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)


### Text preproccessing

On commence par tokéniser le texte (on ne garde que les mots, en supprimant la pontuation, les liens, les nombres ...).
On retire également les "stop-words", c'est-à-dire tous les articles, déterminants, pronoms et mots de liaison.
On applique ensuite deux traitements différents séparément pour comparer leurs performances: la lemmatisation (qui garde seulement la forme canonique des mots, par exemple le féminin singulier) et le stemming (qui garde uniquement le radical des mots). 

In [2]:
done_preprocessing = 1

n_words = 5000 # warning: must be higher later

test_size = 100000
val_size = 100000

lbls = ['Negative', '', '', '', 'Positive']

if not done_preprocessing:
    train_df = pd.read_csv('./data/dataset.csv', names=['target', 'id', 'date', 'flag', 'user', 'text'], encoding='latin-1')
    train_df = train_df[['target', 'text']]
    
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    stop_words = list(nltk.corpus.stopwords.words('english'))
    
    def clean_up(text):
        text = ''.join(c for c in text if not c.isdigit()).replace('\n', '').lower()
        text = text.split()
        for exclude in ['@', '/']:
            text = [w for w in text if not exclude in w]
        text = ' '.join(text)
        tokens = tokenizer.tokenize(text)
        tokens = [w for w in tokens if not w in stop_words]
        return " ".join(tokens)
    
    train_df["text"] = train_df["text"].apply(clean_up)
    
    train_df = train_df[train_df['text'] != ""]
    
    train_df.to_csv('./data/preprocessed.csv', index=False)

In [3]:
if not done_preprocessing:
    
    train_df = pd.read_csv('./data/preprocessed.csv')
    
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()

    def stem(text):
        return [stemmer.stem(w) for w in text.split()]

    def lem(text):
        return [lemmatizer.lemmatize(w) for w in text.split()]

    # stemming
    train_df["text_stem"] = train_df["text"].apply(stem)
    s_dictionary = gensim.corpora.Dictionary(train_df["text_stem"].tolist())
    print(f"Found {len(s_dictionary)} unique tokens after stemming")
    s_dictionary.filter_extremes(no_below=1000, no_above=0.4, keep_n=n_words)
    s_words = s_dictionary.token2id
    
    # lemmatization
    train_df["text_lem"] = train_df["text"].apply(lem)
    l_dictionary = gensim.corpora.Dictionary(train_df["text_lem"].tolist())
    print(f"Found {len(l_dictionary)} unique tokens after lemmatisation")
    l_dictionary.filter_extremes(no_below=1000, no_above=0.4, keep_n=n_words)
    l_words = l_dictionary.token2id
    
    def clean_stem(tokens):
        return " ".join([t for t in tokens if t in s_words])
        
    def clean_lem(tokens):
        return " ".join([t for t in tokens if t in l_words])

    train_df["text_stem"] = train_df["text_stem"].apply(clean_stem)
    train_df["text_lem"] = train_df["text_lem"].apply(clean_lem)

    train_df = train_df[train_df['text_stem'] != ""]
    train_df = train_df[train_df['text_lem'] != ""]
    
    train_df = train_df.sample(frac=1)
    
    print(f"Final dataframe size: {train_df.shape[0]}")
    
    test_df = train_df.head(test_size)
    train_df = train_df.tail(train_df.shape[0] - test_size)
    val_df = train_df.head(val_size)
    train_df = train_df.tail(train_df.shape[0] - val_size)
    test_df.to_csv('./data/text_test.csv', index=False)
    val_df.to_csv('./data/text_val.csv', index=False)
    train_df.to_csv('./data/text_train.csv', index=False)
else:
    print("Found 225037 unique tokens after stemming\nFound 259292 unique tokens after lemmatisation\nFinal dataframe size: 1560604")

Found 225037 unique tokens after stemming
Found 259292 unique tokens after lemmatisation
Final dataframe size: 1560604


In [4]:
if done_preprocessing:
    train_df = pd.read_csv('./data/text_train.csv')
    test_df = pd.read_csv('./data/text_test.csv')
    val_df = pd.read_csv('./data/text_val.csv')
    
train_df.head()

Unnamed: 0,target,text,text_stem,text_lem
0,4,best day watched brother kick top post footy m...,best day watch brother kick top post miss p,best day watched brother kick top post miss p
1,0,fall back asleep,fall back asleep,fall back asleep
2,4,heading work,head work,heading work
3,0,wow wish made bad girls club longer fucking mi...,wow wish made bad girl club longer fuck miss,wow wish made bad girl club longer fucking miss
4,4,dunt wud prefere watch mark perform anyday pms...,prefer watch mark perform u play nite,watch mark u playing nite


### Baseline

Le modèle de base servant de références aux autres modèles est une simple régression (random forest) appliquée sur les bag-of-words.

In [5]:
trained = 1

debug = 1

if debug:
    train_df = train_df.head(10000)

In [6]:
vectoriser = CountVectorizer(max_features=n_words)
train_vectorized_stem = vectoriser.fit_transform(train_df["text_stem"].to_list()).toarray()
test_vectorized_stem = vectoriser.fit_transform(test_df["text_stem"].to_list()).toarray()
train_vectorized_lem = vectoriser.fit_transform(train_df["text_lem"].to_list()).toarray()
test_vectorized_lem = vectoriser.fit_transform(test_df["text_lem"].to_list()).toarray()
target = train_df["target"].to_list()
test_target = test_df["target"].to_list()

#### With stemming

In [7]:
if not debug:
    rfc=RandomForestClassifier(random_state=42)
    param_grid = { 
        'n_estimators': [100, 1000],
        'max_features': ['sqrt', 'log2']
    }
    forest_grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10, scoring="accuracy", n_jobs=-1)
    forest_grid.fit(train_vectorized_stem, target)
    print(forest_grid.best_params_)
    print("accuracy :", forest_grid.best_score_)
    forest_params = forest_grid.best_params_
else:
    forest_params = {'max_features': 'sqrt', 'n_estimators': 1000}

In [9]:
if trained:
    random_forest_stem = joblib.load("./random_forest_stem.joblib")
else:
    random_forest_stem = RandomForestClassifier(
                    n_estimators=forest_params['n_estimators'],
                    random_state=50,
                    max_features=forest_params['max_features'],
                    verbose=False,
                    n_jobs=-1)
    random_forest_stem.fit(train_vectorized_stem, target)
    joblib.dump(random_forest_stem, "./random_forest_stem.joblib")

In [10]:
random_forest_stem.score(test_vectorized_stem, test_target)

0.71336

#### With lemmatization

In [11]:
if not debug:
    rfc=RandomForestClassifier(random_state=42)
    param_grid = { 
        'n_estimators': [100, 1000],
        'max_features': ['sqrt', 'log2']
    }
    forest_grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=10, scoring="accuracy", n_jobs=-1)
    forest_grid.fit(train_vectorized_lem, target)
    print(forest_grid.best_params_)
    print("accuracy :", forest_grid.best_score_)
    forest_params = forest_grid.best_params_
else:
    forest_params = {'max_features': 'sqrt', 'n_estimators': 1000}

In [12]:
if trained:
    random_forest_lem = joblib.load("./random_forest_lem.joblib")
else:
    random_forest_lem = RandomForestClassifier(
                    n_estimators=forest_params['n_estimators'],
                    random_state=50,
                    max_features=forest_params['max_features'],
                    verbose=False,
                    n_jobs=-1)
    random_forest_lem.fit(train_vectorized_lem, target)
    joblib.dump(random_forest_lem, "./random_forest_lem.joblib")
    
random_forest_lem.score(test_vectorized_lem, test_target)

0.7108