# Détectez les bad buzz grâce au Deep Learning

In [1]:
import warnings
warnings.simplefilter(action='ignore')

### basic libs
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import tensorflow as tf
from sklearn.manifold import TSNE
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Turn on pyLDAvis's notebook integration so its visualisations display inline.
pyLDAvis.enable_notebook()

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

# Pipeline switch read by the cells below:
#   truthy -> skip the raw-data preprocessing cells and load the cached
#             text_train.csv / text_val.csv / text_test.csv files;
#   falsy  -> rebuild those files from ./dataset.csv.
done_preprocessing = 1

In [2]:
if not done_preprocessing:
    # Column names supplied for the raw CSV (passed via `names=`, so the file
    # is read as having no header row).
    column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
    # Read the file and immediately keep only the label and the tweet text,
    # so the four unused columns are not kept alive.
    tweets_df = pd.read_csv(
        './dataset.csv',
        names=column_names,
        encoding='latin-1',
    )[['target', 'text']]
    print(tweets_df.head())


In [3]:
if not done_preprocessing:
    # Display names indexed by the raw target value: 0 -> 'Negative',
    # 4 -> 'Positive'; the values in between map to an empty label.
    lbls = ['Negative', '', '', '', 'Positive']
    target_names = tweets_df["target"].map(lambda code: lbls[code])
    # Histogram of the number of tweets per sentiment label.
    sns.histplot(x=target_names)
    plt.show()

### Text preprocessing

In [5]:
# Number of rows held out for the test and the validation split, respectively.
test_size = 100000
val_size = 100000

if not done_preprocessing:
    stemmer = EnglishStemmer()
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    # Stop words are stemmed too, so they compare equal to the stemmed tokens
    # produced inside clean_up().
    stop_words = [stemmer.stem(w) for w in nltk.corpus.stopwords.words('english')]
    # Hash-based view of stop_words. clean_up() tests every token of every
    # tweet for membership; on the list that is a linear scan per token.
    stop_word_set = frozenset(stop_words)

    def clean_up(text):
        """Normalise one raw tweet into a space-separated string of stems.

        Steps, in order:
          1. drop every whitespace-delimited word containing '@' or '/'
             (typically user mentions and URLs);
          2. tokenize the remainder into runs of word characters;
          3. stem each token with the English Snowball stemmer;
          4. drop stems that match a (stemmed) stop word.

        Args:
            text: raw tweet text; must be a str (``.split()`` is called on it).

        Returns:
            The surviving stems joined by single spaces; may be an empty string.
        """
        kept_words = [w for w in text.split() if '@' not in w and '/' not in w]
        tokens = tokenizer.tokenize(' '.join(kept_words))
        stems = (stemmer.stem(tok) for tok in tokens)
        return " ".join(stem for stem in stems if stem not in stop_word_set)

    def tokenize_comments():
        """Overwrite the "text" column of the global `tweets_df` with clean_up() applied to each entry."""
        cleaned_text = tweets_df["text"].map(clean_up)
        tweets_df["text"] = cleaned_text

    def get_most_used(top_n=100):
        """Return the words that appear in the largest number of tweets.

        Reads the "text" column of the global `tweets_df`, where each entry is
        expected to be a space-separated token string. A word is counted at
        most once per tweet, so the ranking is by document frequency rather
        than by raw occurrence count.

        Args:
            top_n: maximum number of words to return (default 100).

        Returns:
            A list of up to `top_n` words, most widespread first. Words found
            in the same number of tweets are ordered alphabetically, so the
            result does not depend on set iteration order.
        """
        from collections import Counter

        doc_counts = Counter()
        for comment in tweets_df["text"].to_list():
            # Non-string cells (e.g. NaN) contribute no words.
            if isinstance(comment, str):
                # set(): count each word once per tweet.
                doc_counts.update(set(comment.split()))

        # Sorting on the raw counts gives the same order as sorting on
        # count / number_of_rows, so no normalisation is needed.
        ranked = sorted(doc_counts.items(), key=lambda item: (-item[1], item[0]))
        return [word for word, _ in ranked[:top_n]]

    def clear_words(tokens):
        """Strip the words listed in the global `n_frequent` from a token string.

        Args:
            tokens: space-separated token string; any non-str value
                (e.g. NaN) is treated as empty.

        Returns:
            The remaining words joined by single spaces, or "" for non-str input.
        """
        if not isinstance(tokens, str):
            return ""
        remaining = []
        for word in tokens.split():
            if word not in n_frequent:
                remaining.append(word)
        return " ".join(remaining)

    def del_most_freq():
        """Rebind the global `tweets_df` to a cleaned copy.

        Rows containing missing values are dropped, then clear_words() is
        applied to every entry of the "text" column.
        """
        global tweets_df
        without_missing = tweets_df.dropna()
        tweets_df = without_missing.assign(
            text=without_missing["text"].apply(clear_words)
        )

    # Step 1: clean, tokenize and stem every tweet in place.
    tokenize_comments()    

    # Step 2: collect the most widespread words from the cleaned text.
    # Must run after step 1 (it reads the cleaned column) and before step 3
    # (clear_words() looks up the global n_frequent).
    n_frequent = get_most_used()

    # Step 3: drop rows with missing values and remove those words from every tweet.
    del_most_freq()
    
    # Shuffle once, with a fixed seed, before carving out the held-out splits.
    # Taking head()/tail() in file order makes the test and validation sets
    # depend entirely on how dataset.csv happens to be sorted.
    # NOTE(review): Sentiment140-style dumps are grouped by target, which would
    # put a single class in both held-out sets -- confirm for this file.
    split_seed = 42
    shuffled_df = tweets_df.sample(frac=1, random_state=split_seed)

    # Positional slices: first test_size rows -> test, next val_size rows ->
    # validation, remainder -> train. Unlike tail(n) with a computed n, slicing
    # stays well-defined when the frame is shorter than the requested sizes.
    test_df = shuffled_df.iloc[:test_size]
    val_df = shuffled_df.iloc[test_size:test_size + val_size]
    tweets_df = shuffled_df.iloc[test_size + val_size:]

    # Cache the three splits; the load cell reads these files back.
    test_df.to_csv('text_test.csv', index=False)
    val_df.to_csv('text_val.csv', index=False)
    tweets_df.to_csv('text_train.csv', index=False)

In [6]:
# Load the cached splits unconditionally. When done_preprocessing is falsy the
# preprocessing cell has just written these files, so both paths end up with
# the same data -- and train_df is always defined (it was previously only
# bound when done_preprocessing was truthy, making train_df.head() fail
# on the other path).
# Note: text fields that are empty in the CSV are read back as NaN by read_csv.
train_df = pd.read_csv('text_train.csv')
# Shuffle the training rows; the fixed seed keeps re-runs reproducible.
train_df = train_df.sample(frac=1, random_state=42)
test_df = pd.read_csv('text_test.csv')
val_df = pd.read_csv('text_val.csv')

train_df.head()

Unnamed: 0,target,text
976607,0,ear hurt throat hurt bitch woe eat someth
1192017,4,moment univers usual air canada seat sale quit...
490485,4,incred video
822186,4,ok ur anywher near sit class swim parti lonngg...
1372297,0,confick opossum didnt
