In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

In [None]:
# Load the training set from a CSV file located in the working directory.
train = pd.read_csv('./train.csv')
# Preview the first rows of the loaded frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

## Análisis exploratorio

In [None]:
# Number of tweets per value of the `target` column.
x = train.target.value_counts()
# Pass the vectors by keyword: recent seaborn releases no longer accept
# positional x/y data. `.values` avoids any index alignment between the
# class labels (x axis) and their counts (y axis).
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

Existen más tweets que no hacen referencia a un desastre (0) que tweets que sí hacen referencia a un desastre (1).

### Cantidad de caracteres en un tweet

In [None]:
# Side-by-side histograms of tweet length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, target_value, colour, title in (
    (ax1, 1, 'red', 'desastres'),
    (ax2, 0, 'blue', 'no desastres'),
):
    # Character count of every tweet belonging to the current class.
    tweet_length = train.loc[train['target'] == target_value, 'text'].str.len()
    axis.hist(tweet_length, color=colour)
    axis.set_title(title)
fig.suptitle('Cantidad de caracteres del tweet')
plt.show()

### Cantidad de palabras en el tweet

In [None]:
# Side-by-side histograms of the number of whitespace-separated words per tweet.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, target_value, colour, title in (
    (ax1, 1, 'red', 'desastres'),
    (ax2, 0, 'blue', 'no desastres'),
):
    # Split each tweet on whitespace and count the resulting tokens.
    tokens_per_tweet = train.loc[train['target'] == target_value, 'text'].str.split()
    tweet_length = tokens_per_tweet.map(len)
    axis.hist(tweet_length, color=colour)
    axis.set_title(title)
fig.suptitle('Cantidad de palabras')
plt.show()

### Promedio de longitud de las palabras

In [None]:
# Distribution of the mean word length per tweet, one panel per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the rendered plots stay the same.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
for axis, target_value, colour, title in (
    (ax1, 1, 'red', 'desastres'),
    (ax2, 0, 'blue', 'no desastres'),
):
    tokens_per_tweet = train.loc[train['target'] == target_value, 'text'].str.split()
    # For each tweet: the list of lengths of its words.
    word = tokens_per_tweet.apply(lambda tokens: [len(token) for token in tokens])
    sns.distplot(word.map(lambda lengths: np.mean(lengths)), ax=axis, color=colour)
    axis.set_title(title)
fig.suptitle('Promedio de largo de las palabras en los tweets')

In [None]:
def create_corpus(target, data=None):
    """Collect every whitespace-separated token from the tweets of one class.

    Parameters
    ----------
    target
        Value matched against the ``target`` column to select rows.
    data : pandas.DataFrame, optional
        Frame with ``text`` and ``target`` columns. Defaults to the
        notebook-level ``train`` frame, which keeps existing one-argument
        calls working while allowing an explicit frame to be supplied.

    Returns
    -------
    list of str
        Tokens of all selected tweets, in row order. Rows whose text is
        missing are skipped.
    """
    if data is None:
        data = train
    # dropna(): a missing text would otherwise yield a non-iterable NaN.
    token_lists = data.loc[data['target'] == target, 'text'].str.split().dropna()
    return [token for tokens in token_lists for token in tokens]

### Palabras vacías (stopwords) en los tweets

#### Para no desastres

In [None]:
# Tokens of all tweets with target == 0.
corpus = create_corpus(0)

# Tally only the tokens that are English stopwords.
dic = defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word] += 1

# Ten most frequent (stopword, count) pairs, highest count first.
ranked = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]

In [None]:
# Unzip the (stopword, count) pairs into parallel sequences for the bar chart.
x, y = zip(*top)
plt.bar(x, height=y, color='blue')

#### Tweets de desastres

In [None]:
# Tokens of all tweets with target == 1.
corpus = create_corpus(1)

# Tally only the tokens that are English stopwords.
dic = defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word] += 1

# Ten most frequent (stopword, count) pairs, highest count first.
ranked = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)
top = ranked[:10]

In [None]:
# Unzip the (stopword, count) pairs into parallel sequences for the bar chart.
x, y = zip(*top)
plt.bar(x, height=y)

### Análisis de puntuación

#### Desastres

In [None]:
plt.figure(figsize=(10,5))
# Tokens of all tweets with target == 1.
corpus=create_corpus(1)

# Count the tokens that are exactly one punctuation character.
dic=defaultdict(int)
special = string.punctuation
# `token in special` on the raw string is a substring test, so runs such as
# "()" or "./" would also be counted. Testing against a set of single
# characters restricts the match to individual punctuation marks.
punctuation_chars = set(special)
for i in corpus:
    if i in punctuation_chars:
        dic[i]+=1

In [None]:
# Open the figure in the cell that draws on it: with the default inline
# backend a figure created in an earlier cell is closed when that cell ends,
# so a figsize set there does not apply to this plot.
plt.figure(figsize=(10,5))
# Unzip the (punctuation mark, count) pairs into parallel sequences.
x,y=zip(*dic.items())
plt.bar(x,y)

#### No desastres

In [None]:
plt.figure(figsize=(10,5))
# Tokens of all tweets with target == 0.
corpus=create_corpus(0)

# Count the tokens that are exactly one punctuation character.
dic=defaultdict(int)
special = string.punctuation
# `token in special` on the raw string is a substring test, so runs such as
# "()" or "./" would also be counted. Testing against a set of single
# characters restricts the match to individual punctuation marks.
punctuation_chars = set(special)
for i in corpus:
    if i in punctuation_chars:
        dic[i]+=1

In [None]:
# Open the figure in the cell that draws on it: with the default inline
# backend a figure created in an earlier cell is closed when that cell ends,
# so a figsize set there does not apply to this plot.
plt.figure(figsize=(10,5))
# Unzip the (punctuation mark, count) pairs into parallel sequences.
x,y=zip(*dic.items())
plt.bar(x,y,color='blue')

### Palabras comunes

In [None]:
# Frequency of every token in `corpus` (whatever the most recently executed
# create_corpus(...) cell produced).
counter = Counter(corpus)
most = counter.most_common()

# Of the 40 most frequent tokens, keep the ones that are not stopwords.
kept = [(word, count) for word, count in most[:40] if word not in stop]
x = [word for word, _ in kept]
y = [count for _, count in kept]

# Horizontal bars: counts on the x axis, words on the y axis.
sns.barplot(x=y, y=x)

#### Unigrama

In [None]:
def get_top_tweet_unigrams(corpus, n=None):
    """Rank the unigrams of `corpus` by total number of occurrences.

    `corpus` is an iterable of text documents; `n` limits the result to the
    first `n` entries (all entries when None). Returns a list of
    (unigram, count) tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 1))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column-wise sum gives the total count of each vocabulary entry.
    totals = doc_term.sum(axis=0)
    ranking = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking[:n]

In [None]:
# Horizontal bar chart of the ten most frequent unigrams in the tweet text.
plt.figure(figsize=(10, 5))
top_tweet_unigrams = get_top_tweet_unigrams(train['text'], n=10)
# x holds the terms, y their counts; counts go on the horizontal axis.
x, y = (list(column) for column in zip(*top_tweet_unigrams))
sns.barplot(x=y, y=x)

#### Bigrama (n=2)

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Rank the bigrams of `corpus` by total number of occurrences.

    `corpus` is an iterable of text documents; `n` limits the result to the
    first `n` entries (all entries when None). Returns a list of
    (bigram, count) tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column-wise sum gives the total count of each vocabulary entry.
    totals = doc_term.sum(axis=0)
    ranking = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking[:n]

In [None]:
# Horizontal bar chart of the ten most frequent bigrams in the tweet text.
plt.figure(figsize=(10, 5))
top_tweet_bigrams = get_top_tweet_bigrams(train['text'], n=10)
# x holds the terms, y their counts; counts go on the horizontal axis.
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)

#### Trigrama

In [None]:
def get_top_tweet_trigrams(corpus, n=None):
    """Rank the trigrams of `corpus` by total number of occurrences.

    `corpus` is an iterable of text documents; `n` limits the result to the
    first `n` entries (all entries when None). Returns a list of
    (trigram, count) tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3))
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column-wise sum gives the total count of each vocabulary entry.
    totals = doc_term.sum(axis=0)
    ranking = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking[:n]

In [None]:
# Horizontal bar chart of the ten most frequent trigrams in the tweet text.
plt.figure(figsize=(10, 5))
top_tweet_trigrams = get_top_tweet_trigrams(train['text'], n=10)
# x holds the terms, y their counts; counts go on the horizontal axis.
x, y = (list(column) for column in zip(*top_tweet_trigrams))
sns.barplot(x=y, y=x)

#### n = 1-5

In [None]:
def get_top_tweet_ngrams(corpus, n=None, ngram_range=(1, 5)):
    """Rank the n-grams of a text collection by total number of occurrences.

    Parameters
    ----------
    corpus : iterable of str
        Documents to analyse.
    n : int or None, optional
        Number of top entries to return; None returns every n-gram.
    ngram_range : tuple of (int, int), optional
        (min_n, max_n) passed to CountVectorizer. The default (1, 5) is the
        range this function always used; other ranges, e.g. (2, 2) for
        bigrams, make it usable for any n-gram size.

    Returns
    -------
    list of (str, count) tuples, sorted by count in descending order.
    """
    vec = CountVectorizer(ngram_range=ngram_range).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column-wise sum gives the total count of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda item: item[1], reverse=True)
    return words_freq[:n]

In [None]:
# Horizontal bar chart of the ten most frequent n-grams in the tweet text.
plt.figure(figsize=(10, 5))
top_tweet_ngrams = get_top_tweet_ngrams(train['text'], n=10)
# x holds the terms, y their counts; counts go on the horizontal axis.
x, y = (list(column) for column in zip(*top_tweet_ngrams))
sns.barplot(x=y, y=x)

## Limpieza de los datos

### Quitar URLs

In [None]:
def remove_URL(text):
    """Return `text` with URLs deleted.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
# Strip URLs from every tweet, overwriting the `text` column.
train['text'] = train['text'].apply(remove_URL)

### Quitar tags HTML

In [None]:
def remove_HTML(text):
    """Return `text` with HTML-like tags deleted.

    A tag is the shortest span that starts with ``<`` and ends with ``>``
    on the same line.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

In [None]:
# Strip HTML tags from every tweet, overwriting the `text` column.
train['text'] = train['text'].apply(remove_HTML)

### Quitar emojis

In [None]:
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols deleted.

    Only explicit symbol blocks are matched. The previous catch-all range
    U+24C2..U+1F251 also covered CJK, Hangul and many other scripts, so
    ordinary non-Latin text was deleted along with the emoji. A few rarely
    used emoji outside the listed blocks are left in place.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                           "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                           "\U00002702-\U000027B0"  # dingbats
                           "\U000025A0-\U000026FF"  # geometric shapes, miscellaneous symbols
                           "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           "\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji
                           "\U000024C2"             # circled M
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [None]:
# Strip emoji from every tweet, overwriting the `text` column.
train['text'] = train['text'].apply(remove_emoji)

### Quitar puntuación

In [None]:
def remove_point(text):
    """Return `text` with every character of ``string.punctuation`` deleted."""
    # A translation table with an empty mapping and a delete-set of punctuation.
    return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Strip punctuation from every tweet, overwriting the `text` column.
train['text'] = train['text'].apply(remove_point)

### Corregir errores de ortografía

In [None]:
# SpellChecker is imported in this cell rather than with the imports at the top of the notebook.
from spellchecker import SpellChecker

# Single checker instance shared by correct_spelling.
spell = SpellChecker()
def correct_spelling(text, checker=None):
    """Replace the misspelled words of `text` with the checker's suggestion.

    Parameters
    ----------
    text : str
        Text to correct; it is split on whitespace.
    checker : optional
        Object exposing ``unknown(words)`` and ``correction(word)``.
        Defaults to the notebook-level ``spell`` instance.

    Returns
    -------
    str
        The words joined with single spaces. A word is replaced only when
        the checker reports it as unknown and offers a correction.
    """
    if checker is None:
        checker = spell
    words = text.split()
    misspelled = checker.unknown(words)
    corrected = []
    for word in words:
        if word in misspelled:
            suggestion = checker.correction(word)
            # correction() can return None when it has no candidate; keep the
            # original word so that " ".join() does not fail on a None entry.
            corrected.append(word if suggestion is None else suggestion)
        else:
            corrected.append(word)
    return " ".join(corrected)

In [None]:
# train['text'] = train['text'].apply(lambda x : correct_spelling(x))