### Importing required Libraries.  
*필요한 라이브러리 가져오기*

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam



In [None]:
import os
#os.listdir('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt')

## Loading the data and getting basic idea  
*데이터 로드 및 기본 아이디어 얻기*

In [None]:
# Read the competition's training and test CSVs from the Kaggle input directory.
tweet= pd.read_csv('../input/nlp-getting-started/train.csv')
test=pd.read_csv('../input/nlp-getting-started/test.csv')
# Preview the first three training rows.
tweet.head(3)

In [None]:
# Report the dimensions of both frames; each message names the frame it describes.
print('There are {} rows and {} columns in train'.format(tweet.shape[0],tweet.shape[1]))
print('There are {} rows and {} columns in test'.format(test.shape[0],test.shape[1]))

## Columns  
`id` - a unique identifier for each tweet  
`text` - the text of the tweet    
`location` - the location the tweet was sent from (may be blank)  
`keyword` - a particular keyword from the tweet (may be blank)  
`target` - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

## Class distribution  
*클래스 분포*

Before we begin with anything else, let's check the class distribution. There are only two classes, 0 and 1.  
*다른 작업을 시작하기 전에 클래스 분포를 확인하겠습니다. 클래스 0과 1만 두 개 있습니다.*

In [None]:
# Count the tweets in each class and draw one bar per class.
# The data vectors are passed as x=/y= keywords: recent seaborn releases no longer
# accept them positionally, and the other barplot calls in this notebook use keywords too.
x=tweet.target.value_counts()
sns.barplot(x=x.index,y=x)
plt.gca().set_ylabel('samples')

There are more tweets with class 0 ( No disaster) than class 1 ( disaster tweets).  
*클래스 1(재해 트윗)보다 클래스 0(재해 없음)의 트윗이 더 많습니다.*

## Exploratory Data Analysis of tweets  
*트윗의 탐색적 데이터 분석*

First, we will do a very basic analysis, that is, character-level, word-level and sentence-level analysis.  
*먼저 문자 수준, 단어 수준 및 문장 수준 분석인 매우 기본적인 분석을 수행합니다.*

### Number of characters in tweets  
*트윗의 문자 수*

In [None]:
# Side-by-side histograms of tweet length in characters: class 1 on the left, class 0 on the right.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax,label,color,title in ((ax1,1,'red','disaster tweets'),(ax2,0,'green','Not disaster tweets')):
    # Character count of every tweet belonging to the current class.
    tweet_len=tweet[tweet['target']==label]['text'].str.len()
    ax.hist(tweet_len,color=color)
    ax.set_title(title)
fig.suptitle('Characters in tweets')
plt.show()


The distributions of both classes seem to be almost the same. 120 to 140 characters per tweet is the most common length in both.  
*둘의 분포는 거의 같은 것 같습니다. 120 ~ 140 트윗의 문자가 둘 중 가장 일반적입니다.*

### Number of words in a tweet  
*트윗의 단어 수*

In [None]:
# Side-by-side histograms of the number of whitespace-separated words per tweet.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax,label,color,title in ((ax1,1,'red','disaster tweets'),(ax2,0,'green','Not disaster tweets')):
    # Split each tweet of the current class on whitespace and count the pieces.
    tweet_len=tweet[tweet['target']==label]['text'].str.split().map(len)
    ax.hist(tweet_len,color=color)
    ax.set_title(title)
fig.suptitle('Words in a tweet')
plt.show()


###  Average word length in a tweet  

*트윗의 평균 단어 길이*

In [None]:
# Distribution of the mean word length per tweet, one panel per class.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/displot once the target seaborn version is confirmed.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))
for ax,label,color,title in ((ax1,1,'red','disaster'),(ax2,0,'green','Not disaster')):
    # For every tweet of the current class: the list of its word lengths.
    word=tweet[tweet['target']==label]['text'].str.split().apply(lambda tokens : [len(token) for token in tokens])
    sns.distplot(word.map(lambda lengths: np.mean(lengths)),ax=ax,color=color)
    ax.set_title(title)
fig.suptitle('Average word length in each tweet')

In [None]:
def create_corpus(target):
    """Return a flat list of every whitespace-separated token in the tweets of one class.

    Reads the module-level ``tweet`` frame, keeps the rows whose ``target``
    column equals ``target`` and concatenates the tokens of their ``text``
    values in row order.
    """
    texts=tweet[tweet['target']==target]['text']
    return [token for tokens in texts.str.split() for token in tokens]

### Common stopwords in tweets  
*트윗의 일반적인 불용어*

First we  will analyze tweets with class 0.  
*먼저 클래스 0의 트윗을 분석합니다.*

In [None]:
# Tally how often each English stopword occurs among the class 0 tokens.
corpus=create_corpus(0)

dic=defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word]+=1

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top=sorted(dic.items(), key=lambda pair:pair[1],reverse=True)[:10]


In [None]:
# Unzip the (word, count) pairs into parallel tuples and draw one bar per stopword.
x,y=zip(*top)
plt.bar(x,y)

Now,we will analyze tweets with class 1.  
*이제 클래스 1로 트윗을 분석하겠습니다.*

In [None]:
# Tally how often each English stopword occurs among the class 1 tokens.
corpus=create_corpus(1)

dic=defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word]+=1

# Ten most frequent stopwords as (word, count) pairs, highest count first.
top=sorted(dic.items(), key=lambda pair:pair[1],reverse=True)[:10]

# Unzip the pairs into parallel tuples and draw one bar per stopword.
x,y=zip(*top)
plt.bar(x,y)

In both of them,"the" dominates which is followed by "a" in class 0 and "in" in class 1.  
*둘 다 "the"가 지배적이며 클래스 0에서는 "a"가, 클래스 1에서는 "in"이 뒤따릅니다.*

### Analyzing punctuations.
*구두점 분석.*

First let's check tweets indicating real disaster.  
*먼저 실제 재난을 나타내는 트윗을 확인합시다.*

In [None]:
# Count the punctuation tokens found in the disaster tweets (class 1).
plt.figure(figsize=(10,5))
corpus=create_corpus(1)

special = string.punctuation
dic=defaultdict(int)
for i in corpus:
    # `in` on a str is a substring test: a token is counted when it occurs
    # verbatim inside string.punctuation.
    if i not in special:
        continue
    dic[i]+=1

x,y=zip(*dic.items())
plt.bar(x,y)

Now,we will move on to class 0.  
*이제 클래스 0으로 넘어갑니다.*

In [None]:
# Count the punctuation tokens found in the non-disaster tweets (class 0).
plt.figure(figsize=(10,5))
corpus=create_corpus(0)

special = string.punctuation
dic=defaultdict(int)
for i in corpus:
    # `in` on a str is a substring test: a token is counted when it occurs
    # verbatim inside string.punctuation.
    if i not in special:
        continue
    dic[i]+=1

x,y=zip(*dic.items())
plt.bar(x,y,color='green')

### Common words  
*흔한 단어*

In [None]:

# Frequency of every token in whatever `corpus` currently holds
# (the class 0 corpus when the cells are run in order).
counter=Counter(corpus)
most=counter.most_common()
# Of the 40 most frequent tokens, keep those that are not stopwords;
# x collects the words and y the matching counts, in frequency order.
kept=[(word,count) for word,count in most[:40] if word not in stop]
x=[word for word,_ in kept]
y=[count for _,count in kept]

In [None]:
# Horizontal bars: counts (y) on the x-axis, words (x) on the y-axis.
sns.barplot(x=y,y=x)

A lot of cleaning is needed.  
많은 청소가 필요합니다.

### Ngram analysis  
*Ngram 분석*

We will do a bigram (n=2) analysis over the tweets. Let's check the most common bigrams in tweets.  
*우리는 트윗에 대해 bigram(n=2) 분석을 수행할 것입니다. 트윗에서 가장 일반적인 bigram을 확인합시다.*

In [None]:
def get_top_tweet_bigrams(corpus, n=None):
    """Return the most frequent word bigrams in ``corpus``.

    A ``CountVectorizer`` restricted to bigrams is fitted on ``corpus``; the
    occurrences of each bigram are summed over all documents and the
    ``(bigram, count)`` pairs are returned sorted by descending count.
    Only the first ``n`` pairs are returned; ``n=None`` returns all of them.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    vectorizer.fit(corpus)
    # Column-wise sum: total occurrences of each bigram across the corpus.
    totals = vectorizer.transform(corpus).sum(axis=0)
    pairs = [(bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

In [None]:
# Plot the ten most frequent bigrams of the raw training tweets as horizontal bars.
plt.figure(figsize=(10,5))
top_tweet_bigrams=get_top_tweet_bigrams(tweet['text'],n=10)
x=[bigram for bigram,_ in top_tweet_bigrams]
y=[count for _,count in top_tweet_bigrams]
sns.barplot(x=y,y=x)

We will need a lot of cleaning here.  
*여기 청소가 많이 필요합니다..*

## Data Cleaning  
*데이터 정리*

As we know, tweets always have to be cleaned before we go on to modelling. So we will do some basic cleaning such as spelling correction, removing punctuation, removing HTML tags and emojis, etc.

알다시피, 트위터 트윗은 모델링을 시작하기 전에 항상 정리해야 합니다. 따라서 맞춤법 수정, 구두점 제거, html 태그 및 이모티콘 제거 등과 같은 기본적인 정리 작업을 수행하겠습니다.

In [None]:
# Stack train and test (train rows first) so the cleaning steps below are applied to both at once.
df=pd.concat([tweet,test])
df.shape

### Removing urls  
*URL 제거*

In [None]:
# Sample sentence containing a URL, used to demonstrate remove_URL.
example="New competition launched :https://www.kaggle.com/c/nlp-getting-started"

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` run removed.

    A URL is matched from its prefix up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Show the effect of remove_URL on the sample sentence.
remove_URL(example)

In [None]:
# Strip URLs from every tweet.
df['text']=df['text'].apply(remove_URL)

### Removing HTML tags  

*HTML 태그 제거*

In [None]:
# Sample HTML snippet used to demonstrate remove_html.
example = """<div>
<h1>Real or Fake</h1>
<p>Kaggle </p>
<a href="https://www.kaggle.com/c/nlp-getting-started">getting started</a>
</div>"""

In [None]:
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy and ``.`` does not cross newlines, so only tags
    that open and close on the same line are stripped.
    """
    return re.sub(r'<.*?>', r'', text)
# Show the effect of remove_html on the sample snippet.
print(remove_html(example))

In [None]:
# Strip HTML tags from every tweet.
df['text']=df['text'].apply(remove_html)

### Removing Emojis  

*이모티콘 제거*

In [None]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only dedicated symbol blocks are matched. A single range running from
    U+24C2 to U+1F251 would span most of the Basic Multilingual Plane and
    therefore also delete CJK, Hangul and other ordinary text, so those two
    endpoints are handled as the narrow blocks they belong to instead.
    """
    emoji_pattern = re.compile("["
                           "\U0001F600-\U0001F64F"  # emoticons
                           "\U0001F300-\U0001F5FF"  # symbols & pictographs
                           "\U0001F680-\U0001F6FF"  # transport & map symbols
                           "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
                           "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
                           "\U0001F200-\U0001F251"  # enclosed ideographic supplement
                           "\U000024C2"             # circled latin capital letter M
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji("Omg another Earthquake 😔😔")

In [None]:
# Strip emojis from every tweet.
df['text']=df['text'].apply(remove_emoji)


### Removing punctuations  
*구두점 제거*

In [None]:
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping an ordinal to None tells str.translate to drop that character.
    deletions={ord(mark): None for mark in string.punctuation}
    return text.translate(deletions)

# Quick demonstration on a sample sentence.
example="I am a #king"
print(remove_punct(example))

In [None]:
# Strip punctuation from every tweet.
df['text']=df['text'].apply(remove_punct)

### Spelling Correction  
*맞춤법 수정*


Even if I'm not good at spelling, I can correct it with Python; I will use `pyspellchecker` to do that.  
*철자를 잘 못써도 python으로 고칠 수 있어요, 'pyspellchecker'를 사용해서 수정하겠습니다*

### `pyspellchecker`  
Pure Python Spell Checking based on Peter Norvig’s blog post on setting up a simple spell checking algorithm.  
*간단한 맞춤법 검사 알고리즘 설정에 대한 Peter Norvig의 블로그 게시물을 기반으로 하는 순수 Python 맞춤법 검사입니다.*

It uses a Levenshtein Distance algorithm to find permutations within an edit distance of 2 from the original word. It then compares all permutations (insertions, deletions, replacements, and transpositions) to known words in a word frequency list. Those words that are found more often in the frequency list are more likely the correct results.

*Levenshtein Distance 알고리즘을 사용하여 원래 단어에서 2 편집 거리 내에서 순열을 찾습니다. 그런 다음 모든 순열(삽입, 삭제, 대체 및 전치)을 단어 빈도 목록의 알려진 단어와 비교합니다. 빈도 목록에서 더 자주 발견되는 단어가 올바른 결과일 가능성이 더 큽니다.*  

*(https://pypi.org/project/pyspellchecker/)*

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return ``text`` with the words flagged by the spell checker replaced.

    ``text`` is split on whitespace and re-joined with single spaces. A word
    reported by ``spell.unknown`` is replaced by ``spell.correction(word)``;
    when the checker has no suggestion (``correction`` can return ``None`` in
    recent pyspellchecker releases) the original word is kept so that the
    final join never receives a non-string.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = [
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in words
    ]
    return " ".join(corrected_text)
        
text = "corect me plese"
correct_spellings(text)

In [None]:
# Spelling correction of the full corpus is left disabled (presumably because it is slow; TODO confirm).
# df['text']=df['text'].apply(lambda x : correct_spellings(x))

## GloVe for Vectorization  

*벡터화를 위한 GloVe*

### `GloVe` : Global Vectors for Word Representation  
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.  
*GloVe는 단어에 대한 벡터 표현을 얻기 위한 비지도 학습 알고리즘입니다. 훈련은 말뭉치에서 집계된 전역 단어 단어 동시 발생 통계에 대해 수행되며 결과 표현은 단어 벡터 공간의 흥미로운 선형 하위 구조를 보여줍니다.*  
*(https://nlp.stanford.edu/pubs/glove.pdf)*

Here we will use the GloVe pretrained corpus model to represent our words. It is available in 3 varieties: 50D, 100D and 200D (dimensional). We will try 100D here.  
*여기에서는 GloVe 사전 훈련된 말뭉치 모델을 사용하여 단어를 표현합니다. 50D, 100D 및 200D(차원)의 3가지 종류가 있습니다. 여기서는 100D를 시도하겠습니다.*

In [None]:

def create_corpus(df):
    """Tokenize every tweet in ``df['text']`` into a list of clean lower-case words.

    Each tweet is split with ``word_tokenize``; a token is kept only when it
    is purely alphabetic and its lower-cased form is not in the ``stop`` set.
    The stopword test is done on the lower-cased token so that capitalised
    stopwords such as "The" are removed as well.

    Note: this definition replaces the earlier ``create_corpus(target)``
    helper of the same name used in the exploratory cells.

    Returns a list with one list of words per tweet, in row order.
    """
    corpus=[]
    for text in tqdm(df['text']):
        words=[]
        for token in word_tokenize(text):
            lowered=token.lower()
            if token.isalpha() and lowered not in stop:
                words.append(lowered)
        corpus.append(words)
    return corpus
        
        

In [None]:
# Tokenize the cleaned train+test tweets; the result has one word list per row of df.
corpus=create_corpus(df)

In [None]:
# Load the pretrained GloVe vectors into a {word: vector} dictionary.
# Each line of the file holds a token followed by its vector components.
# The encoding is given explicitly because the vocabulary contains non-ASCII
# tokens and the platform default encoding is not guaranteed to be UTF-8.
embedding_dict={}
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt','r',encoding='utf8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
# The with-block closes the file on exit, so no explicit close() is needed.

In [None]:
# Fixed sequence length fed to the model.
MAX_LEN=50
# Build the vocabulary from the tokenized corpus and map each tweet to a list of word indices.
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

# Pad or truncate at the end ('post') so that every sequence has length MAX_LEN.
tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
# Mapping from each word to its integer index, as learned by the tokenizer.
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# Width of one embedding row; must match the loaded GloVe file (glove.6B.100d).
EMBEDDING_DIM=100
# One row per tokenizer index plus row 0, which the loop below never writes
# (presumably the padding index, since Keras' Tokenizer numbers words from 1).
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,EMBEDDING_DIM))

for word,i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words-1, so skip any index at or beyond num_words.
    if i >= num_words:
        continue

    # Words without a pretrained vector keep their all-zero row.
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec
            

## Baseline Model  
*기준 모델*

In [None]:
# Frozen embedding layer initialised with the GloVe matrix built above.
embedding=Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Embedding -> spatial dropout -> LSTM -> single sigmoid unit for binary classification.
model=Sequential([
    embedding,
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

optimzer=Adam(learning_rate=1e-5)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimzer,
    metrics=['accuracy'],
)



In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# tweet_pad follows the row order of df (train rows first, then test rows), so the
# first tweet.shape[0] rows are the training tweets and the remainder the test tweets.
# Note: `test` now holds the padded array, no longer the DataFrame read from test.csv.
train,test=tweet_pad[:tweet.shape[0]],tweet_pad[tweet.shape[0]:]

In [None]:
# Hold out 15% of the labelled tweets for validation (named X_test / y_test here).
# random_state is fixed so that the split is the same on every run.
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15,random_state=42)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)