kaggle 데이터셋 : https://www.kaggle.com/c/nlp-getting-started/overview 

In [1]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('./kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./kaggle/input\nlp-getting-started\sample_submission.csv
./kaggle/input\nlp-getting-started\test.csv
./kaggle/input\nlp-getting-started\train.csv


In [2]:
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
train = pd.read_csv('./kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('./kaggle/input/nlp-getting-started/test.csv')
submission = pd.read_csv('./kaggle/input/nlp-getting-started/sample_submission.csv')

In [4]:
train.shape, test.shape, submission.shape

((7613, 5), (3263, 4), (3263, 2))

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [8]:
labels = train.target
labels

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [9]:
sentences = train.text
sentences

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [12]:
import nltk
nltk.__version__
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

#### 불용어 제거 부분 수정 필요!!

In [13]:
# 불용어 제거
from nltk.corpus import stopwords
sw = stopwords.words('english')
np.array(sw)

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [14]:
def stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)
sentences = sentences.apply(stopwords)
sentences.head(10)

0        deeds reason #earthquake may allah forgive us
1               forest fire near la ronge sask. canada
2    residents asked 'shelter place' notified offic...
3    13,000 people receive #wildfires evacuation or...
4    got sent photo ruby #alaska smoke #wildfires p...
5    #rockyfire update => california hwy. 20 closed...
6    #flood #disaster heavy rain causes flash flood...
7                       i'm top hill see fire woods...
8    there's emergency evacuation happening buildin...
9                    i'm afraid tornado coming area...
Name: text, dtype: object

### Train과 Val 나누기

In [16]:
from sklearn.model_selection import train_test_split
train_sentences, valid_sentences, train_labels, valid_labels = train_test_split(sentences, labels, test_size=0.2, random_state=2020)

In [17]:
train_sentences.shape, valid_sentences.shape, train_labels.shape, valid_labels.shape

((6090,), (1523,), (6090,), (1523,))

### 토큰화

In [18]:
vocab_size = 1000
token = Tokenizer(num_words = vocab_size)
token.fit_on_texts(sentences)

In [19]:
print(train_sentences[:5])
train_sequences = token.texts_to_sequences(train_sentences)
valid_sequences = token.texts_to_sequences(valid_sentences)
print(train_sentences[:5])

6477               still sunk i've actually met idol ????
2524    times desolation trouble daniel's persistent p...
439        @casper_rmg @bestcomedyvine whats cracking cuz
1824                    @olliebailey11 havnt crashed ? ??
4386    remove http://t.co/7ieiz619h0 linkury browser ...
Name: text, dtype: object
6477               still sunk i've actually met idol ????
2524    times desolation trouble daniel's persistent p...
439        @casper_rmg @bestcomedyvine whats cracking cuz
1824                    @olliebailey11 havnt crashed ? ??
4386    remove http://t.co/7ieiz619h0 linkury browser ...
Name: text, dtype: object


### 패딩

In [20]:
trunc_type = 'post'
padding_type = 'post'
max_length = 120
train_pad = pad_sequences(train_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)
valid_pad = pad_sequences(valid_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

### 모델

In [21]:
train_labels = np.asarray(train_labels)
valid_labels = np.asarray(valid_labels)

In [22]:
embedding_dim = 64
model = tf.keras.Sequential([
    Embedding(vocab_size, embedding_dim),
    Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    Bidirectional(tf.keras.layers.LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [23]:
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
model.fit(train_pad, train_labels, validation_data=(valid_pad, valid_labels), epochs=3, verbose=2)

Epoch 1/3
191/191 - 14s - 74ms/step - accuracy: 0.7143 - loss: 0.5547 - val_accuracy: 0.8011 - val_loss: 0.4613
Epoch 2/3
191/191 - 9s - 45ms/step - accuracy: 0.8154 - loss: 0.4252 - val_accuracy: 0.8017 - val_loss: 0.4444
Epoch 3/3
191/191 - 9s - 45ms/step - accuracy: 0.8333 - loss: 0.3959 - val_accuracy: 0.8024 - val_loss: 0.4501


<keras.src.callbacks.history.History at 0x22b2837bf40>

### 데이터 전처리 및 예측

In [25]:
test_sequences = test.text
test_sequences = test_sequences.apply(stopwords)
test_sequences = token.texts_to_sequences(test_sequences)
test_pad = pad_sequences(test_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

In [26]:
y_pred=model.predict(test_pad)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step


In [27]:
sub_file=pd.DataFrame()
sub_file['id']=test['id']
sub_file['target']=y_pred.round().astype(int)
sub_file.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [28]:
sub_file.to_csv('submission_lstm.csv', index=False)