Natural Language Processing with Disaster Tweets
https://www.kaggle.com/c/nlp-getting-started/overview

텍스트 분석
- 데이터 로드
- 데이터 전처리 (스탑워드 등)
- 토큰화 (단어를 index로)
- 패딩 (토큰화한 리스트의 길이를 동일하게)
- 임베딩
- 모델

In [5]:
# %pip install pandas

import numpy as np
import pandas as pd # 데이터 처리, CSV 파일 입출력

import os
for dirname, _, filenames in os.walk('.\\data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

.\data\sample_submission.csv
.\data\test.csv
.\data\train.csv


In [6]:
print(os.getcwd())

c:\Users\서민지\Desktop\ML\HDAT\HDAT\SMJ\3_텍스트_분류


In [7]:
import tensorflow as tf

from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [8]:
train = pd.read_csv('.\data\\train.csv')
test = pd.read_csv('.\data\\test.csv')
submission = pd.read_csv('.\data\sample_submission.csv')

In [9]:
train.shape, test.shape, submission.shape

((7613, 5), (3263, 4), (3263, 2))

In [10]:
train.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [11]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [13]:
labels = train.target
labels

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [14]:
sentences = train.text
sentences

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [15]:
# %pip install nltk
import nltk
nltk.download('stopwords')

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\서민지\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Remove Stopwords, 불용어 제거
from nltk.corpus import stopwords

sw = stopwords.words('english')
np.array(sw)

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [17]:
def deleteStopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    return " ".join(text)

In [18]:
sentences.head(10)

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
5    #RockyFire Update => California Hwy. 20 closed...
6    #flood #disaster Heavy rain causes flash flood...
7    I'm on top of the hill and I can see a fire in...
8    There's an emergency evacuation happening now ...
9    I'm afraid that the tornado is coming to our a...
Name: text, dtype: object

In [20]:
sentences = sentences.apply(deleteStopwords)
sentences.head(10)

0        deeds reason #earthquake may allah forgive us
1               forest fire near la ronge sask. canada
2    residents asked 'shelter place' notified offic...
3    13,000 people receive #wildfires evacuation or...
4    got sent photo ruby #alaska smoke #wildfires p...
5    #rockyfire update => california hwy. 20 closed...
6    #flood #disaster heavy rain causes flash flood...
7                       i'm top hill see fire woods...
8    there's emergency evacuation happening buildin...
9                    i'm afraid tornado coming area...
Name: text, dtype: object

In [24]:
# Train, Validation 나누기

# %pip install scikit-learn

from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(sentences,
                                                                labels, test_size=0.2, random_state=2020)

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.6 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 60.6/60.6 kB 799.0 kB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.6 MB 8.6 MB/s eta 0:00:02
   --- ------------------------------------ 0.8/10.6 MB 10.0 MB/s eta 0:00:01
   ----- ---------------------------------- 1.4/10.6 MB 9

In [25]:
train_sentences.shape, val_sentences.shape, train_labels.shape, val_labels.shape

((6090,), (1523,), (6090,), (1523,))

In [26]:
# 토큰화
vocab_size = 1000
token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(sentences) # sentences 데이터 기준으로 빈번한 1000개의 단어 학습

In [27]:
print(train_sentences[:5])

6477               still sunk i've actually met idol ????
2524    times desolation trouble daniel's persistent p...
439        @casper_rmg @bestcomedyvine whats cracking cuz
1824                    @olliebailey11 havnt crashed ? ??
4386    remove http://t.co/7ieiz619h0 linkury browser ...
Name: text, dtype: object


In [28]:
train_sequences = token.texts_to_sequences(train_sentences)
val_sequences = token.texts_to_sequences(val_sentences)
print(train_sequences[:5])

[[24, 373, 267, 557], [178, 398, 447, 388, 133, 3, 1, 2], [], [330], [3, 1, 2, 451, 3, 1, 2, 3, 1, 2]]


In [64]:
# 패딩

trunc_type = 'post'
padding_type = 'post'
max_length = 120
train_pad = pad_sequences(train_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)
val_pad = pad_sequences(val_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

In [65]:
# 모델

train_labels = np.asarray(train_labels)
val_labels = np.asarray(val_labels)

train_labels

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [79]:
embedding_dim = 64
input_shape = train_pad.shape

model = tf.keras.Sequential([
    Embedding(1000, embedding_dim),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

In [80]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [81]:
model.build(input_shape)    # unbuilt 이슈..

In [82]:
model.summary()

In [83]:
model.fit(train_pad, train_labels, validation_data=(val_pad, val_labels),
          epochs=3,
          verbose=2)

Epoch 1/3
191/191 - 23s - 119ms/step - accuracy: 0.7122 - loss: 0.5572 - val_accuracy: 0.7978 - val_loss: 0.4514
Epoch 2/3
191/191 - 14s - 75ms/step - accuracy: 0.8174 - loss: 0.4229 - val_accuracy: 0.8089 - val_loss: 0.4413
Epoch 3/3
191/191 - 14s - 72ms/step - accuracy: 0.8300 - loss: 0.3925 - val_accuracy: 0.8017 - val_loss: 0.4540


<keras.src.callbacks.history.History at 0x25b45724d10>

In [84]:
# 테스트 데이터 전처리 및 예측

test_sequences = test.text
test_sequences = test_sequences.apply(deleteStopwords)
test_sequences = token.texts_to_sequences(test_sequences)
test_pad = pad_sequences(test_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

In [85]:
y_pred = model.predict(test_pad)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step


In [86]:
sub_file = pd.DataFrame()
sub_file['id'] = test['id']
sub_file['target'] = y_pred.round().astype(int)
sub_file.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [87]:
sub_file.to_csv('.\data\submission_lstm.csv', index=False)

# score: 0.77995 (dropout(0.5))
# score: 0.77719 (dropout(0.4))