In [3]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('nlp-getting-started/train.csv')

In [6]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [9]:
df.target.nunique()

2

In [10]:
df.location.fillna('Unk',inplace=True)

In [11]:
df.keyword.fillna('Unk',inplace=True)

In [12]:
df.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [13]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,Unk,Unk,Our Deeds are the Reason of this #earthquake M...,1
1,4,Unk,Unk,Forest fire near La Ronge Sask. Canada,1
2,5,Unk,Unk,All residents asked to 'shelter in place' are ...,1
3,6,Unk,Unk,"13,000 people receive #wildfires evacuation or...",1
4,7,Unk,Unk,Just got sent this photo from Ruby #Alaska as ...,1


In [14]:
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer

In [15]:
vocab_size = 5000

In [16]:
import re

In [17]:
tweets = df.text

In [19]:
stemmer = PorterStemmer()

In [44]:
corpus = []
for sent in tweets:
    review = re.sub('[^a-zA-Z0-9]',' ',sent)
    review = review.lower().split()

    review = [stemmer.stem(word) for word in review if not word in stopwords.words('english')]
    corpus.append(' '.join(review))

In [45]:
corpus[:5]

['deed reason earthquak may allah forgiv us',
 'forest fire near la rong sask canada',
 'resid ask shelter place notifi offic evacu shelter place order expect',
 '13 000 peopl receiv wildfir evacu order california',
 'got sent photo rubi alaska smoke wildfir pour school']

In [46]:
tokenizer = Tokenizer(num_words=5000,oov_token='<OOV>')

In [47]:
tokenizer.fit_on_texts(corpus)

In [48]:
sequences = tokenizer.texts_to_sequences(corpus)

In [49]:
print(sequences[:5])

[[3849, 498, 230, 100, 1425, 3066, 45], [153, 5, 202, 593, 1, 1, 1089], [1426, 552, 1727, 426, 1, 231, 40, 1727, 426, 335, 464], [697, 2571, 13, 2572, 114, 40, 335, 49], [63, 1090, 150, 3850, 1557, 227, 114, 2573, 139]]


In [50]:
corpus[:20]

['deed reason earthquak may allah forgiv us',
 'forest fire near la rong sask canada',
 'resid ask shelter place notifi offic evacu shelter place order expect',
 '13 000 peopl receiv wildfir evacu order california',
 'got sent photo rubi alaska smoke wildfir pour school',
 'rockyfir updat california hwi 20 close direct due lake counti fire cafir wildfir',
 'flood disast heavi rain caus flash flood street manit colorado spring area',
 'top hill see fire wood',
 'emerg evacu happen build across street',
 'afraid tornado come area',
 'three peopl die heat wave far',
 'haha south tampa get flood hah wait second live south tampa gonna gonna fvck flood',
 'rain flood florida tampabay tampa 18 19 day lost count',
 'flood bago myanmar arriv bago',
 'damag school bu 80 multi car crash break',
 'man',
 'love fruit',
 'summer love',
 'car fast',
 'goooooooaaaaaal']

In [51]:
padded = pad_sequences(sequences,maxlen=30,padding='pre')

In [56]:
padded[3]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        697, 2571,   13, 2572,  114,   40,  335,   49])

In [57]:
corpus[3]

'13 000 peopl receiv wildfir evacu order california'

In [59]:
padded[5]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 2574,  204,   49, 1336,  616,
        344,  837,  499,  934,  368,    5, 3851,  114])

In [58]:
corpus[5]

'rockyfir updat california hwi 20 close direct due lake counti fire cafir wildfir'

In [60]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,Unk,Unk,Our Deeds are the Reason of this #earthquake M...,1
1,4,Unk,Unk,Forest fire near La Ronge Sask. Canada,1
2,5,Unk,Unk,All residents asked to 'shelter in place' are ...,1
3,6,Unk,Unk,"13,000 people receive #wildfires evacuation or...",1
4,7,Unk,Unk,Just got sent this photo from Ruby #Alaska as ...,1


In [66]:
from sklearn.preprocessing import LabelEncoder

In [67]:
encoder = LabelEncoder()

In [69]:
df['keyword_encoded'] = encoder.fit_transform(df.keyword)

In [74]:
df.keyword_encoded.nunique()

222

In [78]:
df['location_encoded'] = encoder.fit_transform(df.location)

In [79]:
df.location_encoded.nunique()

3342

In [82]:
df[['keyword_encoded','target']].corr()

Unnamed: 0,keyword_encoded,target
keyword_encoded,1.0,0.047721
target,0.047721,1.0


In [83]:
df[['location_encoded','target']].corr()

Unnamed: 0,location_encoded,target
location_encoded,1.0,-0.000958
target,-0.000958,1.0


In [86]:
df.drop(columns=['location','location_encoded'],inplace=True)

In [87]:
df.head()

Unnamed: 0,id,keyword,text,target,keyword_encoded
0,1,Unk,Our Deeds are the Reason of this #earthquake M...,1,0
1,4,Unk,Forest fire near La Ronge Sask. Canada,1,0
2,5,Unk,All residents asked to 'shelter in place' are ...,1,0
3,6,Unk,"13,000 people receive #wildfires evacuation or...",1,0
4,7,Unk,Just got sent this photo from Ruby #Alaska as ...,1,0


In [120]:
X = padded

In [90]:
y = df['target']

In [91]:
X.shape,y.shape

((7613, 30), (7613,))

In [93]:
corpus[1]

'forest fire near la rong sask canada'

In [95]:
X[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,  153,    5,  202,  593,    1,    1, 1089])

In [96]:
y

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [121]:
type(X),type(y)

(numpy.ndarray, numpy.ndarray)

In [122]:
y = np.array(y).flatten()

In [123]:
type(X),type(y)

(numpy.ndarray, numpy.ndarray)

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,classification_report

In [124]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout,Dense,LSTM,Embedding
from tensorflow.keras.callbacks import EarlyStopping

In [103]:
feature_size = 100

In [146]:
model = Sequential()
model.add(Embedding(vocab_size,feature_size,input_length=30))
model.add(LSTM(200))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))



In [147]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

In [148]:
early_stop = EarlyStopping(monitor='val_loss',patience=2,restore_best_weights=True)

In [149]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=32,callbacks=[early_stop])

Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 32ms/step - accuracy: 0.6615 - loss: 0.6077 - val_accuracy: 0.8024 - val_loss: 0.4435
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.8570 - loss: 0.3352 - val_accuracy: 0.8030 - val_loss: 0.4629
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step - accuracy: 0.8992 - loss: 0.2420 - val_accuracy: 0.7689 - val_loss: 0.5276


<keras.src.callbacks.history.History at 0x1dfa39b2a50>

In [150]:
preds = model.predict(X_test)

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step


In [151]:
preds_labels = np.where(preds >= 0.5, 1,0)

In [152]:
accuracy_score(y_test,preds_labels)

0.8023637557452397

In [153]:
confusion_matrix(y_test,preds_labels)

array([[776,  98],
       [203, 446]], dtype=int64)

In [154]:
f1_score(y_test,preds_labels)

0.7476948868398994

In [161]:
test_df = pd.read_csv('nlp-getting-started/test.csv')

In [162]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [164]:
test_df.drop(['keyword','location'],axis=1,inplace=True)

In [165]:
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [166]:
corpus_test = []
for sent in test_df['text']:
    review = re.sub('[^a-zA-Z0-9]', ' ', sent)
    review = review.lower().split()
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    corpus_test.append(' '.join(review))

In [167]:
sequences_test = tokenizer.texts_to_sequences(corpus_test)
X_test_final = pad_sequences(sequences_test, maxlen=30, padding='pre')

In [171]:
preds = model.predict(X_test_final)
preds_labels = np.where(preds >=0.5 ,1,0)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step


In [172]:
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': preds_labels.flatten()
})

In [173]:
submission.to_csv('submission.csv',index=False)