In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
train_df = pd.read_csv('train.csv',delimiter=',')
test_df = pd.read_csv('test.csv',delimiter=',')

In [10]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [12]:
train_df['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [13]:
train_df['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [14]:
train_df['location'].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

Column 'location' will not be used in this model because there are lots of noise in this dataset.

Preprocessing
lower case and tokenization-> removing stop words -> stemming/lemmatization

Feature extraction


# Preprocessing text column

In [15]:
# import libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack     # libraries for combining features
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

In [16]:
nltk.download('punkt')  # Download the punkt tokenizer

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sky\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized)

In [18]:
# Preprocess the text data
train_df['text'] = train_df['text'].apply(lemmatize_text)
test_df['text'] = test_df['text'].apply(lemmatize_text)

In [19]:
y = train_df['target']

In [20]:
text = train_df['text']

In [21]:
count_vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2))
text_feat = count_vectorizer.fit_transform(text)

In [22]:
# Preprocess for keyword column
keyword_encoder = LabelEncoder()
keyword_feat = keyword_encoder.fit_transform(train_df['keyword'])
keyword_feat_test = keyword_encoder.transform(test_df['keyword'])
combined_feat = hstack([keyword_feat.reshape(-1,1),text_feat])


In [23]:
# Split data for training and testing
x_train,x_test, y_train, y_test = train_test_split(combined_feat,y,train_size=0.8,random_state= 42)

# Naive Bayes model

In [24]:
# Define model
naive_bayes = MultinomialNB()

# Set parameters
parameters = {
    'alpha' : [0.01,0.05,0.1,0.5,1,5,10]
}

# CV for finding the best parameters
clf_nb = GridSearchCV(naive_bayes,param_grid=parameters, cv=5,verbose=4,n_jobs=-1)

In [25]:
clf_nb.fit(x_train,y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [26]:
clf_nb.best_estimator_

In [27]:
cv_result = clf_nb.cv_results_
pd.DataFrame(cv_result)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023641,0.003446,0.007082,0.001466,0.01,{'alpha': 0.01},0.752053,0.769294,0.764368,0.746305,0.746305,0.755665,0.009484,5
1,0.031872,0.008348,0.013291,0.010559,0.05,{'alpha': 0.05},0.76601,0.775862,0.780788,0.760263,0.7578,0.768144,0.008867,3
2,0.026685,0.006871,0.01011,0.007657,0.1,{'alpha': 0.1},0.772578,0.775862,0.785714,0.76601,0.762726,0.772578,0.008044,2
3,0.027164,0.007247,0.01416,0.010897,0.5,{'alpha': 0.5},0.776683,0.775041,0.797209,0.781609,0.77422,0.780952,0.008523,1
4,0.03098,0.017898,0.004824,0.000868,1.0,{'alpha': 1},0.756979,0.764368,0.781609,0.755337,0.764368,0.764532,0.009309,4
5,0.024603,0.003194,0.011016,0.012059,5.0,{'alpha': 5},0.621511,0.635468,0.655993,0.642036,0.650246,0.641051,0.012015,6
6,0.019664,0.002453,0.007478,0.005711,10.0,{'alpha': 10},0.587028,0.602627,0.606732,0.605911,0.604269,0.601314,0.007281,7


In [28]:
# Prediction and evaluation
y_pred_nb = clf_nb.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Accuracy: {accuracy:.2f}')
print(f'F1-score: {f1_score(y_test, y_pred_nb)}')
print(classification_report(y_test, y_pred_nb))


Accuracy: 0.78
F1-score: 0.7307692307692308
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       874
           1       0.76      0.70      0.73       649

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



# Logistic Regression

In [29]:
# Define model
log = LogisticRegression()

# Set parameters
parameters ={
    'C': [0.01,0.1,1,10],
    'max_iter' : [1000,2000]
}

# CV for finding the best parameters
clf_log = GridSearchCV(log,param_grid=parameters, cv=5,verbose=4,n_jobs=-1)
clf_log.fit(x_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [30]:
clf_log.best_estimator_

In [31]:
cv_result = clf_log.cv_results_
pd.DataFrame(cv_result)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.341176,1.146493,0.004868,0.000662,0.01,1000,"{'C': 0.01, 'max_iter': 1000}",0.730706,0.736453,0.712644,0.694581,0.738095,0.722496,0.016624,7
1,5.351005,0.915548,0.00597,0.001765,0.01,2000,"{'C': 0.01, 'max_iter': 2000}",0.730706,0.736453,0.712644,0.694581,0.738095,0.722496,0.016624,7
2,16.049495,1.952558,0.005457,0.000448,0.1,1000,"{'C': 0.1, 'max_iter': 1000}",0.770115,0.800493,0.789819,0.760263,0.792282,0.782594,0.014972,5
3,15.932435,2.212019,0.005482,0.000866,0.1,2000,"{'C': 0.1, 'max_iter': 2000}",0.770115,0.800493,0.789819,0.760263,0.792282,0.782594,0.014972,5
4,45.075927,3.444331,0.006201,0.001596,1.0,1000,"{'C': 1, 'max_iter': 1000}",0.78243,0.803777,0.80624,0.764368,0.797209,0.790805,0.015598,1
5,45.852687,3.044786,0.005218,0.001014,1.0,2000,"{'C': 1, 'max_iter': 2000}",0.78243,0.803777,0.80624,0.764368,0.797209,0.790805,0.015598,1
6,80.723546,0.70093,0.00622,0.000948,10.0,1000,"{'C': 10, 'max_iter': 1000}",0.779146,0.798851,0.805419,0.761084,0.79803,0.788506,0.016269,3
7,64.295076,18.760413,0.003059,0.001147,10.0,2000,"{'C': 10, 'max_iter': 2000}",0.777504,0.798851,0.804598,0.760263,0.796388,0.787521,0.016383,4


In [32]:
# Prediction and evaluation
y_pred_log = clf_log.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_log)
print(f'Accuracy: {accuracy:.2f}')
print(f'F1-score: {f1_score(y_test, y_pred_log)}')
print(classification_report(y_test, y_pred_log))

Accuracy: 0.80
F1-score: 0.746166950596252
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       874
           1       0.83      0.67      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



# SVM

In [33]:
# Define model
svm = SVC(max_iter=-1)

# Set parameters
parameters ={
    'C': [0.01,0.1,1,10],
    'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']
}

# CV for finding the best parameters
clf_svm = GridSearchCV(svm ,param_grid=parameters, cv=5,verbose=4,n_jobs=-1)
clf_svm.fit(x_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [34]:
clf_svm.best_estimator_

In [35]:
clf_svm.best_score_

0.7870279146141215

In [36]:
cv_result = clf_svm.cv_results_
pd.DataFrame(cv_result)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,20.511973,1.138756,1.886511,0.096978,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.727422,0.754516,0.728243,0.713465,0.756158,0.735961,0.016678,4
1,8.214587,0.297017,1.928634,0.168785,0.01,poly,"{'C': 0.01, 'kernel': 'poly'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7
2,7.930125,0.059887,2.048088,0.061095,0.01,rbf,"{'C': 0.01, 'kernel': 'rbf'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7
3,8.46116,0.103859,2.072519,0.063263,0.01,sigmoid,"{'C': 0.01, 'kernel': 'sigmoid'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7
4,130.925607,4.813696,1.591546,0.009363,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.777504,0.800493,0.805419,0.758621,0.793103,0.787028,0.017053,1
5,8.67672,0.338617,1.859835,0.040478,0.1,poly,"{'C': 0.1, 'kernel': 'poly'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7
6,7.914232,0.264994,2.266772,0.182395,0.1,rbf,"{'C': 0.1, 'kernel': 'rbf'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7
7,5.082756,0.267928,1.121313,0.025356,0.1,sigmoid,"{'C': 0.1, 'kernel': 'sigmoid'}",0.537767,0.522167,0.513136,0.500821,0.531199,0.521018,0.013081,14
8,157.224755,6.541056,1.570542,0.039104,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.772578,0.794745,0.793103,0.749589,0.787356,0.779475,0.016863,2
9,9.744256,0.22597,1.774415,0.021441,1.0,poly,"{'C': 1, 'kernel': 'poly'}",0.568966,0.568966,0.569787,0.569787,0.569787,0.569458,0.000402,7


In [37]:
# Prediction and evaluation
y_pred_svm = clf_svm.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy: {accuracy:.2f}')
print(f'F1-score: {f1_score(y_test, y_pred_svm)}')
print(classification_report(y_test, y_pred_svm))

Accuracy: 0.80
F1-score: 0.7378472222222223
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       874
           1       0.84      0.65      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.81      0.80      0.80      1523



# Deep learning

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [39]:
# Tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text)
X = tokenizer.texts_to_sequences(text)
X = pad_sequences(X, maxlen=10)

# Convert labels to numpy array
y = np.array(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 18ms/step - accuracy: 0.6500 - loss: 0.6052 - val_accuracy: 0.7787 - val_loss: 0.4822
Epoch 2/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.8379 - loss: 0.3778 - val_accuracy: 0.7590 - val_loss: 0.5056
Epoch 3/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8869 - loss: 0.2846 - val_accuracy: 0.7590 - val_loss: 0.5378
Epoch 4/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9156 - loss: 0.2205 - val_accuracy: 0.7571 - val_loss: 0.6478
Epoch 5/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9377 - loss: 0.1654 - val_accuracy: 0.7406 - val_loss: 0.7487
Epoch 6/20
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9535 - loss: 0.1272 - val_accuracy: 0.7374 - val_loss: 0.9318
Epoch 7/20
[1m191/191

<keras.src.callbacks.history.History at 0x1a24e3e77d0>

In [40]:
y_pred_dl = model.predict(X_test)
y_pred_dl_binary = (y_pred_dl > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred_dl_binary)
print(f'Accuracy: {accuracy:.2f}')
print(f'F1-score: {f1_score(y_test, y_pred_dl_binary)}')
print(classification_report(y_test, y_pred_dl_binary))

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Accuracy: 0.70
F1-score: 0.6581709145427287
              precision    recall  f1-score   support

           0       0.75      0.72      0.73       874
           1       0.64      0.68      0.66       649

    accuracy                           0.70      1523
   macro avg       0.70      0.70      0.70      1523
weighted avg       0.70      0.70      0.70      1523



# Test

Based on performance evaluation of Naive Bayes, Logistic Regression, and a Deep Learning model, Logistic Regression emerged as the superior choice. This section focuses on utilizing the previously trained Logistic Regression model to generate predictions on the held-out test dataset.

In [46]:
text_test = test_df['text'].apply(lemmatize_text)
text_test_feat = count_vectorizer.transform(text_test)
combined_feat_test = hstack([keyword_feat_test.reshape(-1,1),text_test_feat])
y_pred_final = clf_svm.predict(combined_feat_test)

In [47]:
y_pred_final

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [48]:
test_df['target'] = y_pred_final

In [49]:
sol_df = test_df[['id','target']]
sol_df.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [50]:
sol_df.to_csv('prediction.csv')

After submit prediction to Kaggle on 1-Aug-24, the score of this notebook is 0.80079, ranking 401. I used only text for training a model.

The second submission was on 2-Aug-24, I've included keyword column into my feature. The score has a littile increase to 0.80232, ranking 374.

Then, performing lemmatization to the text column before train model. The result minor drops to 0.80194.