In [None]:
import pandas as pd
import zipfile
# Unpack the competition archive into a local working directory.
# The context manager closes the archive once extraction finishes.
with zipfile.ZipFile("nlp-getting-started.zip", mode='r') as archive:
    archive.extractall(path="disaster_tweets")


In [None]:
# Load the labelled training split and the unlabelled test split
# from the directory the archive was extracted into.
train_df, test_df = map(
    pd.read_csv,
    ("disaster_tweets/train.csv", "disaster_tweets/test.csv"),
)

In [None]:
# Report (rows, columns) for both splits, then preview the training frame.
for label, frame in (("Training data shape:", train_df), ("Test data shape:", test_df)):
    print(label, frame.shape)
train_df.head()

Training data shape: (7613, 5)
Test data shape: (3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus (needed before `stopwords.words` can be read)
# and keep the English list as a set for fast membership checks in clean_text.
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every clean_text() call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, stopword_set=None):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase, strip URLs, strip ASCII punctuation, strip
    digits, collapse whitespace, drop stop words.

    Parameters
    ----------
    text : str or None or float('nan')
        Raw text. Missing values (``None`` / NaN, as pandas produces for
        empty cells) yield an empty string instead of raising.
    stopword_set : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives cleaning.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Lowercase
    text = str(text).lower()
    # Remove URLs (`http\S+` already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # split() with no argument discards all runs of whitespace, so rejoining
    # with single spaces both normalises spacing and removes stop words.
    # NOTE: punctuation is stripped first, so contractions such as "don't"
    # become "dont" and will not match entries that contain an apostrophe.
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [None]:
# Add a `clean_text` column to both splits by running every raw tweet
# through the same cleaning function.
for frame in (train_df, test_df):
    frame['clean_text'] = frame['text'].apply(clean_text)


In [None]:
# Preview the first rows to eyeball the new `clean_text` column.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders cal...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfires pou...


In [None]:
# Keep only the columns used for modelling: the cleaned text (both splits)
# and the label (training split only).
# NOTE(review): this rebinds train_df/test_df, so the raw `text` and `id`
# columns are no longer reachable through these names after this cell runs.
train_df = train_df.loc[:, ['clean_text', 'target']]
test_df = test_df.loc[:, ['clean_text']]
train_df.head()

Unnamed: 0,clean_text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, capped at the 5000 highest-ranked terms.
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)

# Labels for the training split.
y_train = train_df['target']

# Learn vocabulary/IDF weights on the training text, then reuse the fitted
# vectorizer on the test text.
# NOTE(review): the vectorizer is fitted on every training row, and the
# hold-out split is carved out of X_train afterwards, so validation rows
# contribute to the vocabulary/IDF statistics - confirm this is acceptable.
X_train = tfidf.fit_transform(train_df['clean_text'])
X_test = tfidf.transform(test_df['clean_text'])

print("Training data shape:", X_train.shape)

Training data shape: (7613, 5000)


In [None]:
#Random_Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the TF-IDF rows for validation (fixed seed for repeatability).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# 200 unrestricted-depth trees, trained on all available cores.
# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
).fit(X_tr, y_tr)

# Score the forest on the held-out rows.
y_pred_rf = rf.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_report = classification_report(y_val, y_pred_rf)

print(" Random Forest Results:")
print("Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_report)

 Random Forest Results:
Accuracy: 0.7708470124753776

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81       874
           1       0.77      0.66      0.71       649

    accuracy                           0.77      1523
   macro avg       0.77      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



In [None]:
#XGBoost
from xgboost import XGBClassifier
# Gradient-boosted trees on the same TF-IDF train/validation split used above.
# `use_label_encoder` was dropped from the argument list: current XGBoost no
# longer recognises it and only emits a "Parameters ... are not used" warning.
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,          # row subsampling per tree
    colsample_bytree=0.8,   # feature subsampling per tree
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)
xgb.fit(X_tr, y_tr)

# Score the booster on the held-out rows.
y_pred_xgb = xgb.predict(X_val)

print("XGBoost Results:")
print("Accuracy:", accuracy_score(y_val, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_val, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
Accuracy: 0.7728168089297439

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.88      0.82       874
           1       0.80      0.63      0.70       649

    accuracy                           0.77      1523
   macro avg       0.78      0.75      0.76      1523
weighted avg       0.78      0.77      0.77      1523



In [None]:
#word2vec
import pandas as pd
import numpy as np
!pip install gensim
!pip install tensorflow
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Hyper-parameters shared by the Word2Vec and LSTM stages. Defining them once
# keeps the word-vector size and the Embedding layer width in agreement.
embedding_dim = 100   # length of each word vector
max_len = 50          # every tweet is padded/truncated to this many tokens

# Tokenised corpus: one list of tokens per cleaned tweet.
texts = train_df['clean_text'].apply(lambda x: x.split())

# Supplying `sentences` makes the constructor build the vocabulary and train
# immediately, so the epoch count is set here rather than through a second
# train() call (which would train the vectors a second time).
w2v_model = Word2Vec(
    sentences=texts,
    vector_size=embedding_dim,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
)

# Map words to integer ids; id 0 is left free for padding, hence the +1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['clean_text'])
vocab_size = len(tokenizer.word_index) + 1

# Integer-encode and pad every tweet to a fixed length.
X = tokenizer.texts_to_sequences(train_df['clean_text'])
X = pad_sequences(X, maxlen=max_len)
y = train_df['target'].values

# Row i holds the Word2Vec vector for the word with tokenizer id i.
# Words missing from the Word2Vec vocabulary (e.g. below min_count) stay zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Embedding (initialised from Word2Vec, fine-tuned) -> LSTM -> dense head.
# The deprecated `input_length` argument is omitted; the sequence length is
# taken from the data passed to fit().
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=True))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split=0.2 holds out the LAST 20% of rows (taken before shuffling).
# Keeping the History object avoids printing its repr as the cell output.
history = model.fit(X, y, epochs=10, batch_size=64, validation_split=0.2, verbose=1)



Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0




Epoch 1/10




[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 170ms/step - accuracy: 0.6379 - loss: 0.6246 - val_accuracy: 0.7577 - val_loss: 0.4932
Epoch 2/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 157ms/step - accuracy: 0.7983 - loss: 0.4538 - val_accuracy: 0.7787 - val_loss: 0.4680
Epoch 3/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 157ms/step - accuracy: 0.8794 - loss: 0.3024 - val_accuracy: 0.7905 - val_loss: 0.4530
Epoch 4/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 158ms/step - accuracy: 0.9398 - loss: 0.1788 - val_accuracy: 0.7597 - val_loss: 0.5295
Epoch 5/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 172ms/step - accuracy: 0.9610 - loss: 0.1129 - val_accuracy: 0.7807 - val_loss: 0.5490
Epoch 6/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 159ms/step - accuracy: 0.9768 - loss: 0.0682 - val_accuracy: 0.7472 - val_loss: 0.5921
Epoch 7/10
[1m96/96[0m [32m━━━

<keras.src.callbacks.history.History at 0x7ec4ee598380>

In [None]:
from sklearn.model_selection import train_test_split

# Continue training the LSTM with an explicit hold-out set.
#
# The earlier `model.fit(X, y, validation_split=0.2)` call trained on the first
# 80% of rows and validated on the last 20%. A fresh random split here would
# place rows the model has already trained on into the validation set and
# inflate val_accuracy, so the same positional boundary is reused instead.
#
# Dedicated names are used so the TF-IDF `X_train` / `X_test` / `y_train`
# variables from the classical-ML cells are not overwritten.
val_fraction = 0.2
split_at = int(len(X) * (1.0 - val_fraction))

X_seq_train, X_seq_val = X[:split_at], X[split_at:]
y_seq_train, y_seq_val = y[:split_at], y[split_at:]

history_holdout = model.fit(
    X_seq_train,
    y_seq_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_seq_val, y_seq_val),
)


Epoch 1/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 160ms/step - accuracy: 0.9874 - loss: 0.0301 - val_accuracy: 0.9173 - val_loss: 0.2912
Epoch 2/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 178ms/step - accuracy: 0.9868 - loss: 0.0315 - val_accuracy: 0.9166 - val_loss: 0.3176
Epoch 3/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 159ms/step - accuracy: 0.9876 - loss: 0.0249 - val_accuracy: 0.9212 - val_loss: 0.3364
Epoch 4/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 173ms/step - accuracy: 0.9852 - loss: 0.0299 - val_accuracy: 0.9166 - val_loss: 0.3330
Epoch 5/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 186ms/step - accuracy: 0.9887 - loss: 0.0237 - val_accuracy: 0.9140 - val_loss: 0.3056
Epoch 6/10
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 355ms/step - accuracy: 0.9870 - loss: 0.0255 - val_accuracy: 0.9094 - val_loss: 0.3548
Epoch 7/10
[1m96/96[

<keras.src.callbacks.history.History at 0x7ec4e69b39e0>

In [None]:
import pickle

# Persist every fitted artefact so it can be reloaded for inference.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes.
# SECURITY: only unpickle these files from a trusted location; loading a
# pickle executes arbitrary code.
pickled_artifacts = {
    'random_forest.pkl': rf,      # Random Forest classifier
    'xgboost.pkl': xgb,           # XGBoost classifier
    'tokenizer.pkl': tokenizer,   # Keras tokenizer used by the LSTM
    'tfidf.pkl': tfidf,           # TF-IDF vectorizer used by the tree models
}

for artifact_path, artifact in pickled_artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

# Save the LSTM model (HDF5 file, as selected by the .h5 extension).
model.save('lstm_model.h5')


