In [2]:
import pandas as pd

In [3]:
data = pd.read_csv(r"D:\Disaster Tweet Classification2\final_dataset_mini_balanced.csv")

In [4]:
data.head()

Unnamed: 0,text,label
0,Massive earthquake in Osaka. Power outages rep...,Earthquake
1,7.0 magnitude earthquake in Miami has triggere...,Earthquake
2,"""The Cape Town drought in South Africa has cau...",Drought
3,Aftershocks continue in Indonesia following ye...,Earthquake
4,The ground is shaking #emergency #earthquake #...,Earthquake


In [5]:
data.isnull().sum()

text     0
label    0
dtype: int64

In [6]:
data.shape

(95271, 2)

In [7]:
data['label'].value_counts()

label
Earthquake             8661
Drought                8661
Volcanic Eruption      8661
Wildfire               8661
Non-Disaster           8661
Tsunami                8661
Hurricane              8661
Cyclone                8661
Industrial Accident    8661
Typhoon                8661
Flood                  8661
Name: count, dtype: int64

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)

data['processed_text'] = data['text'].apply(preprocess_text)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['labels'] = le.fit_transform(data['label'])

X_train, X_val, y_train, y_val = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42
)


In [10]:
data.head()

Unnamed: 0,text,label,processed_text,labels
0,Massive earthquake in Osaka. Power outages rep...,Earthquake,massive earthquake osaka power outage reported...,2
1,7.0 magnitude earthquake in Miami has triggere...,Earthquake,magnitude earthquake miami triggered tsunami w...,2
2,"""The Cape Town drought in South Africa has cau...",Drought,cape town drought south africa caused water sc...,1
3,Aftershocks continue in Indonesia following ye...,Earthquake,aftershock continue indonesia following yester...,2
4,The ground is shaking #emergency #earthquake #...,Earthquake,ground shaking emergency earthquake tokyo,2


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])

model.fit(X_train, y_train)
preds = model.predict(X_val)

print("Accuracy:", accuracy_score(y_val, preds))
print(classification_report(y_val, preds, target_names=le.classes_))


Accuracy: 0.9869325636315928
                     precision    recall  f1-score   support

            Cyclone       1.00      0.99      0.99      1657
            Drought       0.99      0.99      0.99      1696
         Earthquake       0.99      0.96      0.98      1723
              Flood       0.98      0.98      0.98      1685
          Hurricane       1.00      0.99      0.99      1815
Industrial Accident       1.00      0.99      1.00      1773
       Non-Disaster       0.98      1.00      0.99      1745
            Tsunami       0.95      0.99      0.97      1749
            Typhoon       1.00      1.00      1.00      1752
  Volcanic Eruption       1.00      0.99      0.99      1738
           Wildfire       0.97      0.99      0.98      1722

           accuracy                           0.99     19055
          macro avg       0.99      0.99      0.99     19055
       weighted avg       0.99      0.99      0.99     19055



In [13]:
import joblib

joblib.dump(model, 'disaster_multiclass_model.pkl')
joblib.dump(le, 'label_encoder.pkl')




['label_encoder.pkl']