In [12]:
!pip install wordcloud




In [16]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib


In [20]:
# Read the tweet dataset from the CSV sitting next to the notebook.
df = pd.read_csv("twitter_disaster (1).csv")

# Quick sanity check of what was loaded: first rows, (rows, columns), column labels.
for overview in (df.head(), df.shape, df.columns):
    print(overview)


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
(7613, 5)
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


In [21]:
def clean_text(text) -> str:
    """Normalise a raw tweet into lowercase, space-separated alphanumeric tokens.

    Processing order: remove URLs, remove @mentions, strip the '#' sign (the tag
    word itself is kept), replace every remaining character that is not an ASCII
    letter, digit or whitespace with a space, lowercase, then collapse runs of
    whitespace and trim both ends.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Without this guard, str(None) / str(nan) would turn missing values into
    # the literal tokens "none" / "nan" and feed them to the vectorizer.
    # `text != text` is true only for NaN, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # links
    text = re.sub(r'@\w+', '', text)                   # user mentions
    text = re.sub(r'#', '', text)                      # keep hashtag word, drop the sign
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)        # punctuation, emoji, etc. -> space
    text = text.lower()
    # Earlier substitutions leave gaps; squeeze them to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [22]:
# Run every raw tweet through clean_text and keep the result alongside the original text.
df['clean_text'] = df['text'].map(clean_text)


In [23]:
# Features are the cleaned tweets; the label is the `target` column.
X, y = df['clean_text'], df['target']

# Hold out 20% for evaluation. Stratifying on y keeps the class ratio the same
# in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [24]:
# Two-stage text classifier: TF-IDF features followed by logistic regression.
# The vectorizer drops English stop words and any term that appears in more
# than 90% of the training documents.
model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(max_df=0.9, stop_words="english")),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training partition only; X_test / y_test stay untouched for evaluation.
model.fit(X_train, y_train)


In [25]:
# Predict labels for the held-out tweets.
y_pred = model.predict(X_test)

# Compute each metric once, pair it with its heading, then print them in order.
evaluation = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("\nClassification Report:\n", classification_report(y_test, y_pred)),
    ("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred)),
)
for heading, result in evaluation:
    print(heading, result)


Accuracy: 0.824688115561392

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.92      0.86       869
           1       0.86      0.70      0.78       654

    accuracy                           0.82      1523
   macro avg       0.83      0.81      0.82      1523
weighted avg       0.83      0.82      0.82      1523


Confusion Matrix:
 [[796  73]
 [194 460]]


In [26]:
# Single source of truth for the output file, so the confirmation message
# can never disagree with the path that was actually written.
MODEL_PATH = "disaster_model.pkl"

# Persist the whole fitted pipeline (vectorizer + classifier) in one file.
joblib.dump(model, MODEL_PATH)
print(f"✅ Model saved as {MODEL_PATH}")


✅ Model saved as disaster_model.pkl


In [27]:
# Two hand-written tweets used as a quick smoke test of the fitted pipeline.
test_tweets = [
    "There is a fire in my area please help",
    "I love this song so much"
]

# Each tweet gets the same cleaning as the training data before prediction;
# predict() expects an iterable of documents, hence the one-element list.
for tweet in test_tweets:
    prediction = model.predict([clean_text(tweet)])
    print(tweet, "->", prediction)


There is a fire in my area please help -> [1]
I love this song so much -> [0]
