In [1]:
import pandas as pd
import numpy as np

# Hand-written sample corpus: ten tweets describing disasters ...
disaster_tweets_data = [
    "There has been a massive earthquake in the city!",
    "The hurricane caused widespread destruction in the coastal areas.",
    "A tornado touched down in the town, causing significant damage.",
    "Breaking news: explosion reported in downtown area.",
    "Floods have submerged several neighborhoods after heavy rainfall.",
    "Firefighters are battling a large wildfire on the outskirts of the city.",
    "A major accident on the highway has caused traffic chaos.",
    "The building collapsed, trapping people inside.",
    "Emergency services are responding to a chemical spill.",
    "A train derailment has resulted in multiple casualties."
]

# ... and ten everyday, non-disaster tweets.
normal_tweets_data = [
    "Beautiful weather today! Perfect for a picnic in the park.",
    "Just finished a great workout at the gym.",
    "Excited to watch the new movie that just came out!",
    "Had a delicious dinner with friends last night.",
    "Enjoying a relaxing day at home with a good book.",
    "Looking forward to the weekend!",
    "Spent the day exploring the city.",
    "Feeling grateful for all the blessings in my life.",
    "Ready for a fresh start!",
    "Listening to my favorite music and feeling good."
]

# Wrap each list in a single-column ("text") DataFrame.
disaster_tweets = pd.DataFrame(disaster_tweets_data, columns=["text"])
normal_tweets = pd.DataFrame(normal_tweets_data, columns=["text"])

# Preview the first five rows of both frames, each under its own heading.
print(
    "Disaster Tweets:",
    disaster_tweets.head(),
    "\nNormal Tweets:",
    normal_tweets.head(),
    sep="\n",
)


Disaster Tweets:
                                                text
0   There has been a massive earthquake in the city!
1  The hurricane caused widespread destruction in...
2  A tornado touched down in the town, causing si...
3  Breaking news: explosion reported in downtown ...
4  Floods have submerged several neighborhoods af...

Normal Tweets:
                                                text
0  Beautiful weather today! Perfect for a picnic ...
1          Just finished a great workout at the gym.
2  Excited to watch the new movie that just came ...
3    Had a delicious dinner with friends last night.
4  Enjoying a relaxing day at home with a good book.


In [2]:
from nltk import word_tokenize
from nltk import WordNetLemmatizer
from collections import Counter
from nltk import ngrams

# Lemmatize words
# A single WordNetLemmatizer instance, shared by lemmatize_text for every token it processes.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text``, lemmatize each token, and re-join with single spaces.

    Tokenization uses ``word_tokenize`` and lemmatization uses the
    module-level ``lemmatizer`` with its default arguments.

    Returns:
        str: the lemmatized tokens joined by one space each.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Store the lemmatized form of every tweet next to its raw text.
disaster_tweets['lemmatized_text'] = disaster_tweets['text'].apply(lemmatize_text)
normal_tweets['lemmatized_text'] = normal_tweets['text'].apply(lemmatize_text)

# Tokenize each lemmatized tweet exactly once and keep the tokens grouped per tweet.
# Joining the whole corpus into one string (and re-tokenizing it for every statistic)
# repeated the same work six times and let n-grams run across tweet boundaries.
disaster_tokens_per_tweet = [word_tokenize(tweet) for tweet in disaster_tweets['lemmatized_text']]
normal_tokens_per_tweet = [word_tokenize(tweet) for tweet in normal_tweets['lemmatized_text']]

# Count occurrences of each word across all tweets of a class
disaster_word_counts = Counter(
    token for tokens in disaster_tokens_per_tweet for token in tokens
)
normal_word_counts = Counter(
    token for tokens in normal_tokens_per_tweet for token in tokens
)

# Top 20 words by occurrence
top_disaster_words = disaster_word_counts.most_common(20)
top_normal_words = normal_word_counts.most_common(20)

# Find bigrams and trigrams. They are generated tweet by tweet so that the last
# token of one tweet is never paired with the first token of the next one.
disaster_bigrams = [gram for tokens in disaster_tokens_per_tweet for gram in ngrams(tokens, 2)]
disaster_trigrams = [gram for tokens in disaster_tokens_per_tweet for gram in ngrams(tokens, 3)]
normal_bigrams = [gram for tokens in normal_tokens_per_tweet for gram in ngrams(tokens, 2)]
normal_trigrams = [gram for tokens in normal_tokens_per_tweet for gram in ngrams(tokens, 3)]

# Top 20 bigrams and trigrams
top_disaster_bigrams = Counter(disaster_bigrams).most_common(20)
top_disaster_trigrams = Counter(disaster_trigrams).most_common(20)
top_normal_bigrams = Counter(normal_bigrams).most_common(20)
top_normal_trigrams = Counter(normal_trigrams).most_common(20)


In [3]:
import nltk 
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import re
import nltk 
from nltk.corpus import stopwords

# Lazily filled cache: the stop-word corpus is read once instead of on every call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalize a tweet for bag-of-words modelling.

    Steps, in order:
      1. drop ``@mention`` tokens (``@`` followed by non-whitespace),
      2. delete every character that is neither a word character nor whitespace,
      3. lowercase the text,
      4. tokenize with ``word_tokenize`` and discard English stop words.

    Args:
        text (str): the tweet text to clean.

    Returns:
        str: the surviving tokens joined by single spaces ('' if none remain).
    """
    # Remove mentions
    text = re.sub(r'@\S+', '', text)
    # Remove punctuations and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stop words.
    # NOTE(review): punctuation is stripped before this filter, so any stop-word
    # entries that contain an apostrophe can no longer match — confirm intended.
    stop_words = _english_stop_words()
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean the lemmatized text of both corpora and store it in a new 'preprocessed_text' column.
for tweets_frame in (disaster_tweets, normal_tweets):
    tweets_frame['preprocessed_text'] = tweets_frame['lemmatized_text'].apply(preprocess_text)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Combine preprocessed tweets (disaster tweets first, then normal tweets)
combined_tweets = pd.concat([disaster_tweets['preprocessed_text'], normal_tweets['preprocessed_text']], ignore_index=True)

# Create labels in the same order as combined_tweets: 1 = disaster, 0 = normal
labels = [1] * len(disaster_tweets) + [0] * len(normal_tweets)

# Split the text BEFORE vectorizing so the held-out tweets cannot influence
# the vocabulary the model is trained on (avoids train/test leakage).
text_train, text_test, y_train, y_test = train_test_split(
    combined_tweets, labels, test_size=0.25, random_state=42
)

# Vectorize text: learn the vocabulary from the training tweets only,
# then project the test tweets onto that same vocabulary.
vectorizer = CountVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix in the training vocabulary, kept under its original name
# for any later cell that still refers to X.
X = vectorizer.transform(combined_tweets)

# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predictions
y_pred = logreg.predict(X_test)

# Evaluate model on the held-out tweets
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall Score:", recall)

print(y_train)


Accuracy: 0.8
F1 Score: 0.8
Recall Score: 0.6666666666666666
[1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1]


In [9]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix
# Pin the label order so the result is always a 2x2 matrix ordered
# [normal (0), disaster (1)], even if one class happens to be missing
# from y_test / y_pred in a small test split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)

# Heatmap rendering of the matrix is currently disabled; uncomment to plot.
# # Plot confusion matrix
# plt.figure(figsize=(8, 6))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Normal', 'Disaster'], yticklabels=['Normal', 'Disaster'])
# plt.xlabel('Predicted')
# plt.ylabel('True')
# plt.title('Confusion Matrix')
# plt.show()


[[2 0]
 [1 2]]
