In [5]:
# Importing the Libraries
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import numpy as np
from nltk.stem import PorterStemmer

In [6]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [8]:
# Loading the Dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('train.csv')

In [9]:
# Checking observation and feature numbers for train and test data.
print(train_data.shape)
print(test_data.shape)

(7613, 5)
(7613, 5)


In [10]:
# Taking general look at the training datasets.
print(train_data.head(5))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [11]:
# Taking general look at the testing datasets.
print(test_data.head(5))

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [12]:
def preprocess_text(text):
    # Check for NaN and convert to empty string if necessary
    if isinstance(text, float) and np.isnan(text):
        text = ''
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user mentions
    text = re.sub(r'@\w+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [13]:
# Apply preprocessing
train_data['cleaned_text'] = train_data['text'].apply(preprocess_text)
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)

In [14]:
# Drop the original text column
train_data.drop(columns=['text'], inplace=True)
test_data.drop(columns=['text'], inplace=True)

In [15]:
# Handle missing values
train_data['cleaned_text'].fillna('', inplace=True)
test_data['cleaned_text'].fillna('', inplace=True)

In [16]:
# Save the cleaned data
train_data.to_csv('cleaned_disaster_tweets.csv', index=False)
test_data.to_csv('cleaned_test_tweets.csv', index=False)

In [17]:
# Load the preprocessed data
tweets_data = pd.read_csv('cleaned_disaster_tweets.csv')
test_data = pd.read_csv('cleaned_test_tweets.csv')

In [18]:
# Ensure there are no NaN values
tweets_data['cleaned_text'].fillna('', inplace=True)

In [19]:
# Split data into features and target
X = train_data['cleaned_text']
y = train_data['target']

In [20]:
# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_test = test_data['cleaned_text']

In [21]:
# Ensure there are no NaN values in the split data
X_train.fillna('', inplace=True)
X_val.fillna('', inplace=True)
X_test.fillna('', inplace=True)

In [22]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [23]:
# Word2Vec Vectorization
X_train_tokens = [text.split() for text in X_train]
X_val_tokens = [text.split() for text in X_val]
X_test_tokens = [text.split() for text in X_test]
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.train(X_train_tokens, total_examples=len(X_train_tokens), epochs=10)

def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0.
    for word in words:
        if word in vocabulary:
            n_words += 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

def averaged_word_vectorizer(texts, model, num_features):
    vocabulary = set(model.wv.key_to_index)
    features = [average_word_vectors(text, model, vocabulary, num_features) for text in texts]
    return np.array(features)

X_train_w2v = averaged_word_vectorizer(X_train_tokens, w2v_model, 100)
X_val_w2v = averaged_word_vectorizer(X_val_tokens, w2v_model, 100)
X_test_w2v = averaged_word_vectorizer(X_test_tokens, w2v_model, 100)



In [24]:
# Combine TF-IDF and Word2Vec Features
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_w2v))
X_val_combined = np.hstack((X_val_tfidf.toarray(), X_val_w2v))
X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_w2v))

In [25]:
# Function to build and evaluate models
def build_and_evaluate_model(ModelClass, X_train, X_val, y_train, y_val):
    model = ModelClass()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    return model, accuracy, report

In [26]:
# Models to evaluate
models = {
    'Random Forest': RandomForestClassifier,
    'SVM': SVC,
    'Logistic Regression': LogisticRegression
}

In [27]:
# Evaluate models on TF-IDF features
results_tfidf = {}
for name, ModelClass in models.items():
    print(f"Evaluating {name} with TF-IDF features...")
    model, accuracy, report = build_and_evaluate_model(ModelClass, X_train_tfidf, X_val_tfidf, y_train, y_val)
    results_tfidf[name] = (model, accuracy, report)
    print(f"{name} Accuracy (TF-IDF): {accuracy}\n")
    print(report)

Evaluating Random Forest with TF-IDF features...
Random Forest Accuracy (TF-IDF): 0.7596848325673013

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       874
           1       0.73      0.68      0.71       649

    accuracy                           0.76      1523
   macro avg       0.76      0.75      0.75      1523
weighted avg       0.76      0.76      0.76      1523

Evaluating SVM with TF-IDF features...
SVM Accuracy (TF-IDF): 0.7997373604727511

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523

Evaluating Logistic Regression with TF-IDF features...
Logistic Regression Accuracy (TF-IDF): 0.8017071569271176

              precision    recall  f1-score   su

In [28]:
# Evaluate models on Word2Vec features
results_w2v = {}
for name, ModelClass in models.items():
    print(f"Evaluating {name} with Word2Vec features...")
    model, accuracy, report = build_and_evaluate_model(ModelClass, X_train_w2v, X_val_w2v, y_train, y_val)
    results_w2v[name] = (model, accuracy, report)
    print(f"{name} Accuracy (Word2Vec): {accuracy}\n")
    print(report)

Evaluating Random Forest with Word2Vec features...
Random Forest Accuracy (Word2Vec): 0.737360472751149

              precision    recall  f1-score   support

           0       0.75      0.82      0.78       874
           1       0.72      0.62      0.67       649

    accuracy                           0.74      1523
   macro avg       0.73      0.72      0.73      1523
weighted avg       0.74      0.74      0.73      1523

Evaluating SVM with Word2Vec features...
SVM Accuracy (Word2Vec): 0.7071569271175312

              precision    recall  f1-score   support

           0       0.68      0.92      0.78       874
           1       0.79      0.42      0.55       649

    accuracy                           0.71      1523
   macro avg       0.74      0.67      0.67      1523
weighted avg       0.73      0.71      0.68      1523

Evaluating Logistic Regression with Word2Vec features...
Logistic Regression Accuracy (Word2Vec): 0.7176625082074852

              precision    recall  f1

In [29]:
# Evaluate models on Combined features
results_combined = {}
for name, ModelClass in models.items():
    print(f"Evaluating {name} with Combined TF-IDF and Word2Vec features...")
    model, accuracy, report = build_and_evaluate_model(ModelClass, X_train_combined, X_val_combined, y_train, y_val)
    results_combined[name] = (model, accuracy, report)
    print(f"{name} Accuracy (Combined): {accuracy}\n")
    print(report)

Evaluating Random Forest with Combined TF-IDF and Word2Vec features...
Random Forest Accuracy (Combined): 0.7413000656598818

              precision    recall  f1-score   support

           0       0.74      0.84      0.79       874
           1       0.74      0.61      0.67       649

    accuracy                           0.74      1523
   macro avg       0.74      0.72      0.73      1523
weighted avg       0.74      0.74      0.74      1523

Evaluating SVM with Combined TF-IDF and Word2Vec features...
SVM Accuracy (Combined): 0.7820091923834537

              precision    recall  f1-score   support

           0       0.76      0.91      0.83       874
           1       0.83      0.61      0.71       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.77      1523
weighted avg       0.79      0.78      0.78      1523

Evaluating Logistic Regression with Combined TF-IDF and Word2Vec features...
Logistic Regression Accuracy (Combine

In [31]:
# Hyperparameter Tuning for the best model
# Let's assume Logistic Regression with Combined features performed the best
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_combined, y_train)

print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_val_combined)
best_accuracy = accuracy_score(y_val, y_pred_best)
best_report = classification_report(y_val, y_pred_best)

print("Best Model Accuracy after Hyperparameter Tuning:", best_accuracy)
print("Best Model Classification Report after Hyperparameter Tuning:\n", best_report)

Best Parameters: {'C': 1, 'solver': 'liblinear'}
Best Model Accuracy after Hyperparameter Tuning: 0.7964543663821405
Best Model Classification Report after Hyperparameter Tuning:
               precision    recall  f1-score   support

           0       0.79      0.87      0.83       874
           1       0.80      0.70      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



In [41]:
# Save the best model and TF-IDF vectorizer
import joblib
joblib.dump(best_model, 'logistic_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(w2v_model, 'word2vec.joblib')

['word2vec.joblib']