In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter

# Introduction

In today's digital age, Twitter plays a crucial role in communication during emergencies. The challenge at hand is to develop machine learning models capable of determining whether a tweet is about a real disaster or not. While it may seem straightforward, it's a task that requires navigating the nuances of language and context.

##  Data Exploration

My analysis begins with an exploration of the dataset. I visualized class distributions, examined text lengths, and delved into the characteristics of the data. To evaluate model performance, I used confusion matrices and heatmaps, which provide clear insights into how well my models are doing.

In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer models
# used by nltk.word_tokenize and the English stop-word corpus.
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are requested to keep word_tokenize working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Read the labelled training tweets from the Kaggle competition input folder.
train_csv_path = '/kaggle/input/nlp-getting-started/train.csv'
train_data = pd.read_csv(train_csv_path)


In [None]:
# `stopwords` is already imported in the first cell and the corpus is downloaded
# in the setup cell, so neither is repeated here.

# Tokenise every tweet with NLTK's word tokenizer (original casing is kept).
train_data['tokens'] = train_data['text'].apply(nltk.word_tokenize)

# NLTK's English stop-word list is lower-case, so each token is lower-cased for
# the membership test only; otherwise capitalised stop words ("The", "In", ...)
# would slip through the filter. A set keeps each lookup O(1).
stop_words = set(stopwords.words('english'))
train_data['filtered_tokens'] = train_data['tokens'].apply(
    lambda words: [word for word in words if word.lower() not in stop_words]
)

# Flat list of the purely alphabetic tokens that survived stop-word removal.
all_filtered_words = [
    word
    for tokens in train_data['filtered_tokens']
    for word in tokens
    if word.isalpha()
]



In [None]:
# Re-tokenise the tweets after lower-casing them; this replaces the 'tokens' column.
lowercased_text = train_data['text'].str.lower()
train_data['tokens'] = lowercased_text.apply(nltk.word_tokenize)


In [None]:
# Number of tokens in each tweet, then the distribution as a 50-bin histogram.
train_data['word_count'] = train_data['tokens'].map(len)

fig, ax = plt.subplots()
ax.hist(train_data['word_count'], bins=50)
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Build the stop-word set once, outside the comprehension.
# stopwords.words('english') returns a list, so calling it per token means a
# fresh list plus a linear scan for every word in the corpus; a set built once
# makes each membership test O(1).
english_stop_words = set(stopwords.words('english'))

# Keep alphabetic, non-stop-word tokens from every tweet.
all_words = [
    word
    for tokens in train_data['tokens']
    for word in tokens
    if word.isalpha() and word not in english_stop_words
]
word_counter = Counter(all_words)

# Ten most frequent words as (word, count) pairs, most common first.
most_common_words = word_counter.most_common(10)

words = [word for word, _ in most_common_words]
frequencies = [count for _, count in most_common_words]
# Marker area is proportional to the word's frequency.
sizes = [f * 10 for f in frequencies]

# Create a bubble chart
plt.figure(figsize=(10, 6))
plt.scatter(words, frequencies, s=sizes, alpha=0.5)
plt.xlabel('Words')
plt.ylabel('Frequencies')
plt.title('Bubble Chart of Word Frequencies')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Labels: the 'target' column of the training frame.
y = train_data['target']

# Features: TF-IDF weights of the raw tweet text with sklearn's built-in
# English stop-word list removed. The fitted vectorizer is kept so the same
# vocabulary can be applied to other text later.
tweet_text = train_data['text']
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(tweet_text)

In [None]:
# Hold out 20% of the rows for validation.
# random_state pins the shuffle so the split (and every score computed from it)
# is repeatable across kernel restarts; stratify=y keeps the class ratio the
# same in the training and validation partitions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Baseline random forest with hand-picked settings.
# random_state fixes the bootstrap sampling and feature sub-sampling so that
# re-running the notebook reproduces the same fitted model and metrics.
clf = RandomForestClassifier(
    n_estimators=125,
    criterion="gini",
    min_samples_split=2,
    random_state=42,
)
clf.fit(X_train, y_train)


# Hyperparameter Tuning

In [None]:

# Candidate hyperparameters; GridSearchCV tries every combination
# (3 * 4 * 3 * 3 * 2 = 216 settings, each cross-validated on 3 folds).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Seed the estimator so that differences between grid points come from the
# hyperparameters rather than from run-to-run randomness, and so that the
# selected best_params_ are reproducible.
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# Estimator refit on the full training split with the best parameter set.
best_rf = grid_search.best_estimator_

# Kept under its own name: the following baseline cell assigns `val_predictions`
# from `clf`, which would otherwise silently replace the tuned model's output.
best_val_predictions = best_rf.predict(X_val)

print("Accuracy (best estimator):", accuracy_score(y_val, best_val_predictions))
print("Classification Report (best estimator):\n", classification_report(y_val, best_val_predictions))


In [None]:
# Predict the validation rows with the baseline forest `clf`.
val_predictions = clf.predict(X=X_val)

In [None]:
# Sanity check: number of validation predictions produced.
n_val_predictions = len(val_predictions)
print(n_val_predictions)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Compute the validation metrics first, then report them.
val_accuracy = accuracy_score(y_val, val_predictions)
val_report = classification_report(y_val, val_predictions)

print("Accuracy:", val_accuracy)
print("Classification Report:\n", val_report)

# Conclusion

The RandomForestClassifier achieved an accuracy of approximately 78.3% on the validation set. The precision, recall, and f1-score in the classification report indicate reasonably good performance, particularly in classifying the majority class (label 0), with a high recall of 0.93 and an f1-score of 0.83. However, the model tended to underperform when predicting the minority class (label 1), with a lower recall of 0.60. This suggests that while the model is quite adept at identifying the majority class, it struggles somewhat with the minority class, which could be due to class imbalance or other factors not captured by the features.

The macro-average f1-score of 0.77, while not indicative of a highly imbalanced model, does signal room for improvement, possibly through techniques such as resampling, more advanced feature extraction, or model tuning.

In [None]:
# Read the unlabelled competition tweets that need predictions.
test_csv_path = '/kaggle/input/nlp-getting-started/test.csv'
test_data = pd.read_csv(test_csv_path)

In [None]:
# Project the test tweets onto the vocabulary already learned by `vectorizer`
# (transform only, no refit).
test_text = test_data['text']
X_test = vectorizer.transform(test_text)


In [None]:
# Label the test tweets with the baseline forest `clf`.
# NOTE(review): the grid-searched `best_rf` is not used here — confirm that
# submitting the untuned model is intentional.
test_predictions = clf.predict(X=X_test)

In [None]:
# Sanity check: number of test predictions produced.
n_test_predictions = len(test_predictions)
print(n_test_predictions)

In [None]:
# Two-column submission frame: the tweet id and its predicted label.
submission_columns = {
    'id': test_data['id'],
    'target': test_predictions,
}
submission = pd.DataFrame(submission_columns)


In [None]:
# Write the submission file to the working directory, without the index column.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)
