In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.graph_objects as go
import emoji
import plotly.express as px
import re
import string
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


This project solves a binary classification problem using the [Kaggle Disaster Tweets dataset](https://www.kaggle.com/competitions/nlp-getting-started). The objective is to classify whether a tweet is about a real disaster or not.

We use text preprocessing, TF-IDF vectorization, and multiple machine learning models to build and evaluate predictive systems, choosing the best model based on F1 score.


In [2]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Exploratory Data Analysis (EDA)

To understand the data better, I made visualizations and analysis:

Class distribution: Checked the balance between disaster and non-disaster tweets

Keyword frequency: Plotted top keywords in each class

Location and emoji use: Analyzed the impact of missing values and emoji counts

Text properties: Looked at tweet length, word counts, and duplicates

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [5]:


object_columns = train.select_dtypes(include=['object']).columns

df_missing = train[object_columns].isnull().sum().reset_index()
df_missing.columns = ['column', 'missing_count']
df_missing['missing_percent'] = df_missing['missing_count'] / len(train) * 100

fig = go.Figure(data=[
    go.Bar(name='Missing Count', x=df_missing['column'], y=df_missing['missing_count']),
    go.Bar(name='Missing Percent', x=df_missing['column'], y=df_missing['missing_percent'])
])

fig.update_layout(barmode='group', title='Missing Values in Object Columns of Train Dataset', xaxis_title='Columns', yaxis_title='Count / Percent')
fig.show()

In [6]:
missing_disaster = train[train['target'] == 1]['location'].isnull().sum()
missing_non_disaster = train[train['target'] == 0]['location'].isnull().sum()

fig = go.Figure(data=[
    go.Bar(name='Missing in Disaster Tweets', x=['Location'], y=[missing_disaster]),
    go.Bar(name='Missing in Non-Disaster Tweets', x=['Location'], y=[missing_non_disaster])
])

fig.update_layout(barmode='group', title='Missing Locations in Disaster vs. Non-Disaster Tweets',
                  yaxis_title='Count of Missing Values', xaxis_title='Category')

fig.show()

In [7]:
x = train['target'].value_counts()

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=x.index,
    values=x,
    marker=dict(colors=['red', 'green']),
    hole=0.3 
))

# Customize layout
fig.update_layout(
    title='Distribution of Target Variable',
)

# Show the plot
fig.show()

In [8]:
disaster_train_len = train[train['target'] == 1]['text'].str.split().map(lambda x: len(x))
non_disaster_train_len = train[train['target'] == 0]['text'].str.split().map(lambda x: len(x))

fig = go.Figure()

fig.add_trace(go.Histogram(
    x=disaster_train_len,
    name='Disaster Tweets',
    marker_color='red'
))

fig.add_trace(go.Histogram(
    x=non_disaster_train_len,
    name='Non-Disaster Tweets',
    marker_color='green',
    opacity=0.8
))

fig.update_layout(
    title='Words in a Tweet',
    xaxis_title='Number of Words',
    yaxis_title='Frequency',
    barmode='overlay',
    bargap=0.1
)

fig.show()

print(f"The maximum length of tweets from disaster tweets: {max(disaster_train_len)}")
print(f"The maximum length of tweets from non-disaster tweets: {max(non_disaster_train_len)}")

The maximum length of tweets from disaster tweets: 30
The maximum length of tweets from non-disaster tweets: 31


In [9]:
def count_emojis(text):
    return sum(1 for char in text if char in emoji.EMOJI_DATA)

train['emoji_count'] = train['text'].apply(count_emojis)

disaster_emoji_sum = train[train['target'] == 1]['emoji_count'].sum()
non_disaster_emoji_sum = train[train['target'] == 0]['emoji_count'].sum()

fig = go.Figure(data=[
    go.Bar(name='Disaster Tweets', x=['Disaster'], y=[disaster_emoji_sum], marker_color='red'),
    go.Bar(name='Non-Disaster Tweets', x=['Non-Disaster'], y=[non_disaster_emoji_sum], marker_color='green')
])

fig.update_layout(
    title='Emojis in Tweets',
    xaxis_title='Tweet Category',
    yaxis_title='Total Emoji Count',
    barmode='group'
)

fig.show()

train = train.drop('emoji_count', axis=1)


In [10]:
top20_keyword = train['keyword'].value_counts().head(20)
top20_keyword_disaster = train[train['target'] == 1]['keyword'].value_counts().head(20)
top20_keyword_nodisaster = train[train['target'] == 0]['keyword'].value_counts().head(20)

color_palette = px.colors.qualitative.Plotly

fig = px.bar(y=top20_keyword.index, x=top20_keyword.values, orientation='h',
             title='Top 20 Keywords Overall', labels={'x': 'Count', 'y': 'Keyword'},
             category_orders={'y': list(top20_keyword.index)},
             color_discrete_sequence=color_palette)

fig2 = px.bar(y=top20_keyword_disaster.index, x=top20_keyword_disaster.values, orientation='h',
              title='Top 20 Keywords in Disaster Tweets', labels={'x': 'Count', 'y': 'Keyword'},
              category_orders={'y': list(top20_keyword_disaster.index)},
              color_discrete_sequence=color_palette)

fig3 = px.bar(y=top20_keyword_nodisaster.index, x=top20_keyword_nodisaster.values, orientation='h',
              title='Top 20 Keywords in Non-Disaster Tweets', labels={'x': 'Count', 'y': 'Keyword'},
              category_orders={'y': list(top20_keyword_nodisaster.index)},
              color_discrete_sequence=color_palette)

fig.update_layout(height=800)
fig2.update_layout(height=800)
fig3.update_layout(height=800)

# Create subplots with shared x-axis
fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGray')))
fig2.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGray')))
fig3.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGray')))

fig.show()
fig2.show()
fig3.show()

In [11]:
top10_location = train['location'].value_counts().head(10)
top20_location_disaster = train[train['target'] == 1]['location'].value_counts().head(20)
top20_location_nodisaster = train[train['target'] == 0]['location'].value_counts().head(20)

fig = px.bar(y=top10_location.index, x=top10_location.values, orientation='h', 
             title='Top 10 Locations Overall', labels={'x': 'Count', 'y': 'Location'},
             category_orders={'y': list(top10_location.index)})

fig.update_layout(height=600)

fig2 = px.bar(y=top20_location_disaster.index, x=top20_location_disaster.values, orientation='h', 
              title='Top 20 Locations in Disaster Tweets', labels={'x': 'Count', 'y': 'Location'},
              category_orders={'y': list(top20_location_disaster.index)})

fig2.update_layout(height=600)

fig3 = px.bar(y=top20_location_nodisaster.index, x=top20_location_nodisaster.values, orientation='h', 
               title='Top 20 Locations in Non-Disaster Tweets', labels={'x': 'Count', 'y': 'Location'},
               category_orders={'y': list(top20_location_nodisaster.index)})

fig3.update_layout(height=600)

fig.show()
fig2.show()
fig3.show()


In [12]:
duplicated_data = train[train.duplicated(subset=['text'])]
duplicated_data

Unnamed: 0,id,keyword,location,text,target
48,68,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...,0
115,165,aftershock,US,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/vA...,0
119,172,aftershock,Switzerland,320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/TH...,0
164,238,airplane%20accident,,Experts in France begin examining airplane deb...,1
624,898,bioterrorism,,To fight bioterrorism sir.,0
...,...,...,...,...,...
7600,10855,,,Evacuation order lifted for town of Roosevelt:...,1
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1


In [13]:
print(f"raw data: {len(train)}")

train.drop_duplicates(subset=['text'], inplace=True)
print(f"after removing duplicated data: {len(train)}")

raw data: 7613
after removing duplicated data: 7503


In [14]:
train['keyword'] = train['keyword'].fillna(train['keyword'].mode()[0])
train['location'] = train['location'].fillna('Missing')

In [15]:
train.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

### Preprocessing

Tweets were cleaned by:
- Lowercasing
- Removing URLs, hashtags, and mentions
- Removing punctuation and numbers
- Stripping extra spaces


In [16]:
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r'\@[\w]*', '', text)  # remove mentions
    text = re.sub(r'\#[\w]*', '', text)  # remove hashtags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

train['clean_text'] = train['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)


I converted text into numerical features using TF-IDF Vectorization. This technique gives more importance to rare but meaningful words and downweights common ones like "the" or "and".

In [17]:
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train['clean_text'])

In [18]:
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train['clean_text'])

### Model Training and Evaluation

We tested 5 different classifiers:

Logistic Regression

Naive Bayes (Multinomial)

Linear Support Vector Machine (SVM)

Random Forest

XGBoost

Each model was evaluated using 5-fold cross-validation and F1-score as the primary metric, which balances precision and recall.

In [21]:
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "SVM (LinearSVC)": LinearSVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}


from sklearn.model_selection import cross_val_score

mean_scores = {}

for name, clf in classifiers.items():
    try:
        if name == 'Naive Bayes':
            scores = cross_val_score(clf, train_vectors.toarray(), train["target"], cv=5, scoring="f1")
        else:
            scores = cross_val_score(clf, train_vectors, train["target"], cv=5, scoring="f1")
        
        mean_scores[name] = scores.mean()
        print(f"Model: {name}")
        print(f"F1 Scores: {scores}")
        print(f"Mean F1 Score: {mean_scores[name]:.4f}")
        print("="*40)
    except Exception as e:
        print(f"Model: {name} failed with error: {e}")
        print("="*40)

best_model = max(mean_scores, key=mean_scores.get)
print(f"Best Model: {best_model} with Mean F1 Score: {mean_scores[best_model]:.4f}")


Model: Logistic Regression
F1 Scores: [0.61958569 0.52470187 0.60301508 0.58855098 0.70096463]
Mean F1 Score: 0.6074
Model: Naive Bayes
F1 Scores: [0.63951473 0.60200154 0.68562874 0.64633141 0.73448773]
Mean F1 Score: 0.6616
Model: SVM (LinearSVC)
F1 Scores: [0.62994836 0.50570962 0.58367017 0.58686617 0.64732824]
Mean F1 Score: 0.5907
Model: Random Forest
F1 Scores: [0.60245902 0.49655172 0.51477598 0.52385787 0.66426643]
Mean F1 Score: 0.5604
Model: XGBoost
F1 Scores: [0.55555556 0.4411215  0.49320036 0.49160908 0.60322855]
Mean F1 Score: 0.5169
Best Model: Naive Bayes with Mean F1 Score: 0.6616


In [22]:
results_df = pd.DataFrame(mean_scores.items(), columns=['Model', 'Mean F1 Score'])
results_df = results_df.sort_values(by='Mean F1 Score', ascending=False)
results_df.reset_index(drop=True, inplace=True)
results_df


Unnamed: 0,Model,Mean F1 Score
0,Naive Bayes,0.661593
1,Logistic Regression,0.607364
2,SVM (LinearSVC),0.590705
3,Random Forest,0.560382
4,XGBoost,0.516943


### Hyperparameter Tuning

We tuned the two top-performing models using GridSearchCV:

For Multinomial Naive Bayes, we tuned the alpha parameter

For Logistic Regression, we tuned the C (regularization) and solver

This helped us find the best configuration for each model.

In [23]:
param_grid_nb = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0]
}

grid_nb = GridSearchCV(
    MultinomialNB(),
    param_grid=param_grid_nb,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid_nb.fit(train_vectors.toarray(), train['target'])  # Must be dense for NB

print("Best alpha for MultinomialNB:", grid_nb.best_params_['alpha'])
print("Best F1 score (CV):", grid_nb.best_score_)


Best alpha for MultinomialNB: 0.5
Best F1 score (CV): 0.6629165718161246


In [24]:
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],  # L2 is the default for LogisticRegression
    'solver': ['liblinear', 'lbfgs']
}

grid_lr = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid=param_grid_lr,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid_lr.fit(train_vectors, train['target'])  # Sparse input is fine for LR

print("Best params for LogisticRegression:", grid_lr.best_params_)
print("Best F1 score (CV):", grid_lr.best_score_)


Best params for LogisticRegression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best F1 score (CV): 0.617862693186835


In [25]:
best_nb_model = grid_nb.best_estimator_
best_lr_model = grid_lr.best_estimator_

# Example: Fit on full train set (split first if using validation)
best_nb_model.fit(train_vectors.toarray(), train['target'])
best_lr_model.fit(train_vectors, train['target'])


In [26]:
X_train, X_val, y_train, y_val = train_test_split(train_vectors, train['target'], test_size=0.2, random_state=42)

In [27]:
X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()


In [28]:
# Naive Bayes 
best_nb_model.fit(X_train_dense, y_train)

# Logistic Regression
best_lr_model.fit(X_train, y_train)


In [30]:
# Naive Bayes
nb_preds = best_nb_model.predict(X_val_dense)

# Logistic Regression
lr_preds = best_lr_model.predict(X_val)


In [31]:
print("=== Naive Bayes Report ===")
print(confusion_matrix(y_val, nb_preds))
print(classification_report(y_val, nb_preds))

print("\n=== Logistic Regression Report ===")
print(confusion_matrix(y_val, lr_preds))
print(classification_report(y_val, lr_preds))


=== Naive Bayes Report ===
[[749 125]
 [194 433]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.82       874
           1       0.78      0.69      0.73       627

    accuracy                           0.79      1501
   macro avg       0.79      0.77      0.78      1501
weighted avg       0.79      0.79      0.79      1501


=== Logistic Regression Report ===
[[763 111]
 [227 400]]
              precision    recall  f1-score   support

           0       0.77      0.87      0.82       874
           1       0.78      0.64      0.70       627

    accuracy                           0.77      1501
   macro avg       0.78      0.76      0.76      1501
weighted avg       0.78      0.77      0.77      1501



In [35]:
final_model = best_nb_model  # or best_nb_model


I split the training data into train and validation sets (80/20 split), then trained the best-tuned models and evaluated them on validation data.

Naive Bayes had the best F1-score for disaster tweets (class 1) and was selected as the final model.

In [42]:
vectorizer = TfidfVectorizer()  
train_vectors = vectorizer.fit_transform(train['clean_text'])

test_vectors = vectorizer.transform(test['clean_text'])

# Dense for Naive Bayes only
test_preds = final_model.predict(test_vectors.toarray() if final_model == best_nb_model else test_vectors)

submission = pd.DataFrame({'id': test['id'], 'target': test_preds})
submission.to_csv('submission.csv', index=False)


After I submitted the CSV file to Kaggle I achieved a public leaderboard score of 0.79344, for the first model without hyperparameter tuning, and a final score of 0.78792 after including hyper parameter tuning. 