### This notebook uses Natural Language Processing (NLP) techniques to classify tweets as either real disasters or not.

# 1. Setup and Data Loading
This section imports necessary libraries and loads the dataset.

In [105]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


The output shows the data files available in the input directory.

- /kaggle/input/nlp-getting-started/sample_submission.csv

- /kaggle/input/nlp-getting-started/train.csv

- /kaggle/input/nlp-getting-started/test.csv

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix,accuracy_score
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

The datasets are loaded into pandas DataFrames.

In [141]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
sample = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# 2. Exploratory Data Analysis (EDA)
This part of the notebook explores the training data to understand its structure and characteristics.

Display the first 5 rows of the training data.

In [142]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1



**The output shows columns: id, keyword, location, text, and target. The target column indicates whether the tweet is about a real disaster (1) or not (0).**


In [143]:
# Check the shape of the training data
train.shape

(7613, 5)

In [144]:
# Check the columns of the training data.
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [145]:
# Get information about the DataFrame, including data types and non-null values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [146]:
# Count the number of tweets for each target class.
train['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [147]:
# Check for missing values in each column.
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [148]:
train['text_length'] = train['text'].apply(len)
train['text_length'].describe()

count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: text_length, dtype: float64

In [149]:
sample_text = train['text'][0]
sample_text

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

# 3. Text Preprocessing
A function clean_text is defined to preprocess the tweet text. This involves converting the text to lowercase, removing numbers and punctuation, and removing English stopwords.

In [150]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [151]:
def clean_text(text):
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove numbers and punctuation
    text = re.sub(r'\d+', '', text)  
    text = text.translate(str.maketrans('', '', string.punctuation))  # noktalama

    # 3. Tokenize
    tokens = text.split()

    # 4.  Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Join tokens back into a string
    cleaned_text = ' '.join(tokens)
    return cleaned_text

In [152]:
train['clean_text'] = train['text'].apply(clean_text)

# 4. Model Training and Evaluation
The preprocessed data is split into training and testing sets, and a Logistic Regression model is trained and evaluated.

In [153]:
# Split data
X = train['clean_text']
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [154]:
# Use TfidfVectorizer to convert the text data into a matrix of TF-IDF features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    max_df=0.95,
    min_df=5,
    sublinear_tf=True

)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [155]:
# Train a Logistic Regression model. class_weight='balanced' is used to handle the slight class imbalance.
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_vec, y_train)

In [156]:
y_pred = model.predict(X_test_vec)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[718 156]
 [158 491]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       874
           1       0.76      0.76      0.76       649

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



In [139]:
y_prob = model.predict_proba(X_test_vec)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

print("Accuracy       : %", round(accuracy * 100, 2))
print("ROC AUC Score  :", round(roc_auc, 4))

Accuracy       : % 79.38
ROC AUC Score  : 0.8551


# 5. Making Predictions on the Test Set
The same preprocessing steps and trained model are used to make predictions on the provided test.csv file.

In [158]:
test['clean_text'] = test['text'].apply(clean_text)

In [159]:
test_final = vectorizer.transform(test['clean_text'])

In [161]:
y_pred_test = model.predict(test_final)


In [164]:

sample["target"] = y_pred_test
sample.to_csv("sample_submission.csv", index=False)


In [165]:
import joblib

In [166]:
joblib.dump(model, "model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']