In [1]:
import os
import re
from pathlib import Path

import matplotlib as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [3]:
# Directory holding the Kaggle "nlp-getting-started" CSV files.
# Set the NLP_GETTING_STARTED_DIR environment variable to run the notebook on
# another machine; the default keeps the location the notebook was written with.
DATA_DIR = Path(
    os.environ.get(
        "NLP_GETTING_STARTED_DIR",
        r"C:\Users\Nourhan Yehia\Desktop\Jupyter\nlp project\nlp-getting-started",
    )
)

train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df  = pd.read_csv(DATA_DIR / "test.csv")
sub_df   = pd.read_csv(DATA_DIR / "sample_submission.csv")

# Show the first rows of each frame as the cell output.
train_df.head(), test_df.head(), sub_df.head()



(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan,


In [4]:
# DataFrame.info() prints its report directly, so all three show up in the output.
train_df.info()
test_df.info()
sub_df.info()

# Only the last expression of a cell is rendered automatically; bare expressions
# in the middle of a cell are evaluated and thrown away. Print them explicitly
# so the missing-value and class-balance summaries are actually visible.
print("Missing values per column (train):", train_df.isnull().sum(), sep="\n")
print("Missing values per column (test):", test_df.isnull().sum(), sep="\n")
print("Target counts:", train_df["target"].value_counts(), sep="\n")
print("Target proportions:", train_df["target"].value_counts(normalize=True), sep="\n")

# Character-length statistics of the raw tweet text (rendered as the cell result).
train_df["text"].str.len().describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3

count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: text, dtype: float64

In [5]:
# String-valued columns that are cleaned and vectorized later in the notebook.
text_cols = ["keyword", "location", "text"]

# Replace missing entries with the literal token "missing" in both frames so
# every cell in these columns holds a string.
for frame in (train_df, test_df):
    frame[text_cols] = frame[text_cols].fillna("missing")

# Confirm that no nulls remain in the text columns of either frame.
train_df[text_cols].isna().sum(), test_df[text_cols].isna().sum()


(keyword     0
 location    0
 text        0
 dtype: int64,
 keyword     0
 location    0
 text        0
 dtype: int64)

In [7]:
# Make sure the NLTK resources used by this notebook (stopword list, WordNet for
# the lemmatizer, Punkt models for word_tokenize) are installed. Without this a
# fresh environment fails with LookupError; resources that are already present
# are not downloaded again.
for resource_path, package_id in (
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),  # looked up by newer NLTK releases
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_id, quiet=True)

# English stopwords as a set for O(1) membership tests during token filtering.
stop_words = set(stopwords.words("english"))
# Shared lemmatizer instance reused by the cleaning function.
lemm = WordNetLemmatizer()

In [8]:
def clean_and_lemmatize(text):
    """Normalize a raw value into a string of space-joined, lemmatized tokens.

    The input is converted with ``str()`` and lowercased; URLs, HTML-style tags
    and every character other than ``a-z`` or whitespace are replaced by a
    space; whitespace runs are collapsed. The result is tokenized, stopwords
    and tokens of two characters or fewer are dropped, and the remaining
    tokens are lemmatized.

    Relies on the notebook-level ``stop_words`` set and ``lemm`` lemmatizer.

    Args:
        text: Value to clean (anything ``str()`` accepts).

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).
    """
    normalized = str(text).lower()

    # Order matters: URLs and tags are stripped while their punctuation is
    # still present, before the catch-all character filter runs.
    for pattern in (
        r"http\S+|www\.\S+",  # URLs
        r"<.*?>",             # HTML tags
        r"[^a-z\s]",          # special characters / digits
    ):
        normalized = re.sub(pattern, " ", normalized)
    normalized = re.sub(r"\s+", " ", normalized).strip()

    lemmas = [
        lemm.lemmatize(token)
        for token in word_tokenize(normalized)
        if token not in stop_words and len(token) > 2
    ]
    return " ".join(lemmas)


In [9]:
# Run the cleaner over every text column of both frames. The cleaned strings
# overwrite the original column values.
for frame in (train_df, test_df):
    for col in text_cols:
        frame[col] = frame[col].apply(clean_and_lemmatize)

# Preview the cleaned training columns.
train_df.loc[:, text_cols].head()


Unnamed: 0,keyword,location,text
0,missing,missing,deed reason earthquake may allah forgive
1,missing,missing,forest fire near ronge sask canada
2,missing,missing,resident asked shelter place notified officer ...
3,missing,missing,people receive wildfire evacuation order calif...
4,missing,missing,got sent photo ruby alaska smoke wildfire pour...


In [10]:
# Character count of each tweet. "text" already holds the cleaned strings at
# this point, so the length is that of the cleaned text, not the raw tweet.
for frame in (train_df, test_df):
    frame["tweet_len"] = frame["text"].str.len()

# Preview the text next to its computed length.
train_df.loc[:, ["text", "tweet_len"]].head()


Unnamed: 0,text,tweet_len
0,deed reason earthquake may allah forgive,40
1,forest fire near ronge sask canada,34
2,resident asked shelter place notified officer ...,85
3,people receive wildfire evacuation order calif...,51
4,got sent photo ruby alaska smoke wildfire pour...,54


In [12]:
from sklearn.model_selection import train_test_split

# Model inputs (three cleaned text columns plus the length feature) and label.
feature_cols = ["keyword", "location", "text", "tweet_len"]
X = train_df[feature_cols]
y = train_df["target"]

# Hold out 20% for validation; stratify on the label so both parts keep the
# same class balance, and fix the seed so the split is repeatable.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = splits

X_train.shape, X_valid.shape


((6090, 4), (1523, 4))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# One unigram+bigram TF-IDF vectorizer per text column. Each entry selects its
# column by name (a plain string), so the vectorizer receives 1-D text input.
tfidf_steps = [
    (f"{col}_tfidf", TfidfVectorizer(ngram_range=(1, 2)), col)
    for col in ("keyword", "location", "text")
]

# The length feature is forwarded unchanged as a single numeric column.
# NOTE(review): it is not scaled, so its raw values sit next to TF-IDF weights
# in the feature matrix; consider scaling it before the linear models and
# re-checking the reported metrics.
length_step = ("tweet_len", "passthrough", ["tweet_len"])

# Combined feature extractor; columns not listed above are dropped.
text_features = ColumnTransformer(
    transformers=[*tfidf_steps, length_step],
    remainder="drop",
)


In [17]:
from sklearn.base import clone
from sklearn.metrics import classification_report, confusion_matrix

# Logistic-regression pipeline: feature extraction followed by the classifier.
# The feature extractor is cloned so this pipeline owns its own fitted copy.
# Passing `text_features` itself would share one object between every
# pipeline, and fitting any other pipeline later would refit (and so silently
# change) the vectorizer this model was trained with.
lr_clf = Pipeline(
    steps=[
        ("vectorizer", clone(text_features)),
        ("model", LogisticRegression(max_iter=2000)),
    ]
)

# Fit on the training split and score on the held-out validation split.
lr_clf.fit(X_train, y_train)
y_pred_lr = lr_clf.predict(X_valid)

print(classification_report(y_valid, y_pred_lr))
print(confusion_matrix(y_valid, y_pred_lr))


              precision    recall  f1-score   support

           0       0.81      0.85      0.83       869
           1       0.79      0.73      0.76       654

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

[[740 129]
 [179 475]]


In [40]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning for the cells that follow.
# NOTE(review): this hides the signal that a solver stopped before converging;
# prefer fixing the cause (e.g. feature scaling) over muting the warning.
warnings.simplefilter(action="ignore", category=ConvergenceWarning)


In [41]:
from sklearn.base import clone
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Linear SVM pipeline: feature extraction followed by the classifier.
# - The feature extractor is cloned so this pipeline owns its own fitted copy;
#   sharing the single `text_features` object would let a later fit of this
#   pipeline (e.g. on the full training set) silently change the vectorizer
#   inside the other pipelines.
# - random_state is fixed because LinearSVC's dual solver uses a random number
#   generator while fitting; without a seed, repeated runs can give slightly
#   different results.
svm_clf = Pipeline(
    steps=[
        ("vectorizer", clone(text_features)),
        (
            "model",
            LinearSVC(dual="auto", max_iter=20000, tol=1e-3, random_state=42),
        ),
    ]
)

# Fit on the training split and score on the held-out validation split.
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_valid)

print(classification_report(y_valid, y_pred_svm))
print(confusion_matrix(y_valid, y_pred_svm))


              precision    recall  f1-score   support

           0       0.82      0.82      0.82       869
           1       0.76      0.76      0.76       654

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523

[[715 154]
 [160 494]]


In [42]:
from sklearn.base import clone
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Multinomial Naive Bayes pipeline: feature extraction followed by the classifier.
# The feature extractor is cloned so this pipeline owns its own fitted copy.
# Passing `text_features` itself would share one object between every
# pipeline, and fitting any other pipeline later would refit (and so silently
# change) the vectorizer this model was trained with.
nb_clf = Pipeline(
    steps=[
        ("vectorizer", clone(text_features)),
        ("model", MultinomialNB()),
    ]
)

# Fit on the training split and score on the held-out validation split.
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_valid)

print(classification_report(y_valid, y_pred_nb))
print(confusion_matrix(y_valid, y_pred_nb))


              precision    recall  f1-score   support

           0       0.74      0.95      0.83       869
           1       0.89      0.55      0.68       654

    accuracy                           0.78      1523
   macro avg       0.81      0.75      0.75      1523
weighted avg       0.80      0.78      0.76      1523

[[825  44]
 [296 358]]


In [43]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified cross-validation of the SVM pipeline over the whole
# labelled set, scored with F1; shuffling is seeded so the folds are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(svm_clf, X, y, scoring="f1", cv=cv)

# Report the per-fold scores followed by their mean and standard deviation.
for label, value in (
    ("Fold scores:", scores),
    ("Mean F1:", scores.mean()),
    ("Std:", scores.std()),
):
    print(label, value)


Fold scores: [0.75862069 0.7631786  0.73545384 0.74396135 0.74528302]
Mean F1: 0.7492995001788447
Std: 0.010157206443835578


In [44]:
# Refit the SVM pipeline on every labelled row, then predict the test set.
X_full = train_df[["keyword", "location", "text", "tweet_len"]]
y_full = train_df["target"]
X_test = test_df[["keyword", "location", "text", "tweet_len"]]

svm_clf.fit(X_full, y_full)
test_pred = svm_clf.predict(X_test)

# Predictions are written into the sample submission by position, so its ids
# must line up row-for-row with the test set; fail loudly if they do not
# rather than producing a mislabelled submission.
assert np.array_equal(
    sub_df["id"].to_numpy(), test_df["id"].to_numpy()
), "sample_submission ids do not match test ids row-for-row"

sub_df["target"] = test_pred
sub_df.to_csv("submission.csv", index=False)

# Show a preview, the predicted class balance, and the row counts.
sub_df.head(), sub_df["target"].value_counts(), (len(test_pred), len(sub_df))


(   id  target
 0   0       1
 1   2       1
 2   3       1
 3   9       1
 4  11       1,
 target
 0    1978
 1    1285
 Name: count, dtype: int64,
 (3263, 3263))

# NLP Disaster Tweets Classification

## Project overview
This project uses NLP and machine learning to classify tweets as disaster-related (1) or not disaster-related (0) using the Kaggle “NLP Getting Started” dataset.

## Dataset
The dataset includes `train.csv`, `test.csv`, and `sample_submission.csv`.  
Training data contains: `id`, `keyword`, `location`, `text`, `target`. Test data contains: `id`, `keyword`, `location`, `text`.

## Data preprocessing
- Filled missing values in `keyword` and `location` with the placeholder string `"missing"` (the same fill is applied to `text`, which has no missing values).
- Cleaned text by lowercasing, removing URLs/HTML tags/special characters, tokenizing, removing stopwords, and lemmatizing.

## Feature extraction
- Used TF-IDF with unigrams and bigrams (`ngram_range=(1,2)`) for `keyword`, `location`, and `text`.
- Added an extra numeric feature: tweet length (`tweet_len`), measured in characters on the cleaned text.

## Model training
- Split the data into train/validation with stratification.
- Trained and compared multiple models: Logistic Regression, Multinomial Naive Bayes, and Linear SVM (LinearSVC).
- Used scikit-learn Pipelines to combine feature extraction and modeling.

## Evaluation results
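- Linear SVM (`LinearSVC` with `max_iter=20000`, `tol=1e-3`; no hyperparameter search was run) validation accuracy: 0.79 (for comparison, Logistic Regression: 0.80, Multinomial Naive Bayes: 0.78)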
- Class 0 F1-score: 0.82, Class 1 F1-score: 0.76
- Confusion matrix: [[715, 154], [160, 494]]
- 5-fold stratified CV of the Linear SVM pipeline on the full training set (F1 of the disaster class): Mean F1: 0.7493, Std: 0.0102

## Final model and submission
The final model (LinearSVC) was trained on the full training set and used to predict the test set.  
A `submission.csv` file was generated in the required Kaggle format (`id`, `target`), with 3263 predictions.
