In [1]:
import pandas as pd

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Display first few rows of each dataset
print("Train Data Sample:")
print(train_df.head())

print("\nTest Data Sample:")
print(test_df.head())

Train Data Sample:
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

Test Data Sample:
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in 

In [3]:
# Check class balance
print("Class Distribution:\n", train_df['target'].value_counts())

# Check for missing values
print("\nMissing Values in Train Data:\n", train_df.isnull().sum())
print("\nMissing Values in Test Data:\n", test_df.isnull().sum())

Class Distribution:
 target
0    4342
1    3271
Name: count, dtype: int64

Missing Values in Train Data:
 id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Missing Values in Test Data:
 id             0
keyword       26
location    1105
text           0
dtype: int64


In [5]:
# relation between location and target
# Fill missing values
train_df['location'] = train_df['location'].fillna('Unknown')

# Normalize case and remove extra spaces
train_df['location'] = train_df['location'].str.strip().str.lower()

# Replace common variations
train_df['location'] = train_df['location'].replace({
    'new york': 'new york, usa',
    'london, uk': 'london',
    'usa': 'united states',
    'canada': 'canada'
}, regex=False)

In [7]:
# Get top 20 most frequent locations
top_locations = train_df['location'].value_counts().nlargest(20).index

# Create binary feature: whether location is in top locations
train_df['is_top_location'] = train_df['location'].apply(lambda loc: loc if loc in top_locations else 'Other')

# One-hot encode this new feature
X_location = pd.get_dummies(train_df['is_top_location'], prefix='loc')

In [9]:
# Join encoded location with target
analysis_df = pd.concat([X_location, train_df['target']], axis=1)

# Group by location and calculate average disaster rate
location_disaster_rate = analysis_df.groupby('target').mean()
print(location_disaster_rate.T)

target                      0         1
loc_Other            0.586826  0.566188
loc_australia        0.002073  0.003057
loc_california       0.002764  0.002751
loc_california, usa  0.001612  0.002446
loc_canada           0.003915  0.005197
loc_chicago, il      0.002303  0.003057
loc_everywhere       0.003224  0.001834
loc_india            0.001382  0.006726
loc_kenya            0.003685  0.001529
loc_london           0.008752  0.006726
loc_los angeles      0.001842  0.002446
loc_los angeles, ca  0.004606  0.002446
loc_mumbai           0.001152  0.005809
loc_new york, ny     0.002303  0.002140
loc_new york, usa    0.014509  0.005809
loc_nigeria          0.001612  0.008560
loc_uk               0.002994  0.005197
loc_united states    0.014049  0.028737
loc_unknown          0.336251  0.328951
loc_washington, dc   0.001382  0.004586
loc_worldwide        0.002764  0.005809


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Get top N locations
top_locations = train_df['location'].value_counts().nlargest(20).index

# Replace others with 'Other'
train_df['location_top'] = train_df['location'].apply(lambda x: x if x in top_locations else 'Other')
test_df['location_top'] = test_df['location'].apply(lambda x: x if x in top_locations else 'Other')

# One-hot encode
encoder = OneHotEncoder(sparse_output=True)
X_location = encoder.fit_transform(train_df[['location_top']])
X_test_location = encoder.transform(test_df[['location_top']])

In [51]:
!pip install sentence-transformers xgboost scikit-learn pandas numpy nltk

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading sentence_transformers-4.1.0

In [77]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from scipy.sparse import hstack, csr_matrix
from sentence_transformers import SentenceTransformer
import joblib
import warnings 

# Ignore warnings
warnings.filterwarnings("ignore")

# Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")


# --- Helper Functions ---
def preprocess_text(text):
    """Cleans tweet text by removing URLs, mentions, hashtags, and stopwords."""
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = [word for word in text.split() if word not in stop_words]
    return ' '.join(words)


def get_top_categories(df_col, top_n=20):
    """Returns top N categories from a column for encoding"""
    return df_col.value_counts().nlargest(top_n).index


def encode_bert(model, texts):
    """Encodes list of texts using BERT model and returns sparse matrix"""
    return csr_matrix(model.encode(texts, show_progress_bar=True))


# --- Text Preprocessing ---
print("Preprocessing text...")
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

# --- Generate BERT Embeddings ---
print("Generating BERT embeddings...")
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

X_train_bert_sparse = encode_bert(bert_model, train_df['cleaned_text'].tolist())
X_test_bert_sparse = encode_bert(bert_model, test_df['cleaned_text'].tolist())

# --- Location Feature Engineering ---
print("Processing location features...")
train_df['location'] = train_df['location'].fillna('Unknown').str.strip().str.lower()
test_df['location'] = test_df['location'].fillna('Unknown').str.strip().str.lower()

top_locations = get_top_categories(train_df['location'], 20)
train_df['location_top'] = train_df['location'].apply(lambda x: x if x in top_locations else 'Other')
test_df['location_top'] = test_df['location'].apply(lambda x: x if x in top_locations else 'Other')

encoder = OneHotEncoder(sparse_output=True)
X_location_train = encoder.fit_transform(train_df[['location_top']])
X_location_test = encoder.transform(test_df[['location_top']])

# --- Keyword Feature Engineering ---
print("Processing keyword features...")
train_df['keyword'] = train_df['keyword'].fillna('missing_keyword').str.strip().str.lower()
test_df['keyword'] = test_df['keyword'].fillna('missing_keyword').str.strip().str.lower()

X_keyword_train = encode_bert(bert_model, train_df['keyword'].tolist())
X_keyword_test = encode_bert(bert_model, test_df['keyword'].tolist())

# --- New Features: Text Length ---
print("Adding text length feature...")
train_df['text_len'] = train_df['cleaned_text'].apply(len)
test_df['text_len'] = test_df['cleaned_text'].apply(len)

X_len_train = csr_matrix(train_df[['text_len']])
X_len_test = csr_matrix(test_df[['text_len']])

# --- Combine All Features ---
print("Combining all features...")
X_train_final = hstack([
    X_train_bert_sparse,
    X_location_train,
    X_keyword_train,
    X_len_train
])

X_test_final = hstack([
    X_test_bert_sparse,
    X_location_test,
    X_keyword_test,
    X_len_test
])

# Target
y_train = train_df['target']

# --- Train Models ---
print("Training models...")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1.3)
lr_model = LogisticRegression(class_weight='balanced')

xgb_model.fit(X_train_final, y_train)
lr_model.fit(X_train_final, y_train)

# Get probabilities
xgb_proba = xgb_model.predict_proba(X_train_final)[:, 1]
lr_proba = lr_model.predict_proba(X_train_final)[:, 1]

# Ensemble: average probability
avg_proba = (xgb_proba + lr_proba) / 2

# Tune threshold for best F1
best_f1 = 0
best_thresh = 0.5
for thresh in np.arange(0.1, 0.9, 0.05):
    y_ens = (avg_proba >= thresh).astype(int)
    current_f1 = f1_score(y_train, y_ens)
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_thresh = thresh

print("Best Threshold:", best_thresh)
print("Ensemble F1 Score:", best_f1)

# --- Predict on Test Set Using Ensemble ---
print("Making predictions on test data...")
test_xgb_proba = xgb_model.predict_proba(X_test_final)[:, 1]
test_lr_proba = lr_model.predict_proba(X_test_final)[:, 1]

final_proba = (test_xgb_proba + test_lr_proba) / 2
final_preds = (final_proba >= best_thresh).astype(int)

# --- Prepare Submission ---
print("Saving submission file...")
submission = pd.DataFrame({
    "id": test_df["id"],
    "target": final_preds
})

submission.to_csv("submission.csv", index=False)
print("Submission file saved with ensemble predictions!")

# Optional: Save models and encoders
joblib.dump(xgb_model, 'xgb_ensemble_model.pkl')
joblib.dump(lr_model, 'lr_ensemble_model.pkl')
joblib.dump(encoder, 'location_encoder.pkl')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing text...
Generating BERT embeddings...


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Processing location features...
Processing keyword features...


Batches:   0%|          | 0/238 [00:00<?, ?it/s]

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

Adding text length feature...
Combining all features...
Training models...
Best Threshold: 0.5000000000000001
Ensemble F1 Score: 0.976965601965602
Making predictions on test data...
Saving submission file...
Submission file saved with ensemble predictions!


['location_encoder.pkl']

In [65]:
import os
print(os.listdir())

['.anaconda', '.cache', '.conda', '.condarc', '.continuum', '.idlerc', '.ipynb_checkpoints', '.ipython', '.jupyter', '.keras', '.matplotlib', '.spyder-py3', 'anaconda3', 'anaconda_projects', 'AppData', 'Application Data', 'Contacts', 'Cookies', 'disaster_tweet_model.pkl', 'Documents', 'Downloads', 'Favorites', 'Fraud Prediction.ipynb', 'Fraud.csv', 'IntelGraphicsProfiles', 'Links', 'Local Settings', 'Music', 'My Documents', 'NetHood', 'NLP Disaster Tweets.ipynb', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TM.blf', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{a2332f18-cdbf-11ec-8680-002248483d79}.TMContainer00000000000000000002.regtrans-ms', 'ntuser.ini', 'OneDrive', 'PrintHood', 'Recent', 'Saved Games', 'Searches', 'SendTo', 'Start Menu', 'submission.csv', 'Templates', 'test.csv', 'train.csv', 'Videos']


In [69]:
from IPython.display import display, FileLink
display(FileLink("submission.csv"))