In [20]:
# TODO 1: Setup and Import Everything
import sys
import os
import pickle
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Put the parent of the current working directory on sys.path so the
# project-local `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"‚úÖ Added to path: {project_root}")

# Project-local classifier definition (needs the path entry added above)
from src.models.baseline_model import DisasterTweetClassifier

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

print("\n" + "="*50)
print("SETUP COMPLETE - Ready for predictions!")
print("="*50)


Using device: cpu

SETUP COMPLETE - Ready for predictions!


In [21]:
# TODO 2: Load Vocabulary and Test Data

print("Loading vocabulary...")
# Open the pickle inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load vocabulary files produced by this project's own pipeline.
with open('../data/processed/vocab_dict.pkl', 'rb') as vocab_file:
    vocab_dict = pickle.load(vocab_file)
vocab_size = len(vocab_dict)
print(f"‚úÖ Vocabulary loaded: {vocab_size} words")

print("\nLoading test data...")
# Pre-cleaned test set; the 'text_clean' column is what later cells feed to the model
test_df = pd.read_csv("../data/interim/test_cleaned.csv")
print(f"‚úÖ Test data loaded: {len(test_df)} samples")
print(f"Columns: {test_df.columns.tolist()}")
print(f"\nFirst few rows:")
print(test_df.head())


Loading vocabulary...
‚úÖ Vocabulary loaded: 14646 words

Loading test data...
‚úÖ Test data loaded: 3263 samples
Columns: ['id', 'keyword', 'location', 'text', 'has_hashtag', 'has_mention', 'has_url', 'number_urls', 'number_hashtags', 'number_mentions', 'word_count', 'text_clean', 'emojis_text', 'word_count_difference']

First few rows:
   id keyword location                                               text  \
0   0     NaN      NaN                 Just happened a terrible car crash   
1   2     NaN      NaN  Heard about #earthquake is different cities, s...   
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...   
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires   
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan   

   has_hashtag  has_mention  has_url  number_urls  number_hashtags  \
0        False        False    False            0                0   
1         True        False    False            0        

In [22]:
# TODO 3: Recreate Helper Functions (Same as Training!)

# Text-to-sequence function (MUST be identical to training!)
def text_to_sequence(text, vocab_dict, max_length=50, pad_idx=0, unk_idx=1):
    """Encode a text as a fixed-length list of vocabulary indices.

    The text is lower-cased and split on whitespace. Each token is looked up
    in ``vocab_dict``; tokens that are missing map to ``unk_idx``. The result
    is cut to ``max_length`` entries or right-padded with ``pad_idx`` until it
    has ``max_length`` entries.

    Args:
        text: String to encode.
        vocab_dict: Mapping from token to integer index.
        max_length: Length of the returned list.
        pad_idx: Index used for padding positions.
        unk_idx: Index used for out-of-vocabulary tokens.

    Returns:
        A list of indices of length ``max_length``.
    """
    # Truncate first: tokens beyond max_length never reach the output anyway
    tokens = text.lower().split()[:max_length]
    indices = [vocab_dict.get(token, unk_idx) for token in tokens]

    # A non-positive multiplier yields an empty list, so full-length
    # sequences are returned without padding.
    padding = [pad_idx] * (max_length - len(indices))
    return indices + padding

# Dataset class (SAME AS TRAINING!)
class DisasterTweetsDataset(Dataset):
    """Label-free dataset that serves raw tweet strings by position.

    Items are returned untouched; ``vocab_dict`` and ``max_length`` are only
    stored on the instance and are not used by the methods defined here.
    """

    def __init__(self, texts, vocab_dict, max_length=50):
        """Keep references to the texts and the encoding settings.

        Args:
            texts: Collection of texts; must support ``len()`` and ``.iloc``
                positional access (e.g. a pandas Series).
            vocab_dict: Token-to-index mapping, stored as given.
            max_length: Sequence length setting, stored as given.
        """
        self.texts = texts
        self.vocab_dict = vocab_dict
        self.max_length = max_length

    def __len__(self):
        """Return how many texts the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text at position ``idx`` (positional, not label-based)."""
        return self.texts.iloc[idx]

# Collate function (SAME AS TRAINING!)
def collate_fn(batch, vocab_dict, max_length):
    """Encode a batch of texts and stack the encodings into one tensor.

    Args:
        batch: Iterable of text strings.
        vocab_dict: Token-to-index mapping passed to ``text_to_sequence``.
        max_length: Per-text sequence length passed to ``text_to_sequence``.

    Returns:
        A tensor built from the list of encoded sequences, one row per text.
    """
    encoded = []
    for text in batch:
        encoded.append(text_to_sequence(text, vocab_dict, max_length))
    return torch.tensor(encoded)

# Status messages only: nothing in this cell checks the helpers against the
# training notebook. NOTE(review): confirm parity with the training code by hand.
print("‚úÖ Helper functions defined (identical to training)")
print("‚úÖ Ready to create test DataLoader")


‚úÖ Helper functions defined (identical to training)
‚úÖ Ready to create test DataLoader


In [23]:
# TODO 4: Create Test DataLoader

print("Creating test DataLoader...")

# Loader settings defined once so the dataset, the collate function and the
# printed summary cannot drift apart.
BATCH_SIZE = 32
MAX_LENGTH = 50  # sequence length; per the notes in this notebook it must match training

# Test data has no labels, so the dataset yields raw cleaned texts only
test_dataset = DisasterTweetsDataset(test_df["text_clean"], vocab_dict, max_length=MAX_LENGTH)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # IMPORTANT: keep row order so predictions line up with ids
    # Bind MAX_LENGTH as a default argument so the lambda does not depend on
    # a later rebinding of the module-level name.
    collate_fn=lambda batch, _max_length=MAX_LENGTH: collate_fn(batch, vocab_dict, max_length=_max_length)
)

print(f"‚úÖ Test DataLoader created")
print(f"Number of batches: {len(test_loader)}")
print(f"Batch size: {test_loader.batch_size}")
print(f"Total samples: {len(test_df)}")

# Smoke-test the loader: pull one batch and show its shape
for batch in test_loader:
    print(f"‚úÖ Batch shape: {batch.shape}")
    break


Creating test DataLoader...
‚úÖ Test DataLoader created
Number of batches: 102
Batch size: 32
Total samples: 3263
‚úÖ Batch shape: torch.Size([32, 50])


In [24]:
# TODO 5: Load Trained Model

print("Loading trained model...")

# Architecture hyperparameters; per the note in this notebook they have to
# match the values used when the checkpoint was trained.
embedding_dim = 50
hidden_dim = 64

# Rebuild the network with the same layout before loading weights into it
model = DisasterTweetClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    padding_idx=0,
)

# Read the checkpoint onto the active device, then copy the weights in
model_path = '../src/models/disaster_tweet_classifier.pth'
state_dict = torch.load(model_path, map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)

# Evaluation mode for inference (the model contains a Dropout layer)
model.eval()

print(f"‚úÖ Model loaded from: {model_path}")
print(f"‚úÖ Model set to evaluation mode")
print(f"Model architecture:")
print(model)


Loading trained model...
‚úÖ Model loaded from: ../src/models/disaster_tweet_classifier.pth
‚úÖ Model set to evaluation mode
Model architecture:
DisasterTweetClassifier(
  (embedding): Embedding(14646, 50, padding_idx=0)
  (fc1): Linear(in_features=50, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.6, inplace=False)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


In [25]:
# TODO 6: Generate Predictions on Test Set

print("Generating predictions on test set...")
print("This may take a minute...")

# One Python int (0 or 1) per test row, in loader order
all_predictions = []

# No gradient computation needed for inference
with torch.no_grad():
    for batch_idx, texts in enumerate(test_loader):
        # Move batch to device
        texts = texts.to(device)

        # Forward pass
        outputs = model(texts)

        # Flatten to one value per sample. reshape(-1) is used instead of a
        # bare squeeze(): squeeze() would also drop the batch dimension when
        # the final batch holds a single sample, producing a 0-d tensor that
        # cannot be iterated by extend().
        probabilities = torch.sigmoid(outputs.reshape(-1))

        # sigmoid(output) > 0.5 -> class 1 (disaster), else -> class 0
        predictions = (probabilities > 0.5).int()

        # Store as plain Python ints rather than numpy scalars
        all_predictions.extend(predictions.cpu().tolist())

        # Print progress every 20 batches; the count comes from what was
        # actually collected, not from an assumed batch size.
        if (batch_idx + 1) % 20 == 0:
            print(f"Processed {len(all_predictions)} / {len(test_df)} samples...")

n_total = len(all_predictions)
n_disaster = sum(all_predictions)
# Guard the ratio so an empty loader reports 0% instead of dividing by zero
disaster_ratio = n_disaster / n_total if n_total else 0.0

print(f"\n‚úÖ Predictions complete!")
print(f"Total predictions: {n_total}")
print(f"Predictions distribution:")
print(f"  - Disaster (1): {n_disaster}")
print(f"  - Not Disaster (0): {n_total - n_disaster}")
print(f"  - Disaster ratio: {disaster_ratio:.2%}")


Generating predictions on test set...
This may take a minute...
Processed 640 / 3263 samples...
Processed 1280 / 3263 samples...
Processed 1920 / 3263 samples...


Processed 2560 / 3263 samples...
Processed 3200 / 3263 samples...

‚úÖ Predictions complete!
Total predictions: 3263
Predictions distribution:
  - Disaster (1): 1275
  - Not Disaster (0): 1988
  - Disaster ratio: 39.07%


In [26]:
# TODO 7: Create Submission DataFrame

print("Creating submission file...")

# Load original test data to get the 'id' column
original_test = pd.read_csv("../data/raw/test.csv")

# Predictions were produced row by row from test_df (the cleaned file), while
# the ids below come from the raw file. Fail fast if the two are not aligned,
# because a silent mismatch would attach predictions to the wrong tweets.
if len(all_predictions) != len(original_test):
    raise ValueError(
        f"Got {len(all_predictions)} predictions for {len(original_test)} rows in the raw test file"
    )
if not np.array_equal(test_df['id'].to_numpy(), original_test['id'].to_numpy()):
    raise ValueError("Row ids of the cleaned test data do not match the raw test file")

# Create submission DataFrame with required format
submission = pd.DataFrame({
    'id': original_test['id'],
    'target': all_predictions
})

print(f"‚úÖ Submission DataFrame created")
print(f"\nSubmission shape: {submission.shape}")
print(f"Columns: {submission.columns.tolist()}")
print(f"\nFirst few rows:")
print(submission.head(10))
print(f"\nLast few rows:")
print(submission.tail(10))


Creating submission file...
‚úÖ Submission DataFrame created

Submission shape: (3263, 2)
Columns: ['id', 'target']

First few rows:
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1
5  12       1
6  21       0
7  22       0
8  27       0
9  29       0

Last few rows:
         id  target
3253  10838       1
3254  10845       1
3255  10856       1
3256  10857       0
3257  10858       1
3258  10861       0
3259  10865       1
3260  10868       1
3261  10874       1
3262  10875       1


In [27]:
# TODO 8: Validate Submission Format

print("Validating submission format...")
print("="*50)

# Set to False by any failing check so the closing banner reflects the
# actual outcome instead of always announcing success.
validation_ok = True

# Check 1: Correct columns
expected_columns = ['id', 'target']
if list(submission.columns) == expected_columns:
    print("‚úÖ Columns correct: ['id', 'target']")
else:
    validation_ok = False
    print(f"‚ùå ERROR: Expected {expected_columns}, got {list(submission.columns)}")

# Check 2: Correct number of rows
expected_rows = len(original_test)
if len(submission) == expected_rows:
    print(f"‚úÖ Row count correct: {expected_rows}")
else:
    validation_ok = False
    print(f"‚ùå ERROR: Expected {expected_rows} rows, got {len(submission)}")

# Check 3: No missing values
if submission.isnull().sum().sum() == 0:
    print("‚úÖ No missing values")
else:
    validation_ok = False
    print(f"‚ùå ERROR: Found missing values:\n{submission.isnull().sum()}")

# Check 4: Target values are 0 or 1
unique_targets = submission['target'].unique()
if set(unique_targets).issubset({0, 1}):
    print(f"‚úÖ Target values correct: {sorted(unique_targets)}")
else:
    validation_ok = False
    print(f"‚ùå ERROR: Target should be 0 or 1, found: {unique_targets}")

# Check 5: ID column matches original
if submission['id'].equals(original_test['id']):
    print("‚úÖ ID column matches original test data")
else:
    validation_ok = False
    print("‚ùå WARNING: ID column mismatch")

print("="*50)
if validation_ok:
    print("‚úÖ ALL VALIDATION CHECKS PASSED!")
    print("Ready to save and submit to Kaggle!")
else:
    print("VALIDATION FAILED - fix the problems reported above before submitting.")


Validating submission format...
‚úÖ Columns correct: ['id', 'target']
‚úÖ Row count correct: 3263
‚úÖ No missing values
‚úÖ Target values correct: [0, 1]
‚úÖ ID column matches original test data
‚úÖ ALL VALIDATION CHECKS PASSED!
Ready to save and submit to Kaggle!


In [28]:
# TODO 9: Save Submission File

# Create submissions directory if it doesn't exist
os.makedirs('../data/submissions', exist_ok=True)

# Save submission file (index=False: only the 'id' and 'target' columns are written)
submission_path = '../data/submissions/submission.csv'
submission.to_csv(submission_path, index=False)

print(f"‚úÖ Submission saved to: {submission_path}")
print(f"\nFile size: {os.path.getsize(submission_path) / 1024:.2f} KB")
print(f"\n{'='*50}")
print("üéâ SUBMISSION FILE READY!")
# Closing divider; the original literal "='*50}" printed itself verbatim
print("="*50)
print("\nNext steps:")
print("1. Go to: https://www.kaggle.com/competitions/nlp-getting-started/submit")
print("2. Click 'Submit Predictions'")
# Point at the file that was actually written above
print(f"3. Upload: {submission_path}")
print("4. Wait for your score!")
print("\nüìä Expected F1-Score: ~0.76-0.78 (based on validation)")
print("üéØ Your validation accuracy was 80%, so expect similar on test set!")


‚úÖ Submission saved to: ../data/submissions/submission.csv

File size: 22.21 KB

üéâ SUBMISSION FILE READY!
='*50}

Next steps:
1. Go to: https://www.kaggle.com/competitions/nlp-getting-started/submit
2. Click 'Submit Predictions'
3. Upload: submissions/submission.csv
4. Wait for your score!

üìä Expected F1-Score: ~0.76-0.78 (based on validation)
üéØ Your validation accuracy was 80%, so expect similar on test set!


In [29]:
# OPTIONAL: Analyze Some Predictions

print("Sample predictions to review:")
print("="*80)

# Seeded generator so the same tweets are shown on every Restart & Run All;
# the sample size is capped so small submissions do not raise.
rng = np.random.default_rng(42)
n_samples = min(10, len(submission))
sample_indices = rng.choice(len(submission), n_samples, replace=False)

for idx in sample_indices:
    tweet_id = submission.iloc[idx]['id']
    prediction = submission.iloc[idx]['target']
    # Raw text is looked up by id; cleaned text is taken by row position
    original_text = original_test[original_test['id'] == tweet_id]['text'].values[0]
    cleaned_text = test_df.iloc[idx]['text_clean']

    pred_label = "üö® DISASTER" if prediction == 1 else "‚úÖ NOT DISASTER"

    print(f"\nID: {tweet_id} | Prediction: {pred_label}")
    print(f"Original: {original_text[:100]}...")
    print(f"Cleaned:  {cleaned_text[:100]}...")
    print("-"*80)


Sample predictions to review:

ID: 4390 | Prediction: üö® DISASTER
Original: 1.9 magnitude #earthquake. 16 km from Anchorage AK #UnitedStates http://t.co/wVu08yJGOK...
Cleaned:  magnitude earthquake anchorage unitedstates...
--------------------------------------------------------------------------------

ID: 9968 | Prediction: ‚úÖ NOT DISASTER
Original: TheDIYHacks: RT Illusionimagess: A tsunami cloud. http://t.co/H6t4x1EVTx...
Cleaned:  thediyhacks tsunami cloud...
--------------------------------------------------------------------------------

ID: 6205 | Prediction: ‚úÖ NOT DISASTER
Original: Remove http://t.co/GYcBuXfYGP Browser Hijack... http://t.co/FuQH0yKsdg...
Cleaned:  remove browser hijack...
--------------------------------------------------------------------------------

ID: 10196 | Prediction: üö® DISASTER
Original: Violent Forces Radio: Now Playing Axegressor - Psalm Before the Storm
TuneIn Player @ http://t.co/Xs...
Cleaned:  violent forces radio playing axegressor ps

---

## 🎉 Congratulations!

You've completed the full machine learning pipeline:

### ✅ What You Accomplished:
1. **Data Exploration** - Understood your dataset deeply
2. **Preprocessing** - Built a robust text cleaning pipeline
3. **Feature Engineering** - Created vocabulary and data loaders
4. **Model Architecture** - Designed a custom PyTorch classifier
5. **Training** - Achieved 80% validation accuracy through systematic tuning
6. **Evaluation** - Comprehensive metrics analysis
7. **Submission** - Generated predictions for Kaggle!

### 📊 Your Model Performance:
- **Validation Accuracy**: 80%
- **F1-Score**: 0.76
- **Precision**: 78%
- **Recall**: 74%

### 🎯 What to Expect on Kaggle:
- Your test score should be close to your validation score (~76-78% F1)
- If it's significantly different, there might be:
  - Different data distribution
  - Overfitting (if much worse)
  - Lucky predictions (if much better)

### 🚀 Next Steps:
1. **Submit and get your baseline score**
2. **Analyze the leaderboard position**
3. **Decide on improvements:**
   - Try different architectures (LSTM, GRU)
   - Experiment with pre-trained embeddings (GloVe, Word2Vec)
   - Move to Phase 2: **Transformers** (BERT, RoBERTa) for potential +5-10% boost!

### 📚 Learning Achievements:
You now understand:
- End-to-end ML pipeline
- Text preprocessing for NLP
- PyTorch model development
- Training loop implementation
- Hyperparameter tuning
- Overfitting diagnosis and fixes
- Model evaluation and metrics

**This is professional-level ML engineering work!** 🎓💪

---

## 💡 Reflection Questions:
1. How did your model perform compared to your expectations?
2. What was the hardest part of this project?
3. What would you try differently next time?
4. Are you ready to try Transformers and beat your own score?

**Good luck with your submission!** 🍀🚀


# 🚀 Kaggle Submission - Make Predictions on Test Set

## 📚 Overview

Time to make predictions and submit to Kaggle! You'll:
- Load your trained model
- Preprocess test data
- Generate predictions
- Create submission file
- Submit to Kaggle competition

## 🎯 Learning Objectives

1. **Load trained model** from saved state
2. **Apply preprocessing** to new, unseen test data
3. **Generate predictions** in batch mode
4. **Create Kaggle submission** format
5. **Submit and get feedback** from leaderboard

---

## TODO 1: Load Trained Model and Test Data

- Load your best model from `../src/models/disaster_tweet_classifier.pth`
- Load test data from `../data/raw/test.csv`
- Load your vocabulary (vocab_dict)
- Set model to eval mode

---

## TODO 2: Preprocess Test Data

Apply the SAME preprocessing as training data:
- Use functions from `01_preprocessing.ipynb`
- Convert text to sequences using your vocabulary
- Handle unknown words with `<UNK>` token
- Pad/truncate to same max_length (50)
- Create DataLoader (no shuffle needed)

**Critical**: Preprocessing must match training exactly!

---

## TODO 3: Generate Predictions

```python
model.eval()
predictions = []

with torch.no_grad():
    for texts in test_loader:
        outputs = model(texts)
        preds = (torch.sigmoid(outputs) > 0.5).int()
        predictions.extend(preds.cpu().numpy())
```

---

## TODO 4: Create Submission File

Kaggle expects this format:
```
id,target
0,1
2,1
3,1
9,0
...
```

```python
submission = pd.DataFrame({
    'id': test_df['id'],
    'target': predictions
})
submission.to_csv('../data/submissions/submission.csv', index=False)
```

---

## TODO 5: Validate Submission Format

Check before submitting:
- Correct number of rows (3,263 for this competition)
- Columns are 'id' and 'target'
- No missing values
- target is 0 or 1 only

---

## TODO 6: Submit to Kaggle 🎉

1. Go to competition page
2. Click "Submit Predictions"
3. Upload your `submission.csv`
4. Wait for score!

**Competition metric**: F1-Score

---

## 💡 Next Steps

After your first submission:
- Analyze which tweets were misclassified
- Try different architectures (LSTM, GRU)
- Experiment with hyperparameters
- Move to Phase 2: HuggingFace Transformers!
