# 4. Five-fold Cross-Validation and Blending

**Objective**: Move from a single train/validation split to 5-fold stratified cross-validation, and blend (average) the test-set predictions of the five fold models for a more reliable final prediction.


## 4.1. Setup and Data Cleaning

In [1]:
# Setup and data cleaning

## imports
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

## Read the three competition CSVs (train, test, sample submission) in one pass
train_df, test_df, sample_submission_df = map(
    pd.read_csv,
    ('../data/train.csv', '../data/test.csv', '../data/sample_submission.csv'),
)

## Define clean3
def clean3(text):
  """Normalise a raw tweet into lowercase alphanumeric words separated by single spaces.

  Steps, in order: lowercase; drop the '#' of hashtags but keep the word;
  remove http(s) and www URLs; remove @mentions; replace every character
  that is not a-z, 0-9 or whitespace with a space; collapse whitespace runs
  and trim both ends.

  Parameters
  ----------
  text : str or missing value
      Raw tweet text. Any non-string value (e.g. None, or the float NaN that
      pandas uses for an empty cell) is treated as an empty tweet.

  Returns
  -------
  str
      The cleaned text; "" when the input is missing or nothing survives cleaning.
  """
  # A missing cell is not a str; calling .lower() on it would raise AttributeError.
  if not isinstance(text, str):
    return ""

  text = text.lower()  # lowercase first so the a-z patterns below match every letter
  text = re.sub(r"#([a-z0-9_]+)", r"\1", text)  # hashtag -> plain word
  text = re.sub(r'http\S+', "", text)  # remove http/https URLs
  text = re.sub(r"www\.\S+", "", text)  # remove www. URLs
  text = re.sub(r'@\w+', "", text)  # remove @mentions
  text = re.sub(r"[^a-z0-9\s]", " ", text)  # anything other than a-z, 0-9, whitespace -> space
  text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace runs and trim
  return text

## Add a 'clean_text' column to both frames by running clean3 over every raw tweet
train_df['clean_text'] = train_df['text'].map(clean3)
test_df['clean_text'] = test_df['text'].map(clean3)
print("Data loaded and cleaned successfully!")


Data loaded and cleaned successfully!


## 4.2. Configure Cross-Validation Strategy

Here, we define our constants and modeling parameters.
- `N_SPLITS = 5`: We'll be doing 5-fold CV.
- `StratifiedKFold`: We use this specific CV method to ensure the percentage of disaster vs. non-disaster tweets is the same in each fold as it is in the overall dataset.
- `random_state=42`: This guarantees that the "shuffle" is the same every time we run the code, making our results reproducible.

In [2]:
# Configuration: label column, fold count and the model/vectorizer settings
TARGET = 'target'
N_SPLITS = 5
BEST_PARAMS = {
    'C': 1.0,
    'ngram_range': (1, 2),
    'min_df': 2,
}

# Stratified, shuffled splitter with a fixed seed so the folds are reproducible
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Zero-filled buffers: one slot per training row (out-of-fold predictions)
# and one per test row (fold-averaged predictions)
oof_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])

print("CV strategy and prediction arrays are set up.")


CV strategy and prediction arrays are set up.


## 4.3. Run the 5-Fold CV Loop

In [3]:
# Creating CV loop

# Reset the accumulators at the top of this cell so it is safe to re-run on its own.
# test_preds is built up with `+=`; without the reset, a second run would add another
# full set of fold predictions on top of the previous one and break the 0.5 threshold.
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df[TARGET])):
  print(f"===== FOLD {fold+1} =====")

  # Splitting data. skf.split yields positional row indices, so select with .iloc;
  # label-based .loc only gives the same rows when the index is the default 0..n-1.
  X_train_fold = train_df['clean_text'].iloc[train_idx]
  y_train_fold = train_df[TARGET].iloc[train_idx]
  X_val_fold = train_df['clean_text'].iloc[val_idx]

  # Fitting the vectorizer on this fold's training rows only, so the vocabulary
  # and IDF weights never see the validation rows
  vectorizer = TfidfVectorizer(
    ngram_range=BEST_PARAMS['ngram_range'],
    min_df=BEST_PARAMS['min_df']
  )
  X_train_fold_vec = vectorizer.fit_transform(X_train_fold)

  # Using the fitted vectorizer to transform the validation and test sets
  X_val_fold_vec = vectorizer.transform(X_val_fold)
  X_test_vec = vectorizer.transform(test_df['clean_text'])

  # Training the model
  model = LogisticRegression(C=BEST_PARAMS['C'], solver='liblinear', random_state=42)
  model.fit(X_train_fold_vec, y_train_fold)

  # Predicting on validation data and storing in our OOF array
  # (val_idx is positional, which matches the plain NumPy buffer)
  val_preds_proba = model.predict_proba(X_val_fold_vec)[:, 1]
  oof_preds[val_idx] = val_preds_proba

  # Predict on test data and add this fold's 1/N_SPLITS share to the running average
  test_preds += model.predict_proba(X_test_vec)[:, 1] / N_SPLITS

print("\nCV loop finished!")

===== FOLD 1 =====
===== FOLD 2 =====
===== FOLD 3 =====
===== FOLD 4 =====
===== FOLD 5 =====

CV loop finished!


## 4.4. Evaluate Performance and Create Submission

In [None]:
# Score the out-of-fold probabilities against the true labels,
# turning them into 0/1 predictions with a 0.5 cut-off
oof_f1 = f1_score(
    train_df[TARGET],
    (oof_preds > 0.5).astype(int),
)
print(f"Overall OOF F1 Score: {oof_f1:.5f}")

# Build the submission from a copy of the sample file, filling the target
# column with the thresholded fold-averaged test probabilities
submission_df = sample_submission_df.copy()
submission_df[TARGET] = (test_preds > 0.5).astype(int)
submission_df.to_csv('submission_blend.csv', index=False)

print("Submission file 'submission_blend.csv' created successfully!")
submission_df.head()

Overall OOF F1 Score: 0.74529
Submission file 'submission_blend.csv' created successfully!


Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


- Result: the blended submission scored 0.80110 on Kaggle (compare with the overall OOF F1 of 0.74529 above).