# 3. Hyperparameter tuning and Error Analysis


## 3.1 Hyperparameter Tuning

In [1]:
# Importing libraries, data and clean3 function from Day-2

## Libraries
import re
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## Load train / validation data
train = pd.read_csv('../data/train.csv')

train_df, val_df = train_test_split(
  train,
  test_size=0.20,
  stratify=train['target'],
  random_state=42
)

## Define clean3
def clean3(text):
  text = text.lower() # lowercasing
  text = re.sub(r"#([a-z0-9_]+)", r"\1", text) # Hashtag to plain word
  text = re.sub(r'http\S+', "", text) # removing HTTP. URL
  text = re.sub(r"www\.\S+", "", text) # removing WWW. URL
  text = re.sub(r'@\w+', "", text) # removing @mentions
  text = re.sub(r"[^a-z0-9\s]", " ", text) #r emoving other characters other than a-z, 0-9 and whitespace
  text = re.sub(r"\s+", " ", text).strip() # Changing multiple spaces into one
  return text

##Apply clean3 
train_df['clean3'] = train_df['text'].apply(clean3)
val_df['clean3']   = val_df['text'].apply(clean3)

### 3.1.1 Define `eval_params`

This function trains a TF-IDF → Logistic Regression pipeline on **`train_df['clean3']`** and evaluates on **`val_df['clean3']`**. 
It returns the validation F1 score for the given `C`, `ngram_range`, and `min_df` parameters.


In [2]:

def eval_params(clean_col, C, ngram_range, min_df):

  # 1. Vectorize
  vect = TfidfVectorizer(ngram_range = ngram_range, min_df = min_df)
  X_tr = vect.fit_transform(train_df[clean_col])
  y_tr = train_df['target']

  X_vl = vect.transform(val_df[clean_col])
  y_vl = val_df['target']

  # 2. Train Logistic Regression
  lr = LogisticRegression(C=C, max_iter=1000)
  lr.fit(X_tr, y_tr)

  # 3. Predict & compute F1
  preds = lr.predict(X_vl)
  return f1_score(y_vl, preds)


### 3.1.2 Grid‐Search Loop

We will iterate over all combinations of C, ngram_range, and min_df. 
For each combination, we call `eval_params('clean3', C, ngram_range, min_df)` 
and track the best validation F1 and its corresponding parameters.

In [3]:
Cs = [0.01, 0.1, 1, 10]
ngram_ranges = [(1, 2), (1, 3), (2, 3), (3, 5)]
min_dfs = [1, 2, 5]

best_score = 0.0
best_params = None

for C in Cs:
  for ngram in ngram_ranges:
    for md in min_dfs:
      score = eval_params('clean3', C=C, ngram_range=ngram, min_df=md)
      if score > best_score:
        best_score = score
        best_params = (C, ngram, md)

print("Best validation F1:", best_score)
print("Best params: C =", best_params[0],
      ", ngram_range =", best_params[1],
      ", min_df =", best_params[2])


Best validation F1: 0.7777777777777778
Best params: C = 10 , ngram_range = (1, 3) , min_df = 2


## 3.2 Error Analysis
### 3.2.1 Re-fit on train_df with Best Hyperparameters

We will use:
- C = 10
- ngram_range = (1, 3)
- min_df = 2

Then we’ll obtain predicted probabilities on **val_df['clean3']** (so we can measure how “sure” the model was about each prediction).

In [5]:
import numpy as np

C_best, ngram_best, min_df_best = 10, (1, 3), 2

# Fit TF-IDF on train_df['clean3']
vect_best = TfidfVectorizer(ngram_range=ngram_best, min_df=min_df_best)
x_tr_best = vect_best.fit_transform(train_df['clean3'])
y_tr_best = train_df['target']

# Appluing Logistic regression with C=10
lr_best = LogisticRegression(C=C_best, max_iter=1000)
lr_best.fit(x_tr_best, y_tr_best)

# Transform val_df['clean3'] → get probabilities & predictions
X_vl_best = vect_best.transform(val_df['clean3'])
val_probs = lr_best.predict_proba(X_vl_best)[:, 1] 
val_preds = lr_best.predict(X_vl_best)


### 3.2.2 Compute Confidence Error and find top 20

For each validation tweet:
- If true label=1 ⇒ error_conf = (1 − probability).  
- If true label=0 ⇒ error_conf = (probability).  

We sort descending by error_conf and pick the top 20 tweets the model was most “confident yet wrong” on.

In [6]:
y_vl = val_df['target'].values

# Computing confidence error
error_conf = np.where(y_vl == 1, 1 - val_probs, val_probs)

# Attach to a copy of val_df
val_df_copy = val_df.copy()
val_df_copy['prob'] = val_probs
val_df_copy['pred'] = val_preds
val_df_copy['error_conf'] = error_conf

# Sort by error_conf descending and take top 20
most_wrong = val_df_copy.sort_values(by='error_conf', ascending=False).head(20)

# Display relevant columns
most_wrong[['text', 'clean3', 'target', 'pred', 'prob', 'error_conf']]

Unnamed: 0,text,clean3,target,pred,prob,error_conf
4154,You can never escape me. Bullets don't harm me...,you can never escape me bullets don t harm me ...,1,0,0.003221,0.996779
5435,Maid charged with stealing Dh30000 from police...,maid charged with stealing dh30000 from police...,0,1,0.987849,0.987849
1358,if firefighters acted like cops they'd drive a...,if firefighters acted like cops they d drive a...,0,1,0.983526,0.983526
6108,Do you feel like you are sinking in low self-i...,do you feel like you are sinking in low self i...,1,0,0.017163,0.982837
2905,I can't drown my demons they know how to swim,i can t drown my demons they know how to swim,1,0,0.01876,0.98124
3435,Chick masturbates a guy until she gets explode...,chick masturbates a guy until she gets explode...,1,0,0.022208,0.977792
6317,Keep shape your shoes ??#Amazon #foot #adjust ...,keep shape your shoes amazon foot adjust shape...,1,0,0.024,0.976
895,Bloody insomnia again! Grrrr!! #Insomnia,bloody insomnia again grrrr insomnia,1,0,0.024493,0.975507
6212,@PianoHands You don't know because you don't s...,you don t know because you don t smoke the way...,1,0,0.025282,0.974718
3909,@SatanaOfHell ever seen by far. A dreamy look ...,ever seen by far a dreamy look came over his f...,1,0,0.026304,0.973696


### 3.2.3 Inspect Top 20 Mistakes

Above are the 20 validation tweets our tuned model was most confident about but still misclassified. 
Each row suggests:
- `text` (raw tweet)
- `clean3` (preprocessed text)
- `target` (true label)
- `pred` (model’s predicted label)
- `prob` (predicted P(class=1))
- `error_conf` (how confidently wrong the model was)

Above Table shows that there are three common error types; Missing content, Ambiguous language and Bigram / Trigrams. 
Thus, we will make a short list of disaster words, in order to solve this problem.

In [14]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

disaster_words = [
    'earthquake', 'flood', 'hurricane', 'wildfire', 'tornado',
    'tsunami', 'blizzard', 'volcano', 'landslide', 'avalanche'
]

places = [
    'california', 'texas', 'new york', 'florida', 'london', 'paris',
    'india', 'japan', 'china', 'germany', 'france'
]

train_df['has_kw'] = train_df['clean3'].apply(lambda t: int(any(w in t.split() for w in disaster_words)))
val_df['has_kw']   = val_df['clean3'].apply(lambda t: int(any(w in t.split() for w in disaster_words)))

train_df['has_loc'] = train_df['text'].apply(lambda t: int(any(p in t.lower().split() for p in places)))
val_df['has_loc']   = val_df['text'].apply(lambda t: int(any(p in t.lower().split() for p in places)))

X_tr_text = vect_best.transform(train_df['clean3'])
X_vl_text = vect_best.transform(val_df['clean3'])

X_tr_kwflag = csr_matrix(train_df['has_kw'].values.reshape(-1, 1))
X_vl_kwflag = csr_matrix(val_df['has_kw'].values.reshape(-1, 1))

X_tr_locflag = csr_matrix(train_df['has_loc'].values.reshape(-1, 1))
X_vl_locflag = csr_matrix(val_df['has_loc'].values.reshape(-1, 1))

X_tr_combo = hstack([X_tr_text, X_tr_kwflag])
X_vl_combo = hstack([X_vl_text, X_vl_kwflag])

lr_kw = LogisticRegression(C=C_best, max_iter=1000)
lr_kw.fit(X_tr_combo, train_df['target'])
preds_kw = lr_kw.predict(X_vl_combo)
print("TF-IDF + has_kw Val F1:", f1_score(val_df['target'], preds_kw))

X_tr_combo2 = hstack([X_tr_text, X_tr_kwflag, X_tr_locflag])
X_vl_combo2 = hstack([X_vl_text, X_vl_kwflag, X_vl_locflag])

lr_kwloc = LogisticRegression(C=C_best, max_iter=1000)
lr_kwloc.fit(X_tr_combo2, train_df['target'])
preds_kwloc = lr_kwloc.predict(X_vl_combo2)
print("TF-IDF + has_kw + has_loc Val F1:", f1_score(val_df['target'], preds_kwloc))

results_flags = {
    "TF-IDF only": f1_score(val_df['target'], lr_best.predict(X_vl_best)),
    "TF-IDF + has_kw": f1_score(val_df['target'], preds_kw),
    "TF-IDF + has_kw + has_loc": f1_score(val_df['target'], preds_kwloc)
}
print(results_flags)


TF-IDF + has_kw Val F1: 0.7762128325508607
TF-IDF + has_kw + has_loc Val F1: 0.7756059421422987
{'TF-IDF only': 0.7777777777777778, 'TF-IDF + has_kw': 0.7762128325508607, 'TF-IDF + has_kw + has_loc': 0.7756059421422987}


Now with the 'TF-IDF only' method:


In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# 1) Load test set and apply clean3
test = pd.read_csv('../data/test.csv')
test['clean3'] = test['text'].apply(clean3)

# 2) Fit TF-IDF (1–3 grams, min_df=2) on full train_df['clean3']
vect_full = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
X_full = vect_full.fit_transform(train_df['clean3'])
y_full = train_df['target']

# 3) Train Logistic Regression with C=10
lr_full = LogisticRegression(C=10, max_iter=1000)
lr_full.fit(X_full, y_full)

# 4) Transform test['clean3'] and predict
X_test = vect_full.transform(test['clean3'])
test_preds = lr_full.predict(X_test)

# 5) Build submission and save
submission = pd.DataFrame({
    'id': test['id'],
    'target': test_preds
})
submission.to_csv('submission.csv', index=False)
print("submission.csv created!")

submission.csv created!


submission result was lower than before: 0.79037.

**Takeaway:**  
- All three “extra feature” trials lowered F1.  
- This tells us our baseline TF-IDF + LogReg is already capturing most signal—adding noisy keyword flags or char ngrams without further filtering did more harm than good.  
- Next, we’ll accept the baseline hyperparameters and move to 5-fold CV and blending.