# 2. Data preprocessing and TFIDF

In [1]:
# Loading the Data 
import pandas as pd
from sklearn.model_selection import train_test_split

import re

train = pd.read_csv('../data/train.csv')

train_df, val_df = train_test_split(
  train,
  test_size=0.20,
  stratify=train['target'],
  random_state=42
)

# Checking the code
print(train_df.shape)
print(val_df.shape)

(6090, 5)
(1523, 5)


## 2.1 Comparing two methods of Cleaning

In [2]:
# 1. Regex-Based Cleaning

def clean1(text):
  text = text.lower() # lowercasing
  text = re.sub(r'http\S+', '', text) #removing the URL
  text = re.sub(r'@\w+','',text) #removing the @mentions
  text = re.sub(r'[^a-z0-9\s]',' ',text) #removing punctuation
  text = re.sub(r'\s+', ' ', text).strip() # Changing multiple spaces into one
  return text

# 2. Using the tweet-preprocessor Library
import preprocessor as tp

def clean2(text):
  cleaned = tp.clean(text)
  return cleaned.lower()

In [3]:
# Comparing two methods

train_df['clean1'] = train_df['text'].apply(clean1)
train_df['clean2'] = train_df['text'].apply(clean2)

val_df['clean1']   = val_df['text'].apply(clean1)
val_df['clean2']   = val_df['text'].apply(clean2)

sample = train_df[['text', 'clean1', 'clean2']].sample(5, random_state=42)
sample.reset_index(drop=True)

Unnamed: 0,text,clean1,clean2
0,.@wwp is serving more than 75k veterans. 52k O...,is serving more than 75k veterans 52k oif oef ...,. is serving more than k veterans. k oif/oef v...
1,@RetiredFilth people in sydney woke up to the ...,people in sydney woke up to the whole sky bein...,people in sydney woke up to the whole sky bein...
2,South Side factory where worker electrocuted p...,south side factory where worker electrocuted p...,south side factory where worker electrocuted p...
3,New #Free #Porn #Clip! Taking Of Hostages Dang...,new free porn clip taking of hostages dangerou...,new ! taking of hostages dangerous for favors ...
4,The Art World's Seismic Shift Back to the Oddb...,the art world s seismic shift back to the oddb...,the art world's seismic shift back to the oddb...


- Clean2 seems good at removing generic noise, such as numbers or hashtags, compare to clean 1 which preserves those informations.
- However, Clean1 might introduce some noise beyond @mentions or URLs since it is more brute-force method. 
- For this challenge, however, preserving disaster-related keywords as well as removing noise would be important. Thus, we could make Clean 3 based on this result. 

  - For this new cleaning method, we should
    - Keep the word behind a hashtag
    - Remove URLs and @mentions
    - Preserve alphanumeric tokens
    - Lowercasing
  

In [5]:
# Clean3 based on Regex Cleaner
def clean3(text):
  text = text.lower() # lowercasing
  text = re.sub(r"#([a-z0-9_]+)", r"\1", text) # Hashtag to plain word
  text = re.sub(r'http\S+', "", text) # removing HTTP. URL
  text = re.sub(r"www\.\S+", "", text) # removing WWW. URL
  text = re.sub(r'@\w+', "", text) # removing @mentions
  text = re.sub(r"[^a-z0-9\s]", " ", text) #r emoving other characters other than a-z, 0-9 and whitespace
  text = re.sub(r"\s+", " ", text).strip() # Changing multiple spaces into one
  return text

# Applying clean 3
train_df['clean3'] = train_df['text'].apply(clean3)
val_df['clean3']   = val_df['text'].apply(clean3)

#comparing Clean 3
sample = train_df[['text', 'clean1', 'clean2', 'clean3']].sample(5, random_state=42)
sample.reset_index(drop=True)

Unnamed: 0,text,clean1,clean2,clean3
0,.@wwp is serving more than 75k veterans. 52k O...,is serving more than 75k veterans 52k oif oef ...,. is serving more than k veterans. k oif/oef v...,is serving more than 75k veterans 52k oif oef ...
1,@RetiredFilth people in sydney woke up to the ...,people in sydney woke up to the whole sky bein...,people in sydney woke up to the whole sky bein...,people in sydney woke up to the whole sky bein...
2,South Side factory where worker electrocuted p...,south side factory where worker electrocuted p...,south side factory where worker electrocuted p...,south side factory where worker electrocuted p...
3,New #Free #Porn #Clip! Taking Of Hostages Dang...,new free porn clip taking of hostages dangerou...,new ! taking of hostages dangerous for favors ...,new free porn clip taking of hostages dangerou...
4,The Art World's Seismic Shift Back to the Oddb...,the art world s seismic shift back to the oddb...,the art world's seismic shift back to the oddb...,the art world s seismic shift back to the oddb...


In [12]:
# Vectorize and Train TF-IDF for all cleaning methods

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def evaluate_cleaner (clean_col):
  # Training vectorizer 
  vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
  X_tr = vectorizer.fit_transform(train_df[clean_col])
  y_tr = train_df['target']

  # Transform validation set
  X_vl = vectorizer.transform(val_df[clean_col])
  Y_vl = val_df['target']

  # Fit logistic regression
  model = LogisticRegression(max_iter=1000)
  model.fit(X_tr, y_tr)

  # Predict on validation and compute F1
  preds = model.predict(X_vl)
  return f1_score(Y_vl, preds), vectorizer, model

# Evaluating all three cleaners
results = {}

for col in ['clean1', 'clean2', 'clean3']:
    f1_val, vect_obj, model_obj = evaluate_cleaner(col) 
    results[col] = f1_val

print("Validation F1 scores by cleaner:", results)

Validation F1 scores by cleaner: {'clean1': 0.772162386081193, 'clean2': 0.7612687813021702, 'clean3': 0.772162386081193}


- clean1 and clean3 were tied at 0.772, whilst clean 2 was slightly lower score at 0.761.
- We could guess that this is due to clean2 dropping numbers and hashtags.
- Going forward with clean3 would be better, since it is designed to remove noise that is irrelevant. 

## 2.2 Retraining and Making first submission

In [14]:
test = pd.read_csv('../data/test.csv')

test['clean3'] = test['text'].apply(clean3)

vect_full = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
X_full = vect_full.fit_transform(train_df['clean3'])
y_full = train_df['target']

lr_full = LogisticRegression(max_iter=1000)
lr_full.fit(X_full, y_full)

X_test = vect_full.transform(test['clean3'])
test_preds = lr_full.predict(X_test)

submission = pd.DataFrame({
    'id': test['id'],
    'target': test_preds
})
submission.to_csv('submission.csv', index=False)
print("submission.csv created!")

submission.csv created!


- Result: 0.79711 in Kaggle submission