### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
import pandas as pd
import re
import io
import requests
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Step 1: Download ZIP and read the SMSSpamCollection file inside it
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))

# Read the specific file inside the ZIP
with z.open('SMSSpamCollection') as f:
    df = pd.read_csv(f, sep='\t', header=None, names=['label', 'message'])

# Step 2: Basic text preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_message'] = df['message'].apply(preprocess_text)

# Step 3: Extract TF-IDF features
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(df['clean_message'])

print("TF-IDF feature matrix shape:", X_tfidf.shape)

# Optional: Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df['label'], test_size=0.2, random_state=42, stratify=df['label'])

print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")

# Show some example TF-IDF features
feature_names = tfidf.get_feature_names_out()
print("Top 10 TF-IDF features:", feature_names[:10])

TF-IDF feature matrix shape: (5572, 3000)
Training samples: 4457, Testing samples: 1115
Top 10 TF-IDF features: ['020603' '07xxxxxxxxx' '0800' '08000839402' '08000930705' '08000938767'
 '08001950382' '08002986906' '0808' '0845']
