### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:

# write your code from here
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string

# Load SMS Spam dataset (downloaded from UCI repository or use URL)
# Format: Label \t Text
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# Basic exploration
print(df.head())

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits (optional)
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = text.strip()
    return text

# Apply preprocessing
df['cleaned_message'] = df['message'].apply(preprocess_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_tfidf = vectorizer.fit_transform(df['cleaned_message'])

print(f"TF-IDF matrix shape: {X_tfidf.shape}")

# Example: show feature names
print("Sample TF-IDF features:", vectorizer.get_feature_names_out()[:10])


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
TF-IDF matrix shape: (5572, 500)
Sample TF-IDF features: ['able' 'abt' 'account' 'actually' 'address' 'aft' 'afternoon' 'age' 'ah'
 'aight']
