### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
# write your code from here
!pip install pandas scikit-learn nltk


Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting regex>=2021.8.3
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m781.7/781.7 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('stopwords')

# Step 1: Load SMS Spam dataset
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=["label", "message"])

# Step 2: Encode labels (ham=0, spam=1)
df["label"] = LabelEncoder().fit_transform(df["label"])

# Step 3: Text preprocessing
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()  # lowercase
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]  # remove stopwords + stem
    return " ".join(tokens)

df["cleaned"] = df["message"].apply(preprocess_text)

# Step 4: Split data
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned"], df["label"], test_size=0.2, random_state=42
)

# Step 5: TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 6: Show example features
print("\n🔤 Sample TF-IDF Features:")
print(pd.DataFrame(X_train_tfidf.toarray(), columns=vectorizer.get_feature_names_out()).head())


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



🔤 Sample TF-IDF Features:
   0800  08000839402  08000930705   10       100  1000  10p  10pmin  11mth  \
0   0.0          0.0          0.0  0.0  0.366331   0.0  0.0     0.0    0.0   
1   0.0          0.0          0.0  0.0  0.000000   0.0  0.0     0.0    0.0   
2   0.0          0.0          0.0  0.0  0.000000   0.0  0.0     0.0    0.0   
3   0.0          0.0          0.0  0.0  0.000000   0.0  0.0     0.0    0.0   
4   0.0          0.0          0.0  0.0  0.000000   0.0  0.0     0.0    0.0   

    12  ...  year  yep  yesterday  yet   yo  youll  your  youv   yr  yup  
0  0.0  ...   0.0  0.0        0.0  0.0  0.0    0.0   0.0   0.0  0.0  0.0  
1  0.0  ...   0.0  0.0        0.0  0.0  0.0    0.0   0.0   0.0  0.0  0.0  
2  0.0  ...   0.0  0.0        0.0  0.0  0.0    0.0   0.0   0.0  0.0  0.0  
3  0.0  ...   0.0  0.0        0.0  0.0  0.0    0.0   0.0   0.0  0.0  0.0  
4  0.0  ...   0.0  0.0        0.0  0.0  0.0    0.0   0.0   0.0  0.0  0.0  

[5 rows x 1000 columns]
