### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [None]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources up front. 'stopwords' backs the stop-word filter
# used during preprocessing; 'punkt' is downloaded as well, although nothing
# in the visible code uses it.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# A tiny hand-written sample in the style of the SMS Spam Collection,
# kept as (label, text) records so each message sits next to its label.
SAMPLE_MESSAGES = [
    (
        'ham',
        'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Then so on...',
    ),
    (
        'spam',
        'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C\'s apply 08452810075over18\'s',
    ),
    (
        'ham',
        'Ok lar... Joking wif u oni...',
    ),
    (
        'ham',
        'U dun say so early hor... U c already then say...',
    ),
    (
        'spam',
        'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.',
    ),
]

# Split the records back into the column-oriented mapping pandas expects.
sample_labels, sample_texts = zip(*SAMPLE_MESSAGES)
data = {
    'label': list(sample_labels),
    'text': list(sample_texts),
}
df = pd.DataFrame(data)
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words() for every token
# repeats loop-invariant work, and a set gives O(1) membership tests instead
# of scanning a list.
stop_words = set(stopwords.words('english'))

# Anything that is not an ASCII letter (digits, punctuation, symbols) is
# replaced by a space before tokenising.
non_letters = re.compile(r'[^a-zA-Z]')


def preprocess_text(text):
    """Return *text* cleaned, stop-word filtered and stemmed.

    Steps, in order: replace non-letter characters with spaces, lowercase,
    split on whitespace, drop English stop words, Porter-stem the remaining
    tokens, and join them back with single spaces.

    Args:
        text: Raw message string.

    Returns:
        The processed message as a single space-separated string (empty if
        no tokens survive filtering).
    """
    tokens = non_letters.sub(' ', text).lower().split()
    # Stop words are matched on the unstemmed token, then survivors are stemmed.
    return ' '.join(ps.stem(token) for token in tokens if token not in stop_words)


# Iterate the column values directly so this works for any DataFrame index,
# not only the default 0..n-1 RangeIndex.
corpus = [preprocess_text(message) for message in df['text']]
# Cap the vocabulary at 100 terms: reduced max_features for the small
# simulated data.
tfidf = TfidfVectorizer(max_features=100)

# fit_transform returns a sparse matrix; convert it to a dense array so rows
# can be sliced and printed directly.
X = tfidf.fit_transform(corpus).toarray()

# Encode the target explicitly as 1 for 'spam' and 0 otherwise. Selecting a
# get_dummies column by position depends on alphabetical column order, fails
# when only one class is present, and yields a dtype that differs between
# pandas versions; the comparison is unambiguous and always integer-typed.
y = (df['label'] == 'spam').astype(int).to_numpy()
# Show the raw data as loaded.
print("OriginalDataHead:")
print(df.head())

# Show up to the first five processed messages. Slicing (rather than indexing
# 0..4) avoids an IndexError when the corpus holds fewer than five entries.
print("\nProcessedText(first5entriesinCorpus):")
for processed in corpus[:5]:
    print(processed)

# Report the dimensions of the feature matrix and the target vector.
print("\nShapeofTF-IDFfeatures(X):", X.shape)
print("\nShapeofTarget(y):", y.shape)

# Peek at the first five TF-IDF weights of the first message.
print("\nFirstrowofTF-IDFfeatures(X[0]):\n", X[0][:5])