# In [None]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK data packages this script relies on (tokenizer models and
# the stopword lists); each resource is requested in turn.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Small demo corpus: four short titles, each ending in punctuation so the
# cleaning step has something to strip.
_SAMPLE_TITLES = [
    'Example title one!',
    'Another title here?',
    'More about titles...',
    'Last title example.',
]

# Single-column frame holding the raw titles under 'title'.
data = {'title': _SAMPLE_TITLES}
df = pd.DataFrame(data)

# Lazily populated cache of the English stopword set (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, loading it from NLTK on first use only."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() hands back a freshly built list on every call;
        # calling it once and keeping a frozenset avoids rebuilding it per
        # token and makes each membership test O(1) instead of a list scan.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text: str) -> str:
    """Clean a piece of text for vectorization.

    The text is lowercased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``word_tokenize``, English
    stopwords are dropped, and the surviving tokens are joined with single
    spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty when no tokens survive the filtering.
    """
    # Convert text to lowercase so stopword matching is case-insensitive
    text = text.lower()
    # Remove punctuation in a single translate pass
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords, looking the set up once rather than once per token
    stop_words = _english_stopwords()
    words = [word for word in words if word not in stop_words]
    # Join words back to string
    return ' '.join(words)

# Clean every title element-wise and keep the result in a new column.
df['processed_title'] = df['title'].map(preprocess_text)

# Learn the vocabulary/IDF weights from the cleaned titles and encode them
# in one step with a default-configured vectorizer.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_title'])

# Densify the matrix and store one row array per document in the frame.
df['tfidf_vector'] = list(tfidf_matrix.toarray())

# Show the frame with the added columns.
print(df)
