In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from scipy import sparse

In [2]:
# Read the pre-cleaned train and test splits (paths are relative to the
# current working directory) into two DataFrames in one pass.
train_df, test_df = map(
    pd.read_csv,
    (r"./cleaned_train.csv", r"./cleaned_test.csv"),
)


In [3]:
# Build one corpus per split from the 'text_clean' column; a missing value
# is replaced by an empty string so every row is still a document.
train_corpus, test_corpus = (
    frame['text_clean'].fillna("") for frame in (train_df, test_df)
)

# Fit TF-IDF (default settings) on the training corpus only, then reuse the
# fitted vectorizer to transform the test corpus.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_corpus)
test_features = vectorizer.transform(test_corpus)

In [4]:
# Preview: row count of the training matrix, number of entries in the fitted
# vocabulary, and the first ten vocabulary keys in the dict's iteration order.
print(
    f"Number of training documents: {train_features.shape[0]}",
    f"Vocabulary size: {len(vectorizer.vocabulary_)}",
    sep="\n",
)
print("Sample vocab:", list(vectorizer.vocabulary_)[:10])

Number of training documents: 7613
Vocabulary size: 15776
Sample vocab: ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah']


In [5]:
# Persist the fitted vectorizer with joblib. The path is relative, so the
# file is written to the current working directory; the resolved absolute
# location is printed afterwards.
vectorizer_path = "tfidf_vectorizer.pkl"
joblib.dump(value=vectorizer, filename=vectorizer_path)
print(f"Vectorizer saved to: {os.path.abspath(vectorizer_path)}")

Vectorizer saved to: /home/jovyan/Documents/cs549/tweet-classifier/datasets/tfidf_vectorizer.pkl


In [6]:
# Write the two sparse TF-IDF matrices as .npz files.
sparse.save_npz(file="tfidf_train_features.npz", matrix=train_features)
sparse.save_npz(file="tfidf_test_features.npz", matrix=test_features)

# Training labels: only the 'target' column, written without the index.
train_df[['target']].to_csv(path_or_buf="tfidf_train_labels.csv", index=False)

# test set has no target so save entire dataframe
test_df.to_csv(path_or_buf="tfidf_test_labels.csv", index=False)