# TF-IDF Embedding Implementation
### Vectorize the pre-processed data into PySpark dataframe using TF-IDF.
Libraries: Scikit-learn, PySpark

Author: Marcus KWAN TH

Last updated: 2025-11-14

In [None]:
import sys, os

# Prevent PySpark from using a different Python interpreter
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Add the root folder to sys.path before importing custom package
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
if root_path not in sys.path:
    sys.path.append(root_path)

from util.preprocessing import load_and_preprocess_data

In [None]:
# Import all necessary library
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

In [3]:
# Initialize Spark Session
ss  = SparkSession.builder \
        .appName("Marcus TF-IDF") \
        .getOrCreate()

# Add util.zip to PySpark context (need to build the util.zip first!)
spark = ss.sparkContext.addPyFile("../util.zip")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/14 15:20:44 WARN Utils: Your hostname, Marcuss-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.11.98.188 instead (on interface en0)
25/11/14 15:20:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/14 15:20:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/14 15:20:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load and preprocess training and testing data from the custom package
train_df = load_and_preprocess_data('../Twitter_data/traindata7.csv')
test_df = load_and_preprocess_data('../Twitter_data/testdata7.csv')

25/11/14 15:20:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [5]:
# Convert Spark DataFrames to Pandas for TF-IDF processing
train_pandas = train_df.toPandas()
test_pandas = test_df.toPandas()

                                                                                

In [6]:
# Extract documents and labels from training data
train_documents = train_pandas.iloc[:, 0].astype(str).tolist()
train_labels = train_pandas.iloc[:, 1].tolist()

In [7]:
# Extract documents and labels from testing data
test_documents = test_pandas.iloc[:, 0].astype(str).tolist()
test_labels = test_pandas.iloc[:, 1].tolist()

print(f"Training samples: {len(train_documents)}")
print(f"Testing samples: {len(test_documents)}")

Training samples: 596
Testing samples: 397


In [8]:
# === Apply TF-IDF Vectorizer - fit on training data only ===
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', min_df=2, max_df=0.95)
train_tfidf_matrix = vectorizer.fit_transform(train_documents)
test_tfidf_matrix = vectorizer.transform(test_documents)

In [9]:
# Convert sparse matrices to dense arrays
train_tfidf_dense = train_tfidf_matrix.toarray()
test_tfidf_dense = test_tfidf_matrix.toarray()

print(f"TF-IDF matrix shape - Train: {train_tfidf_dense.shape}, Test: {test_tfidf_dense.shape}")
print(f"Vocabulary size: {len(vectorizer.get_feature_names_out())}")

TF-IDF matrix shape - Train: (596, 862), Test: (397, 862)
Vocabulary size: 862


In [10]:
# === Create PySpark DataFrames ===
train_spark_df = ss.createDataFrame(
    [(Vectors.dense(vec), int(lbl)) for vec, lbl in zip(train_tfidf_dense, train_labels)],
    ["tfidf_vector", "label"]
)

test_spark_df = ss.createDataFrame(
    [(Vectors.dense(vec), int(lbl)) for vec, lbl in zip(test_tfidf_dense, test_labels)],
    ["tfidf_vector", "label"]
)

In [11]:
print("Training DataFrame with TF-IDF vectors:")
train_spark_df.tail(3)
print("\nTesting DataFrame with TF-IDF vectors:")
test_spark_df.tail(3)

Training DataFrame with TF-IDF vectors:

Testing DataFrame with TF-IDF vectors:


[Row(tfidf_vector=DenseVector([0.0, 0.0, 0.0, 0.5228, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0