In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


In [None]:
# Obtain the Spark session used by every later cell (created on first run).
spark = (
    SparkSession.builder
    .appName("TF-IDF Example")
    .getOrCreate()
)

In [None]:
!pip install pyspark



In [None]:
# Load the CSV input as a Spark DataFrame, using its first row as column names.
df = spark.read.format("csv") \
    .option("header", "true") \
    .load(

In [None]:
# The schema printed further down lists the review body as `reviews.text`,
# while this cell and the tokenizer refer to `reviews_text`. Rename it first;
# `withColumnRenamed` leaves the DataFrame unchanged when the source column is
# absent, so this is also safe if the column already has the underscore name.
df = df.withColumnRenamed("reviews.text", "reviews_text")

# Drop rows that have no review text so later steps only see rows with text.
df = df.na.drop(subset=["reviews_text"])

In [None]:
# Turn each review text into a list of tokens stored in a new "words" column.
tokenizer = Tokenizer(
    inputCol="reviews_text",
    outputCol="words",
)
tokenized = tokenizer.transform(df)

In [None]:
# Hash every token list into a term-frequency vector with 10000 slots,
# written to the "rawFeatures" column.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=10000,
)
tf = hashingTF.transform(tokenized)

In [None]:
# Fit the IDF stage on the term-frequency vectors, then apply the fitted model
# to the same data; the rescaled vectors end up in the "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)

In [None]:
# Print the column names and types of the reviews DataFrame.
df.printSchema()


root
 |-- id: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- dateAdded: string (nullable = true)
 |-- dateUpdated: string (nullable = true)
 |-- ean: string (nullable = true)
 |-- keys: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- manufacturerNumber: string (nullable = true)
 |-- name: string (nullable = true)
 |-- reviews.date: string (nullable = true)
 |-- reviews.dateAdded: string (nullable = true)
 |-- reviews.dateSeen: string (nullable = true)
 |-- reviews.didPurchase: string (nullable = true)
 |-- reviews.doRecommend: string (nullable = true)
 |-- reviews.id: string (nullable = true)
 |-- reviews.numHelpful: string (nullable = true)
 |-- reviews.rating: string (nullable = true)
 |-- reviews.sourceURLs: string (nullable = true)
 |-- reviews.text: string (nullable = true)
 |-- reviews.title: string (nullable = true)
 |-- reviews.userCity: string (nullable = true)
 |-- reviews.userProvince: st

In [None]:
# Preview the id and TF-IDF vector columns: first 20 rows, long cells truncated
# (these are the defaults of `show`, spelled out here).
(
    tfidf
    .select("id", "features")
    .show(n=20, truncate=True)
)

+--------------------+--------------------+
|                  id|            features|
+--------------------+--------------------+
|AV13O1A8GV-KLJ3akUyj|(10000,[163,307,4...|
|AV14LG0R-jtxr-f38QfS|(10000,[2752,4495...|
|AV14LG0R-jtxr-f38QfS|(10000,[2752,6168...|
|AV16khLE-jtxr-f38VFn|(10000,[198,307,4...|
|AV16khLE-jtxr-f38VFn|(10000,[80,1345,1...|
|AV16khLE-jtxr-f38VFn|(10000,[55,431,48...|
|AV16khLE-jtxr-f38VFn|(10000,[274,307,4...|
|AV16khLE-jtxr-f38VFn|(10000,[447,488,8...|
|AV16khLE-jtxr-f38VFn|(10000,[1226,1263...|
|AV16khLE-jtxr-f38VFn|(10000,[307,495,6...|
|AV16khLE-jtxr-f38VFn|(10000,[80,387,48...|
|AV16khLE-jtxr-f38VFn|(10000,[25,775,12...|
|AV16khLE-jtxr-f38VFn|(10000,[431,488,6...|
|AV16khLE-jtxr-f38VFn|(10000,[740,1661,...|
|AV16khLE-jtxr-f38VFn|(10000,[488,613,1...|
|AV16khLE-jtxr-f38VFn|(10000,[447,1588,...|
|AV16khLE-jtxr-f38VFn|(10000,[80,447,48...|
|AV16khLE-jtxr-f38VFn|(10000,[488,1756,...|
|AV16khLE-jtxr-f38VFn|(10000,[157,431,4...|
|AV16khLE-jtxr-f38VFn|(10000,[96

In [None]:
# Bring the id and TF-IDF vector columns over from Spark into a pandas DataFrame.
tfidf_pd_df = (
    tfidf
    .select("id", "features")
    .toPandas()
)

# Write that pandas DataFrame out as CSV, leaving out the index column.
tfidf_pd_df.to_csv(
    "/content/drive/MyDrive/Colab Notebooks/tfidf_output.csv",
    index=False,
)