In [1]:
# TF-IDF: Term Frequency / Inverse Document Frequency
# Search Algorithm
# Search engine for Wikipedia using Apache Spark in MLlib
# -----------------------------------------------------------

from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF


# ----------------------------------------------------------
# PCP 20230328
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# ----------------------------------------------------------



# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf = conf)

# Load documents (one per line).
# rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
rawData = sc.textFile("C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/DataScience/DataScience-Python3/subset-small_PCP_20230329.tsv")


fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

#print("fields: " + str(fields))
#print("documents: " + str(documents))
# print(fields)
# print(documents)



# Store the document names for later:
documentNames = fields.map(lambda x: x[1])


# TF-IDF: Term Frequency / Inverse Document Frequency
# Now hash the words in each document to their term frequencies:
# hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
hashingTF = HashingTF(1000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
#idf = IDF(minDocFreq=2).fit(tf)
idf = IDF(minDocFreq=0).fit(tf)
tfidf = idf.transform(tf)
#print(tfidf.take(25))

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.

# I happen to know that the article for "Abraham Lincoln" is in our data
# set, so let's search for "Gettysburg" (Lincoln gave a famous speech there):

# First, let's figure out what hash value "Gettysburg" maps to by finding the
# index a sparse vector from HashingTF gives us back:
gettysburgTF = hashingTF.transform(["Gettysburg"])
gettysburgHashValue = int(gettysburgTF.indices[0])
# gettysburgHashValue = float(gettysburgTF.indices[0])

# print(gettysburgHashValue.take(100))

## gettysburgHashValue_1 = int(gettysburgTF.indices[1])
## print(gettysburgHashValue_1)

# Now we will extract the TF*IDF score for Gettsyburg's hash value into
# a new RDD for each document:
##gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])

gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])

# print(gettysburgRelevance.take(100))

# We'll zip in the document names so we can see which is which:
zippedResults = gettysburgRelevance.zip(documentNames)





In [2]:
print(tfidf.take(100))
print(gettysburgTF)
#print(gettysburgRelevance.take(100))
print(gettysburgHashValue)

[SparseVector(1000, {1: 3.4781, 3: 5.6963, 4: 5.3849, 5: 2.9535, 6: 1.3442, 7: 3.5412, 8: 2.0846, 9: 4.5331, 10: 2.39, 11: 3.3564, 12: 23.8374, 13: 1.8801, 14: 6.1129, 15: 3.7296, 16: 4.8389, 17: 4.6752, 19: 2.851, 20: 4.3306, 21: 3.3564, 22: 4.4874, 23: 2.0789, 24: 1.0739, 26: 2.8743, 27: 2.9986, 28: 9.5551, 29: 4.2132, 30: 10.1741, 31: 4.2739, 32: 3.5219, 33: 5.6121, 34: 2.9058, 35: 1.8852, 36: 3.9167, 37: 3.4012, 38: 8.4985, 39: 2.4916, 40: 2.1738, 41: 5.017, 42: 2.4233, 44: 7.8189, 45: 2.0509, 46: 1.3597, 47: 2.4052, 50: 2.43, 51: 3.7807, 52: 13.2447, 53: 1.1626, 54: 4.9359, 55: 3.1956, 56: 1.795, 57: 1.3292, 59: 4.7933, 60: 2.8125, 61: 4.6875, 62: 1.953, 63: 0.8614, 64: 1.9879, 65: 1.0007, 66: 2.6808, 67: 1.0451, 69: 5.5029, 70: 2.1016, 71: 3.8013, 72: 6.2726, 73: 0.9299, 74: 2.6998, 76: 4.3192, 78: 3.768, 80: 26.8504, 81: 2.4916, 82: 2.536, 83: 1.7532, 84: 2.8785, 85: 1.2083, 86: 8.7094, 88: 1.0769, 89: 4.8641, 90: 1.737, 93: 4.7496, 94: 9.3717, 96: 1.0089, 97: 6.4036, 98: 1.8648

In [3]:
# And, print the document with the maximum TF*IDF value:
print("Best document for Gettysburg is:")
print(zippedResults.max())
print(zippedResults.take(1000))

Best document for Gettysburg is:
(63.00638722611045, 'Antidepressant')
[(1.2116612928098163, 'Anarchism'), (1.2116612928098163, 'Autism'), (0.0, 'Albedo'), (3.634983878429449, 'A'), (1.2116612928098163, 'Alabama'), (0.0, 'Achilles'), (3.634983878429449, 'Abraham Lincoln'), (0.0, 'Aristotle'), (1.2116612928098163, 'An American in Paris'), (0.0, 'Academy Award'), (0.0, 'Actrius'), (0.0, 'Animalia (book)'), (1.2116612928098163, 'International Atomic Time'), (0.0, 'Altruism'), (2.4233225856196325, 'Ayn Rand'), (0.0, 'Alain Connes'), (1.2116612928098163, 'Allan Dwan'), (1.2116612928098163, 'Algeria'), (1.2116612928098163, 'List of characters in Atlas Shrugged'), (2.4233225856196325, 'Anthropology'), (0.0, 'Agricultural science'), (1.2116612928098163, 'Alchemy'), (0.0, 'Austria'), (0.0, 'Alien'), (0.0, 'Astronomer'), (0.0, 'Amoeboid'), (0.0, 'ASCII'), (0.0, 'Austin (disambiguation)'), (2.4233225856196325, 'Animation'), (2.4233225856196325, 'Apollo'), (10.904951635288347, 'Andre Agassi'), (0.

In [5]:
print(zippedResults)

org.apache.spark.api.java.JavaPairRDD@35cf2490
