In [1]:
# TF-IDF: Term Frequency / Inverse Document Frequency
# Search Algorithm
# Search engine for Wikipedia using Apache Spark in MLlib
#https://sparkbyexamples.com/pyspark/pyspark-py4j-protocol-py4jerror-org-apache-spark-api-python-pythonutils-jvm/
# -----------------------------------------------------------

from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF


# ----------------------------------------------------------
# PCP 20230328
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# ----------------------------------------------------------



# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf = conf)

# Load documents (one per line).
# rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
rawData = sc.textFile("C:/Users/pcpow/OneDrive/Desktop/DataScience_Udemy_20230321/DataScience/DataScience-Python3/subset-small_PCP_20230329.tsv")


fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

#print("fields: " + str(fields))
#print("documents: " + str(documents))
# print(fields)
# print(documents)



# Store the document names for later:
documentNames = fields.map(lambda x: x[1])


# TF-IDF: Term Frequency / Inverse Document Frequency
# Now hash the words in each document to their term frequencies:
# hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
hashingTF = HashingTF(100)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# At this point we have an RDD of sparse vectors representing each document,
# where each value maps to the term frequency of each unique hash value.

# Let's compute the TF*IDF of each term in each document:
tf.cache()
#idf = IDF(minDocFreq=2).fit(tf)
idf = IDF(minDocFreq=0).fit(tf)
tfidf = idf.transform(tf)
#print(tfidf.take(25))

# Now we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.

# I happen to know that the article for "Abraham Lincoln" is in our data
# set, so let's search for "Gettysburg" (Lincoln gave a famous speech there):

# First, let's figure out what hash value "Gettysburg" maps to by finding the
# index a sparse vector from HashingTF gives us back:
gettysburgTF = hashingTF.transform(["Gettysburg"])
gettysburgHashValue = int(gettysburgTF.indices[0])
# gettysburgHashValue = float(gettysburgTF.indices[0])

# print(gettysburgHashValue.take(100))

## gettysburgHashValue_1 = int(gettysburgTF.indices[1])
## print(gettysburgHashValue_1)

# Now we will extract the TF*IDF score for Gettsyburg's hash value into
# a new RDD for each document:
##gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])


## ==>> Bug here 20230330
gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])


# print(gettysburgRelevance.take(100))

# We'll zip in the document names so we can see which is which:
zippedResults = gettysburgRelevance.zip(documentNames)





In [2]:
print(tfidf.take(100))
print(gettysburgTF)
#print(gettysburgRelevance.take(100))
print(gettysburgHashValue)

[SparseVector(100, {0: 9.702, 1: 26.4496, 2: 9.4567, 3: 10.5933, 4: 6.9017, 5: 8.2933, 6: 7.3088, 7: 3.3141, 8: 7.5791, 9: 11.306, 10: 7.9054, 11: 4.3282, 12: 11.5994, 13: 6.2537, 14: 9.862, 15: 12.6868, 16: 8.3257, 17: 35.4431, 18: 11.7205, 19: 13.9894, 20: 10.1626, 21: 8.1171, 22: 5.2417, 23: 6.0519, 24: 7.3529, 25: 10.9022, 26: 5.9844, 27: 10.2501, 28: 7.0235, 29: 7.417, 30: 10.0127, 31: 8.6668, 32: 3.6866, 33: 6.9849, 34: 3.7419, 35: 4.8105, 36: 8.8327, 37: 9.7385, 38: 6.754, 39: 5.651, 40: 13.7589, 41: 12.6254, 42: 8.378, 43: 4.3021, 44: 12.1608, 45: 5.772, 46: 5.4478, 47: 7.1428, 48: 5.4737, 49: 4.2838, 50: 9.679, 51: 6.5747, 52: 18.3127, 53: 5.8559, 54: 4.6435, 55: 9.4665, 56: 6.5001, 57: 4.3301, 58: 17.642, 59: 7.6143, 60: 4.2484, 61: 4.0243, 62: 5.6199, 63: 7.978, 64: 16.9045, 65: 20.7911, 66: 8.9204, 67: 9.8361, 68: 7.1147, 69: 13.9628, 70: 10.0344, 71: 8.2322, 72: 6.9484, 73: 6.9312, 74: 5.5164, 75: 4.8696, 76: 6.7243, 77: 10.5212, 78: 6.5251, 79: 6.5251, 80: 29.8647, 81: 5.

In [3]:
# And, print the document with the maximum TF*IDF value:
print("Best document for Gettysburg is:")
print(zippedResults.max())
# print(zippedResults.take(1000))

Best document for Gettysburg is:
(25.894582062204652, 'Anglicanism')


In [4]:
print(zippedResults)

org.apache.spark.api.java.JavaPairRDD@4f1e1ba4


In [5]:
print(idf)


<pyspark.mllib.feature.IDFModel object at 0x0000020AF032E890>


In [6]:
print(tfidf.take(100))

[SparseVector(100, {0: 9.702, 1: 26.4496, 2: 9.4567, 3: 10.5933, 4: 6.9017, 5: 8.2933, 6: 7.3088, 7: 3.3141, 8: 7.5791, 9: 11.306, 10: 7.9054, 11: 4.3282, 12: 11.5994, 13: 6.2537, 14: 9.862, 15: 12.6868, 16: 8.3257, 17: 35.4431, 18: 11.7205, 19: 13.9894, 20: 10.1626, 21: 8.1171, 22: 5.2417, 23: 6.0519, 24: 7.3529, 25: 10.9022, 26: 5.9844, 27: 10.2501, 28: 7.0235, 29: 7.417, 30: 10.0127, 31: 8.6668, 32: 3.6866, 33: 6.9849, 34: 3.7419, 35: 4.8105, 36: 8.8327, 37: 9.7385, 38: 6.754, 39: 5.651, 40: 13.7589, 41: 12.6254, 42: 8.378, 43: 4.3021, 44: 12.1608, 45: 5.772, 46: 5.4478, 47: 7.1428, 48: 5.4737, 49: 4.2838, 50: 9.679, 51: 6.5747, 52: 18.3127, 53: 5.8559, 54: 4.6435, 55: 9.4665, 56: 6.5001, 57: 4.3301, 58: 17.642, 59: 7.6143, 60: 4.2484, 61: 4.0243, 62: 5.6199, 63: 7.978, 64: 16.9045, 65: 20.7911, 66: 8.9204, 67: 9.8361, 68: 7.1147, 69: 13.9628, 70: 10.0344, 71: 8.2322, 72: 6.9484, 73: 6.9312, 74: 5.5164, 75: 4.8696, 76: 6.7243, 77: 10.5212, 78: 6.5251, 79: 6.5251, 80: 29.8647, 81: 5.

In [8]:
print(gettysburgRelevance.take(100))

[4.248396017138263, 7.282964600808452, 3.034568583670188, 0.9103705751010565, 7.58642145917547, 3.034568583670188, 15.17284291835094, 14.262472343249884, 1.2138274334680752, 1.820741150202113, 0.0, 0.0, 2.4276548669361504, 2.4276548669361504, 7.282964600808452, 0.0, 0.0, 8.193335175909507, 9.103705751010564, 8.193335175909507, 0.3034568583670188, 3.641482300404226, 10.924446901212677, 0.0, 0.3034568583670188, 0.0, 3.034568583670188, 0.0, 0.3034568583670188, 4.855309733872301, 2.7311117253031694, 0.0, 1.820741150202113, 1.820741150202113, 1.2138274334680752, 0.3034568583670188, 4.855309733872301, 1.2138274334680752, 6.069137167340376, 0.0, 5.765680308973357, 2.4276548669361504, 0.0, 0.0, 1.820741150202113, 0.9103705751010565, 5.462223450606339, 2.1241980085691314, 0.0, 0.0, 0.0, 0.0, 0.9103705751010565, 2.1241980085691314, 0.6069137167340376, 2.4276548669361504, 0.6069137167340376, 0.0, 4.551852875505282, 9.103705751010564, 1.820741150202113, 2.4276548669361504, 0.0, 0.3034568583670188,

In [6]:
print(documentNames.take(100))

['Anarchism', 'Autism', 'Albedo', 'A', 'Alabama', 'Achilles', 'Abraham Lincoln', 'Aristotle', 'An American in Paris', 'Academy Award', 'Actrius', 'Animalia (book)', 'International Atomic Time', 'Altruism', 'Ayn Rand', 'Alain Connes', 'Allan Dwan', 'Algeria', 'List of characters in Atlas Shrugged', 'Anthropology', 'Agricultural science', 'Alchemy', 'Austria', 'Alien', 'Astronomer', 'Amoeboid', 'ASCII', 'Austin (disambiguation)', 'Animation', 'Apollo', 'Andre Agassi', 'Austro-Asiatic languages', 'Afro-Asiatic languages', 'Andorra', 'Arithmetic mean', 'American Football Conference', 'Animal Farm', 'Amphibian', 'Alaska', 'Architecture (disambiguation)', 'Agriculture', 'Aldous Huxley', 'Ada', 'Aberdeen (disambiguation)', 'Algae', 'Analysis of variance', 'Alkane', 'Appeal', 'Answer', 'Appellate court', 'Arraignment', 'America the Beautiful', 'Assistive technology', 'Abacus', 'Acid', 'Asphalt', 'American National Standards Institute', 'Argument (disambiguation)', 'Apollo 11', 'Apollo 8', 'Ast

In [9]:
print(tf.take(100))

[SparseVector(100, {0: 74.0, 1: 200.0, 2: 48.0, 3: 91.0, 4: 29.0, 5: 37.0, 6: 29.0, 7: 14.0, 8: 38.0, 9: 56.0, 10: 48.0, 11: 17.0, 12: 49.0, 13: 26.0, 14: 45.0, 15: 64.0, 16: 42.0, 17: 344.0, 18: 52.0, 19: 82.0, 20: 48.0, 21: 50.0, 22: 23.0, 23: 27.0, 24: 39.0, 25: 54.0, 26: 27.0, 27: 67.0, 28: 35.0, 29: 31.0, 30: 49.0, 31: 40.0, 32: 16.0, 33: 28.0, 34: 15.0, 35: 20.0, 36: 41.0, 37: 51.0, 38: 26.0, 39: 24.0, 40: 79.0, 41: 72.0, 42: 41.0, 43: 16.0, 44: 49.0, 45: 22.0, 46: 25.0, 47: 31.0, 48: 23.0, 49: 18.0, 50: 39.0, 51: 30.0, 52: 146.0, 53: 23.0, 54: 23.0, 55: 42.0, 56: 30.0, 57: 19.0, 58: 70.0, 59: 31.0, 60: 14.0, 61: 17.0, 62: 30.0, 63: 41.0, 64: 75.0, 65: 138.0, 66: 34.0, 67: 42.0, 68: 34.0, 69: 76.0, 70: 80.0, 71: 33.0, 72: 31.0, 73: 37.0, 74: 22.0, 75: 23.0, 76: 30.0, 77: 60.0, 78: 31.0, 79: 31.0, 80: 247.0, 81: 26.0, 82: 42.0, 83: 43.0, 84: 26.0, 85: 41.0, 86: 29.0, 87: 35.0, 88: 21.0, 89: 130.0, 90: 12.0, 91: 34.0, 92: 19.0, 93: 36.0, 94: 30.0, 95: 49.0, 96: 37.0, 97: 48.0, 98: 