In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window as W
from pyspark.conf import SparkConf
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import pandas as pd
from pyspark.mllib.feature import HashingTF, IDF
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window as W
from pyspark.conf import SparkConf

In [2]:
# Build the Spark configuration for a client-mode session whose executors run
# as Kubernetes pods and whose data lives on the HA HDFS nameservice "nb",
# then create the session and display the effective configuration.
conf = pyspark.SparkConf()
conf.setMaster("k8s://https://" + os.environ['KUBERNETES_SERVICE_HOST'])

# --- HDFS configuration ---
conf.set("spark.hadoop.fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem")
# NOTE(review): the next two keys do not look like standard Hadoop settings;
# kept as-is, confirm whether anything actually reads them.
conf.set("spark.hadoop.fs.hdfs.server", "org.apache.hadoop.hdfs.server.namenode.NameNode")
conf.set("spark.hadoop.conf", "org.apache.hadoop.hdfs.HdfsConfiguration")
# Logical nameservice "nb" backed by two namenodes (h1, h2) with client-side failover.
conf.set("spark.hadoop.dfs.nameservices", "nb")
conf.set("spark.hadoop.dfs.ha.namenodes.nb", "h1,h2")
conf.set("spark.hadoop.dfs.namenode.rpc-address.nb.h1", "h3001.ali-netbase.com:9820")
conf.set("spark.hadoop.dfs.namenode.rpc-address.nb.h2", "h3101.ali-netbase.com:9820")
conf.set("spark.hadoop.dfs.client.failover.proxy.provider.nb", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider")
# "nb" is a logical HA nameservice, not a host: the namenode RPC ports come from
# the rpc-address entries above, so the default filesystem URI must not carry a
# port (the HDFS client rejects a port on a logical nameservice URI).
conf.set("spark.hadoop.fs.defaultFS", "hdfs://nb")

# --- Job inputs and sizing ---
# Name of the input CSV; a later cell appends it to the HDFS data directory.
f_name = "big_data.csv"
executor_cores = "16"
executor_machine_memory = "32"   # GiB, used only in the ECI machine spec below
executor_heap_memory = "24"      # GiB, passed to spark.executor.memory
executor_instances = "1"
driver_cores = "16"
driver_memory = "128"            # GiB, passed to spark.driver.memory

# --- Kubernetes configuration ---
# Request on-demand machines sized "<cores>-<memory>Gi" via the ECI annotation.
conf.set("spark.kubernetes.executor.label.eci", "true")
conf.set("spark.kubernetes.executor.annotation.k8s.aliyun.com/eci-use-specs", "{}-{}Gi".format(executor_cores, executor_machine_memory))

# Number of executors; the allocation batch size is set to the same value.
conf.set("spark.executor.instances", executor_instances)
conf.set("spark.kubernetes.allocation.batch.size", executor_instances)

# CPU and memory for executors and driver.
conf.set("spark.kubernetes.executor.request.cores", executor_cores)
conf.set("spark.kubernetes.executor.limit.cores", executor_cores)
conf.set("spark.executor.memory", executor_heap_memory + "g")
conf.set("spark.executor.cores", executor_cores)
conf.set("spark.driver.memory", driver_memory + "g")
conf.set("spark.driver.cores", driver_cores)

# Executor container image (previously tried images are kept for reference).
# conf.set("spark.kubernetes.container.image", "maven-docker.netbase.com/spark-py:v2.4.6")
# conf.set("spark.kubernetes.container.image", "docker-registry.netbase.com/de/base-images/spark-py:1.2.0-spark3.1.1")
conf.set("spark.kubernetes.container.image", "docker-registry.netbase.com/de/base-images/spark-py:1.2.0-spark3.1.1-test")
conf.set("spark.kubernetes.container.image.pullPolicy", "Always")

conf.set("spark.kubernetes.namespace", "jupyterhub")
# Client mode: the driver is this process, advertised to executors at HOSTIP.
conf.set("spark.driver.host", os.environ['HOSTIP'])
conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
conf.set('spark.submit.deployMode', 'client')
conf.set('spark.kubernetes.pyspark.pythonVersion', "3")
# Make the executors act on behalf of the HDFS user "woot".
# NOTE(review): spark.executorEnv.* is the documented prefix; the
# spark.kubernetes.executorEnv.* variant is kept as-is, confirm it is needed.
conf.set('spark.kubernetes.executorEnv.HADOOP_USER_NAME', "woot")
conf.set('spark.executorEnv.HADOOP_USER_NAME', "woot")

conf.set('spark.default.parallelism', '32')

conf.setAppName(os.environ['JUPYTERHUB_CLIENT_ID'])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Display the effective configuration as the cell output.
spark.sparkContext.getConf().getAll()

22/02/17 07:06:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


[('spark.driver.port', '34837'),
 ('spark.kubernetes.executor.request.cores', '16'),
 ('spark.kubernetes.executor.annotation.k8s.aliyun.com/eci-use-specs',
  '16-32Gi'),
 ('spark.kubernetes.authenticate.driver.serviceAccountName', 'spark'),
 ('spark.master', 'k8s://https://10.122.0.1'),
 ('spark.kubernetes.container.image',
  'docker-registry.netbase.com/de/base-images/spark-py:1.2.0-spark3.1.1-test'),
 ('spark.hadoop.dfs.namenode.rpc-address.nb.h1', 'h3001.ali-netbase.com:9820'),
 ('spark.kubernetes.allocation.batch.size', '1'),
 ('spark.kubernetes.executor.label.eci', 'true'),
 ('spark.kubernetes.pyspark.pythonVersion', '3'),
 ('spark.kubernetes.executor.podNamePrefix',
  'jupyterhub-user-ali40netbase-com-bc97137f0680a60c'),
 ('spark.hadoop.fs.hdfs.impl', 'org.apache.hadoop.hdfs.DistributedFileSystem'),
 ('spark.app.startTime', '1645081567489'),
 ('spark.sql.warehouse.dir', 'file:/home/jovyan/megaopus/spark-warehouse'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.default

In [3]:
# Read the CSV (with header, UTF-8) from HDFS, keep only rows that have a
# 'Sound Bite Text' value, and split that text on single spaces into a
# 'word_array' column.
path = f'hdfs://nb/ai-pipeline/megaopus_data/{f_name}'
_df = (
    spark.read
    .option("header", True)
    .option("encoding", "UTF-8")
    .csv(path)
    .where(F.col('Sound Bite Text').isNotNull())
    .select(F.split(F.col('Sound Bite Text'), " ").alias('word_array'))
)
_df.show()

                                                                                

+--------------------+
|          word_array|
+--------------------+
|[#ESG, News:, Som...|
|[@PippaStevens13,...|
|[@AtlantaLiberal,...|
|[IR, Trending, th...|
|[@MikeBloomberg, ...|
|[With, increased,...|
|[@GeorgeSerafeim,...|
|[Miners, using, n...|
|[With, increased,...|
|[With, increased,...|
|["Fink, says, @bl...|
|[Miners, using, n...|
|[50, global, busi...|
|[With, increased,...|
|[Interested, in, ...|
|[Mining, News:, M...|
|["Enviro, Equity,...|
|[We, need, this, ...|
|[Top, #Compliance...|
|[With, increased,...|
+--------------------+
only showing top 20 rows



In [4]:
# Print the schema of the tokenized frame (the single 'word_array' column
# selected in the previous cell).
_df.printSchema()

root
 |-- word_array: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [5]:
# Hash each word array into a 100000-dimensional term-frequency vector stored
# in the 'features' column, then display the vector of the first row.
hashingTF = pyspark.ml.feature.HashingTF(
    inputCol="word_array",
    outputCol="features",
    numFeatures=100000,
)
tf = hashingTF.transform(_df)
tf.first().features

                                                                                

SparseVector(100000, {585: 1.0, 4705: 1.0, 5145: 1.0, 7871: 1.0, 10266: 1.0, 13956: 1.0, 16017: 3.0, 21318: 1.0, 25209: 1.0, 26365: 1.0, 26467: 1.0, 30488: 1.0, 30578: 1.0, 31833: 1.0, 35578: 1.0, 44658: 1.0, 48540: 1.0, 51152: 1.0, 51299: 1.0, 52276: 1.0, 52671: 1.0, 55867: 1.0, 56721: 1.0, 59189: 1.0, 60364: 1.0, 62891: 1.0, 65581: 2.0, 69218: 1.0, 71266: 1.0, 75882: 1.0, 81991: 1.0, 85726: 1.0, 85870: 1.0, 90073: 1.0, 95553: 1.0, 99737: 1.0})

In [6]:
# Mark the term-frequency frame for caching, fit an IDF model on its
# 'features' column, and add the rescaled vectors as the 'idf' column.
tf.cache()
idf_estimator = pyspark.ml.feature.IDF(inputCol="features", outputCol="idf")
idf_model = idf_estimator.fit(tf)
tf_idf = idf_model.transform(tf)
tf_idf.printSchema()

[Stage 4:>                                                          (0 + 5) / 5]

root
 |-- word_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- idf: vector (nullable = true)



                                                                                

In [None]:
from datetime import datetime

# Fit a seeded 100-cluster KMeans on the 'idf' vectors and time each stage.
# The timestamps bracket: estimator construction (ta..tb), fit (tb..tc) and
# transform (tc..td).
ta = datetime.now()
kmeans = KMeans(featuresCol='idf', k=100).setSeed(1)
tb = datetime.now()
model = kmeans.fit(tf_idf)
tc = datetime.now()
# DataFrame transformations are lazy in Spark: this only builds the plan that
# adds the prediction column; assignments are computed when an action
# (e.g. show()) runs in a later cell.
predictions = model.transform(tf_idf)
td = datetime.now()
print('elapsed time:', td - ta)
# Break the total down so the expensive stage is visible; previously tb and tc
# were captured but never reported.
print('  fit time:', tc - tb)
print('  transform time (plan only, lazy):', td - tc)

22/02/17 07:07:09 WARN DAGScheduler: Broadcasting large task binary with size 1613.4 KiB
22/02/17 07:07:10 WARN DAGScheduler: Broadcasting large task binary with size 1614.1 KiB
22/02/17 07:07:10 WARN DAGScheduler: Broadcasting large task binary with size 1614.7 KiB
22/02/17 07:07:11 WARN DAGScheduler: Broadcasting large task binary with size 1615.1 KiB
22/02/17 07:07:11 WARN DAGScheduler: Broadcasting large task binary with size 1615.1 KiB
22/02/17 07:07:20 WARN DAGScheduler: Broadcasting large task binary with size 1615.4 KiB
22/02/17 07:07:21 WARN DAGScheduler: Broadcasting large task binary with size 1616.0 KiB
22/02/17 07:07:47 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/02/17 07:07:47 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/02/17 07:07:59 WARN DAGScheduler: Broadcasting large task binary with size 1615.3 KiB
22/02/17 07:08:04 WARN DAGScheduler: Broadcasting large task binary with size

In [None]:
# Show the first rows of the frame returned by model.transform(), i.e. the
# input columns plus the cluster assignment column.
predictions.show()

In [None]:
# Score the clustering with the silhouette metric (squared Euclidean distance).
# The metric must be computed in the same feature space the model was fitted
# on: KMeans above uses featuresCol='idf', so evaluating on the raw
# term-frequency 'features' column would measure a different geometry.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='idf',
    metricName='silhouette',
    distanceMeasure='squaredEuclidean',
)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

In [None]:
# Count rows per predicted cluster and list the clusters from largest to
# smallest, without truncating cell values.
(
    predictions
    .groupBy('prediction')
    .count()
    .orderBy(F.col('count').desc())
    .show(truncate=False)
)

In [None]:
# Stop the Spark session created in the configuration cell.
spark.stop()