In [1]:
from pyspark.sql import SparkSession

# Reuse the session that is already running, or start one with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Turn on eager evaluation so that a DataFrame left as the last expression of a
# cell is rendered as a table. The documented key is
# 'spark.sql.repl.eagerEval.enabled'; the previously used 'spark.sql.repl.enabled'
# is not a documented Spark setting, so setting it had no effect.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# Three-row demo frame with columns (name, age, university).
df = spark.createDataFrame(
    [
        ['a', 13, "ucd"],
        ['b', 15, "ucb"],
        ['c', 19, "uccs"],
    ],
    schema=['name', 'age', 'university'],
)

In [3]:
# Repartition first, then cache. Caching has to be requested on the DataFrame
# that the following cells actually reuse; previously cache() was called on the
# original frame and `df` was then rebound to the repartitioned one, so the
# shuffle introduced by repartition() was not part of what got cached.
# cache() is lazy and returns the same DataFrame, so it can end the chain.
df = df.repartition(10).cache()

In [4]:
# Print the demo frame to check its contents after repartitioning.
df.show()

+----+---+----------+
|name|age|university|
+----+---+----------+
|   b| 15|       ucb|
|   c| 19|      uccs|
|   a| 13|       ucd|
+----+---+----------+



In [5]:
# Average of every numeric column per name; in this frame that is only `age`.
(
    df
    .groupBy('name')
    .avg()
    .show()
)

+----+--------+
|name|avg(age)|
+----+--------+
|   b|    15.0|
|   c|    19.0|
|   a|    13.0|
+----+--------+



In [6]:
# Project the frame down to its `name` column.
df.select(df['name']).show()

+----+
|name|
+----+
|   b|
|   c|
|   a|
+----+



In [7]:
from pyspark.sql.functions import upper

# Append an `upper_name` column holding the upper-cased `name`.
(
    df
    .withColumn('upper_name', upper(df['name']))
    .show()
)

+----+---+----------+----------+
|name|age|university|upper_name|
+----+---+----------+----------+
|   b| 15|       ucb|         B|
|   c| 19|      uccs|         C|
|   a| 13|       ucd|         A|
+----+---+----------+----------+



In [8]:
# Load the file as a single-column frame (`value`), one row per line of text.
# Note: this rebinds `df`, replacing the demo frame built in the earlier cells.
df = (
    spark
    .read
    .text('teenagers.txt')
)

In [9]:
# Display the raw lines that were just read from teenagers.txt.
df.show()

+----------------+
|           value|
+----------------+
|   a,13,abnjvjdv|
|     b,14,jvvhsf|
|     c,1,jdvjkhv|
|    d,15,jvhggir|
|    e,24,hhfrwnf|
|  f,18,iueairier|
|g,10,iwruiwejrsf|
+----------------+



In [10]:
# Count the lines that contain the letter 'j' anywhere.
df.where(df['value'].contains('j')).count()

5

In [12]:
# Shut down the session; `spark` and frames created from it are unusable afterwards.
spark.stop()

In [1]:
from pyspark import SparkConf, SparkContext

# Local master, application name "test".
conf = SparkConf().setAppName("test").setMaster('local')

# getOrCreate() hands back the already-running context when there is one, so
# this cell can be executed more than once. Constructing SparkContext(conf=conf)
# directly raises "Cannot run multiple SparkContexts at once" on a second run.
# Note: when a context already exists it keeps its own configuration and `conf`
# is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)



In [2]:
from pyspark.sql import SQLContext

# NOTE(review): `sql_sc` is not referenced again in the visible cells; presumably
# it is created so that a SQL session exists before RDD.toDF() is used — confirm.
# SQLContext is a legacy entry point; SparkSession is the current one.
sql_sc = SQLContext(sparkContext=sc)

# Despite the "df" prefix this is an RDD of raw text lines, not a DataFrame.
df1 = sc.textFile('teenagers.txt')

# Pull every line back to the driver to inspect it.
df1.collect()



['a,13,abnjvjdv',
 'b,14,jvvhsf',
 'c,1,jdvjkhv',
 'd,15,jvhggir',
 'e,24,hhfrwnf',
 'f,18,iueairier',
 'g,10,iwruiwejrsf']

In [3]:
# Wrap each line in a 1-tuple so every record becomes a row with one column, `value`.
df_rdd = df1.map(lambda line: (line,)).toDF(schema=['value'])

In [4]:
# Keep the lines that contain the character '1' anywhere (not only in the age field).
df_rdd.where(df_rdd['value'].contains('1')).show()

+----------------+
|           value|
+----------------+
|   a,13,abnjvjdv|
|     b,14,jvvhsf|
|     c,1,jdvjkhv|
|    d,15,jvhggir|
|  f,18,iueairier|
|g,10,iwruiwejrsf|
+----------------+



In [1]:
from pyspark.sql import SparkSession

# Obtain the active session, creating one with default settings if none is running.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
# Distribute the integers 0..29 as an RDD.
val = spark.sparkContext.parallelize(range(0, 30))

In [3]:
from itertools import combinations

# Every 3-element combination of the RDD's values, materialised on the driver.
# (Despite the "DF" suffix this is a plain Python list of tuples.)
combinationsDF = list(combinations(val.collect(), 3))

# Sort each combination exactly once and reuse the sorted list for both columns:
# the "a, b, c" id string and the array itself. The original re-sorted every row
# four times to build the same values.
data1 = [
    (", ".join(map(str, triplet)), triplet)
    for triplet in (sorted(row) for row in combinationsDF)
]

df2 = spark.createDataFrame(data1, ["tripletID", "triplet"])
df2.show()

+---------+----------+
|tripletID|   triplet|
+---------+----------+
|  0, 1, 2| [0, 1, 2]|
|  0, 1, 3| [0, 1, 3]|
|  0, 1, 4| [0, 1, 4]|
|  0, 1, 5| [0, 1, 5]|
|  0, 1, 6| [0, 1, 6]|
|  0, 1, 7| [0, 1, 7]|
|  0, 1, 8| [0, 1, 8]|
|  0, 1, 9| [0, 1, 9]|
| 0, 1, 10|[0, 1, 10]|
| 0, 1, 11|[0, 1, 11]|
| 0, 1, 12|[0, 1, 12]|
| 0, 1, 13|[0, 1, 13]|
| 0, 1, 14|[0, 1, 14]|
| 0, 1, 15|[0, 1, 15]|
| 0, 1, 16|[0, 1, 16]|
| 0, 1, 17|[0, 1, 17]|
| 0, 1, 18|[0, 1, 18]|
| 0, 1, 19|[0, 1, 19]|
| 0, 1, 20|[0, 1, 20]|
| 0, 1, 21|[0, 1, 21]|
+---------+----------+
only showing top 20 rows



In [23]:
from pyspark.sql import functions as sf

# Raw string literals: the Windows path and the regex below contain backslash
# sequences (\s, \R) that are not valid Python escapes. In a normal string they
# only work by accident and produce an invalid-escape warning; the raw form has
# exactly the same runtime value.
# NOTE(review): machine-specific absolute path — adjust for other environments.
path = r"C:\spark\spark-3.5.1-bin-hadoop3\README.md"
text = spark.read.text(path)
text.show(truncate=False)

# Count the whitespace-separated tokens on each line, then report the largest count.
(
    text
    .select(sf.size(sf.split(text.value, r"\s+")).name("length of words"))
    .agg(sf.max(sf.col('length of words')))
    .show(truncate=False)
)


+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                             |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|# Apache Spark                                                                                                                                                                                                    |
|                                                                                                                                                   

In [22]:
# One output row per whitespace-separated token. The pattern is a raw string
# because "\s" is not a valid Python escape sequence in a normal literal; the
# regex passed to Spark is unchanged.
text.select(sf.explode(sf.split(text.value, r'\s+'))).show(truncate=False)

+-----------+
|col        |
+-----------+
|#          |
|Apache     |
|Spark      |
|           |
|Spark      |
|is         |
|a          |
|unified    |
|analytics  |
|engine     |
|for        |
|large-scale|
|data       |
|processing.|
|It         |
|provides   |
|high-level |
|APIs       |
|in         |
|Scala,     |
+-----------+
only showing top 20 rows



In [33]:
# Pair each integer 1..3 with a string made of that many 'a' characters.
rdd = (
    spark.sparkContext
    .parallelize(range(1, 4))
    .map(lambda count: (count, count * 'a'))
)
rdd.collect()

[(1, 'a'), (2, 'aa'), (3, 'aaa')]

In [36]:
# Write the RDD's elements out as text under the given location.
# NOTE(review): machine-specific absolute path; re-running presumably fails once
# the target already exists — confirm before executing this cell again.
rdd.saveAsTextFile(path="C:/Users/vrjav/Downloads/pyspark/test")

In [37]:
rdd1 = spark.sparkContext.parallelize(range(1, 10))
counter = 0


def inc(data):
    """Add ``data`` to the module-level ``counter`` of the process running the call."""
    global counter
    counter = counter + data


# Demonstration of the closure pitfall: foreach() executes inc() in Spark's worker
# processes, each updating its own copy of `counter`. The variable in this
# notebook's process is never touched, so the print below shows 0.
rdd1.foreach(inc)
print(counter)

0


In [1]:
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName('123').setMaster('local')

# getOrCreate() reuses a running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again.
# Note: an existing context keeps its own configuration; `conf` is then not applied.
sc = SparkContext.getOrCreate(conf=conf)

# An accumulator lets tasks add values that are merged back on the driver.
accum = sc.accumulator(0)
rdd2 = sc.parallelize(range(1, 10))


def inc1(data):
    """Add ``data`` to the accumulator ``accum``."""
    accum.add(data)


rdd2.foreach(inc1)

# Sum of 1..9 as collected by the accumulator.
print(accum.value)

45
