# PySpark quick start, following the official documentation

In [None]:
# PySpark
!pip install pyspark

In [3]:
import pyspark
from pyspark.sql import SparkSession 

In [None]:
# Start (or reuse) a Spark session in local mode, registered under the name "myapp".
spark = (
    SparkSession.builder
    .master("local")
    .appName("myapp")
    .getOrCreate()
)

In [5]:
# Read README.md into a DataFrame: one row per line of the file, with the text
# held in a single string column named `value`.
textFile = spark.read.text("README.md")
# Last expression of the cell, so the row count is echoed as the cell output.
textFile.count()

                                                                                

2

24/05/06 18:55:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Peek at the first row of the DataFrame (returned as a Row object).
textFile.first()

Row(value='# Code_practice')

In [7]:
# Keep only the README lines that mention "dirty".
linesWithdirty = textFile.filter(textFile.value.contains("dirty"))

# Only the last expression of a cell is echoed, so print the count explicitly;
# a bare `linesWithdirty.count()` here would run a Spark job and throw the result away.
print("Lines containing 'dirty':", linesWithdirty.count())

# Truncated tabular preview first, then the full rows as the cell's displayed value.
linesWithdirty.show()
linesWithdirty.collect()

+--------------------+
|               value|
+--------------------+
|More like a rough...|
+--------------------+



[Row(value='More like a rough notebook for all my codes. I want them to be as dirty as possible. Aim is to make lot of mistakes here and learn from them. ')]

In [18]:
# Same "dirty" line count as before, computed in a single expression without
# keeping the filtered DataFrame around.
textFile.filter(textFile["value"].contains("dirty")).count()

1

In [12]:
# Repeat the exercise for another word: lines that mention "mistake".
linesWithmistakes = textFile.filter(textFile.value.contains("mistake"))

# Print the count explicitly: only a cell's last expression is echoed, so bare
# `count()` / `collect()` calls in the middle of the cell produce no visible output.
print("Lines containing 'mistake':", linesWithmistakes.count())

# Truncated tabular preview first, then the full rows as the cell's displayed value.
linesWithmistakes.show()
linesWithmistakes.collect()

+--------------------+
|               value|
+--------------------+
|More like a rough...|
+--------------------+



## More on Dataset Operations 

In [18]:
# Import the needed functions by name. A star import from pyspark.sql.functions
# would shadow Python built-ins such as max, min, sum, abs and round for the
# rest of the notebook. `explode` is not used in this cell; it is imported here
# for the word-count cell that follows.
from pyspark.sql.functions import col, explode, size, split
from pyspark.sql.functions import max as spark_max

# Largest number of whitespace-separated tokens found on any single line.
# The pattern is a raw string: "\s" inside a normal literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
words_per_line = textFile.select(
    size(split(textFile.value, r"\s+")).alias("numWords")
)
words_per_line.agg(spark_max(col("numWords"))).collect()

[Row(max(numWords)=31)]

In [22]:
# Split every line on runs of whitespace, emit one row per word, then count how
# often each distinct word occurs.
# The pattern is a raw string so that "\s" is not parsed as an (invalid) string
# escape sequence; the regex handed to Spark is unchanged.
wordCounts = (
    textFile
    .select(explode(split(textFile.value, r"\s+")).alias("word"))
    .groupBy("word")
    .count()
)
wordCounts.collect()

[Row(word='possible.', count=1),
 Row(word='for', count=1),
 Row(word='be', count=1),
 Row(word='is', count=1),
 Row(word='dirty', count=1),
 Row(word='mistakes', count=1),
 Row(word='want', count=1),
 Row(word='Aim', count=1),
 Row(word='my', count=1),
 Row(word='from', count=1),
 Row(word='More', count=1),
 Row(word='them.', count=1),
 Row(word='them', count=1),
 Row(word='make', count=1),
 Row(word='like', count=1),
 Row(word='and', count=1),
 Row(word='lot', count=1),
 Row(word='here', count=1),
 Row(word='of', count=1),
 Row(word='Code_practice', count=1),
 Row(word='#', count=1),
 Row(word='I', count=1),
 Row(word='codes.', count=1),
 Row(word='all', count=1),
 Row(word='a', count=1),
 Row(word='as', count=2),
 Row(word='', count=1),
 Row(word='learn', count=1),
 Row(word='notebook', count=1),
 Row(word='to', count=2),
 Row(word='rough', count=1)]

In [25]:
# Mark the filtered DataFrame for caching and run an action on it right away.
# cache() returns the same DataFrame, so the count can be chained onto it.
linesWithdirty.cache().count()

24/05/06 19:05:51 WARN CacheManager: Asked to cache already cached data.


1

## Self-contained applications

In [26]:
# Dependency pin for a packaged application (the `install_requires` argument of
# a setup.py). The documentation snippet carries a template placeholder for the
# Spark version; copied literally it is not a valid requirement specifier, so
# pin to the PySpark version that is actually installed instead.
install_requires = [
    f"pyspark=={pyspark.__version__}",
]

In [36]:
"""SimpleApp.py"""
from pyspark.sql import SparkSession 

logFile = "README.md"

# Inside the notebook, getOrCreate() returns the session that is already
# running (if any) rather than starting a separate one for "SimpleApp".
spark = SparkSession.builder.appName("SimpleApp").getOrCreate()

# Cache the lines because two separate filters/counts are run over them.
logData = spark.read.text(logFile).cache()

numAs = logData.filter(logData.value.contains("a")).count()
numBs = logData.filter(logData.value.contains("b")).count()

print("Lines with a: %i, lines with b: %i" % (numAs, numBs))

# Release the cached lines now that both counts are done.
logData.unpersist()

# NOTE: a standalone SimpleApp.py script would end with `spark.stop()`.
# It is deliberately omitted here: the session is shared with the rest of the
# notebook, and stopping it makes the following cells fail on a top-to-bottom run.

Lines with a: 2, lines with b: 1


## PyArrow

In [59]:
!pip install pyarrow --upgrade
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
print(spark.sparkContext.getConf().getAll())
spark.sparkContext.setLogLevel("DEBUG")

[('spark.master', 'local'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.app.submitTime', '1715021690685'), ('spark.driver.port', '38479'), ('s

In [49]:
from pyspark.sql import SparkSession

# Obtain the Spark session for the pandas example (an existing one is reused),
# requesting Arrow-based transfers in its configuration.
spark = (
    SparkSession.builder
    .appName("Pandas to Spark Example")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)


24/05/06 19:28:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [51]:
import numpy as np 
import pandas as pd 

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# Generate a Pandas DataFrame of 100 x 3 uniform random floats in [0, 1).
# A seeded generator keeps the data identical across Restart & Run All.
rng = np.random.default_rng(42)
pdf = pd.DataFrame(rng.random((100, 3)))

# Create a Spark DataFrame from a Pandas DataFrame using Arrow
df = spark.createDataFrame(pdf)

In [None]:
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow.
# toPandas() already brings back every column, so no explicit select("*") is needed.
result_pdf = df.toPandas()

# Stop the Spark session
spark.stop()

# TODO(debug): this cell raised an error that was never diagnosed.
# NOTE(review): candidate causes to check against the actual traceback — confirm:
#   * the session behind `df` had already been stopped (the SimpleApp cell and
#     this cell both call spark.stop(), so re-running either leaves `df` bound
#     to a dead session);
#   * pyarrow was upgraded with pip while the kernel was running, so the loaded
#     pyarrow/pandas/pyspark versions may not match what is installed.