# Square of integers in a dataframe

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder.appName('Squares').getOrCreate()
data = [(1, ), (2, ), (3, ), (4, ), (5, )]

# note createDataFrame
df = spark.createDataFrame(data, schema = ['no'])

ndf = df.withColumn('ans', col('no') ** 2)

ndf.show()
spark.stop()

24/10/02 18:07:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+---+----+
| no| ans|
+---+----+
|  1| 1.0|
|  2| 4.0|
|  3| 9.0|
|  4|16.0|
|  5|25.0|
+---+----+



# Maximum number in a dataframe

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max
spark = SparkSession.builder.appName('Max').getOrCreate()
data = [(1, ), (2, ), (3, ), (4, ), (5, )]
df = spark.createDataFrame(data, schema= ['no'])

# remember its df.agg
ndf = df.agg(max('no'))

ndf.show()
spark.stop()

24/10/02 18:07:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+-------+
|max(no)|
+-------+
|      5|
+-------+



# Average of data in dataframe

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
spark = SparkSession.builder.appName('avg').getOrCreate()
data = [(1, ), (2, ), (3, ), (4, ), (5, )]
df = spark.createDataFrame(data, schema= ['no'])

ndf = df.agg(avg('no'))

ndf.show()
spark.stop()

24/10/02 18:07:31 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+-------+
|avg(no)|
+-------+
|    3.0|
+-------+



# Reading data from a csv file into a dataframe

In [29]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('read').getOrCreate()
df = spark.read.csv('data.csv', header = True)
df.show()
df.show(2)
spark.stop()

24/10/02 18:07:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+---+---+
|  a|  b|
+---+---+
|  1|  2|
|  3|  4|
|  5|  6|
+---+---+

+---+---+
|  a|  b|
+---+---+
|  1|  2|
|  3|  4|
+---+---+
only showing top 2 rows



# Data stats summary

In [30]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('describe').getOrCreate()
df = spark.read.csv('data.csv')
df.describe().show()
df.summary().show()
df.select('_c0').summary().show()
spark.stop()

24/10/02 18:07:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+-------+---+---+
|summary|_c0|_c1|
+-------+---+---+
|  count|  4|  4|
|   mean|3.0|4.0|
| stddev|2.0|2.0|
|    min|  1|  2|
|    max|  a|  b|
+-------+---+---+

+-------+---+---+
|summary|_c0|_c1|
+-------+---+---+
|  count|  4|  4|
|   mean|3.0|4.0|
| stddev|2.0|2.0|
|    min|  1|  2|
|    25%|1.0|2.0|
|    50%|3.0|4.0|
|    75%|5.0|6.0|
|    max|  a|  b|
+-------+---+---+

+-------+---+
|summary|_c0|
+-------+---+
|  count|  4|
|   mean|3.0|
| stddev|2.0|
|    min|  1|
|    25%|1.0|
|    50%|3.0|
|    75%|5.0|
|    max|  a|
+-------+---+



# Word Count using DataFrames

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.appName('word count using dataframes').getOrCreate()
df = spark.read.text('text.txt')
df.show()
words = df.withColumn('word', f.explode(f.split(f.col('value'), ' '))).groupBy('word').count().sort('count', ascending = False).show()
spark.stop()

24/10/02 18:07:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+--------------------+
|               value|
+--------------------+
|This is a test te...|
+--------------------+

+-------+-----+
|   word|count|
+-------+-----+
|     is|    2|
|  viva.|    1|
|   name|    1|
|    for|    1|
|    lab|    1|
|   file|    1|
|    the|    1|
|Suchit.|    1|
|    bda|    1|
|     My|    1|
|   This|    1|
|   text|    1|
|      a|    1|
|   test|    1|
+-------+-----+



# Group By operations demo

In [32]:
from pyspark.sql import SparkSession

sc = SparkSession.builder.appName('demo').getOrCreate()

data = [('a', 1), ('a', 1), ('a', 1), ('b', 1), ('b', 1), ('b', 1)]

df = sc.createDataFrame(data, schema = ['group', 'value'])
df.show()

new_df1 = df.groupBy('group').count()
new_df2 = df.groupBy('group').sum()
new_df3 = df.groupBy('group').avg()

new_df1.show()
new_df2.show()
new_df3.show()
sc.stop()

24/10/02 18:07:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


+-----+-----+
|group|value|
+-----+-----+
|    a|    1|
|    a|    1|
|    a|    1|
|    b|    1|
|    b|    1|
|    b|    1|
+-----+-----+

+-----+-----+
|group|count|
+-----+-----+
|    a|    3|
|    b|    3|
+-----+-----+

+-----+----------+
|group|sum(value)|
+-----+----------+
|    a|         3|
|    b|         3|
+-----+----------+

+-----+----------+
|group|avg(value)|
+-----+----------+
|    a|       1.0|
|    b|       1.0|
+-----+----------+

