In [1]:
import pandas as pd
# Spark Context must be initialised as 'sc' variable
from pyspark.sql import functions as F

## Create a dataframe

In [16]:
#### Pandas ####
pdf = pd.DataFrame.from_items([("A", [1,2,3]), ("B", [11, 22, 33])])
pdf = pd.DataFrame({"A": (1,2,3), "B": (11, 22, 33)})
pdf

Unnamed: 0,A,B
0,1,11
1,2,22
2,3,33


In [7]:
#### Spark ####
# With sql context
sdf = sqlCtx.createDataFrame([(1, 11), (2, 22), (3, 33)], ["A", "B"])
sdf.show()

+---+---+
|  A|  B|
+---+---+
|  1| 11|
|  2| 22|
|  3| 33|
+---+---+



# Count the number of rows and columns

In [143]:
#### Pandas ####
print(len(pdf))  # nb of rows
print(len(pdf.columns))  # nb of columns

3
4


In [145]:
#### Spark ####
print(sdf.count())  # nb of rows
print(len(sdf.columns))  # nb of columns

3
3


## Select a column

In [78]:
#### Pandas ####
pdf['A']
pdf.A

0    1
1    2
2    3
Name: A, dtype: int64

In [164]:
pdf[["A", "C"]]

Unnamed: 0,A,C
0,1,False
1,2,True
2,3,False


In [83]:
pdf["O"] = 'o'
cols = list(pdf.loc[:,'A':'C']) + ['O']
pdf[cols]

Unnamed: 0,A,B,C,O
0,1,11,True,o
1,2,22,True,o
2,3,33,True,o


In [10]:
#### Spark ####
sdf["A"]
sdf.A

Column<A>

In [219]:
sdf[["A", "C"]].show()

+---+-----+
|  A|    C|
+---+-----+
|  1|false|
|  2| true|
|  3|false|
+---+-----+



In [218]:
sdf.select(["A", "C"]).show()

+---+-----+
|  A|    C|
+---+-----+
|  1|false|
|  2| true|
|  3|false|
+---+-----+



# Column adding

### add a constant value

In [8]:
#### Pandas ####
pdf["C"] = 0
pdf

Unnamed: 0,A,B,C
0,1,11,0
1,2,22,0
2,3,33,0


In [39]:
#### Spark ####
sdf.withColumn("C", F.lit(0)).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1| 11|  0|
|  2| 22|  0|
|  3| 33|  0|
+---+---+---+



### add an arbitrary range of values

In [40]:
#### Pandas ####
pdf["C"] = range(0, len(pdf))
pdf

Unnamed: 0,A,B,C,D
0,1,11,0,0
1,2,22,0,1
2,3,33,0,2


#### Spark ####
## PROBLEM !
# check out this : https://stackoverflow.com/questions/36132899/pyspark-add-a-column-to-dataframe-when-column-is-a-list

### add a column derived from another column

In [17]:
#### Pandas ####
# 1
pdf["C"] = pdf.A * 2
pdf

Unnamed: 0,A,B,C
0,1,11,2
1,2,22,4
2,3,33,6


In [19]:
# 2
pdf["C"] = pdf.A % 2 == 0
pdf

Unnamed: 0,A,B,C
0,1,11,False
1,2,22,True
2,3,33,False


In [64]:
#### Spark ####
# 1
# multiple ways to reference a column
# sdf.withColumn("C", F.column("A") * 2).show()
# sdf.withColumn("C", F.col("A") * 2).show()
# sdf.withColumn("C", sdf["A"] * 2).show()
sdf.withColumn("C", sdf.A * 2).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  1| 11|  2|
|  2| 22|  4|
|  3| 33|  6|
+---+---+---+



In [9]:
# 2
sdf = sdf.withColumn("C", sdf.A % 2 == 0)
sdf.show()

+---+---+-----+
|  A|  B|    C|
+---+---+-----+
|  1| 11|false|
|  2| 22| true|
|  3| 33|false|
+---+---+-----+



# Filtering

In [18]:
#### Pandas ####
pdf[(pdf.A % 3 == 0) | (pdf['C'] )]

Unnamed: 0,A,B,C
0,1,11,2
1,2,22,4
2,3,33,6


In [134]:
pdf.loc[1:, ("B", "C")]

Unnamed: 0,B,C
1,22,True
2,33,False


In [122]:
#### Spark ####
sdf.filter((sdf.A % 3 == 0) | sdf["C"]).show()

+---+---+-----+
|  A|  B|    C|
+---+---+-----+
|  2| 22| true|
|  3| 33|false|
+---+---+-----+



In [127]:
sdf.filter((sdf.A % 3 == 0) | sdf["C"]).select(F.col("A") % 3, sdf.A % 3 == 0).show()

+-------+-------------+
|(A % 3)|((A % 3) = 0)|
+-------+-------------+
|      2|        false|
|      0|         true|
+-------+-------------+



# Get the summary statistics

In [152]:
#### Pandas ####
pdf.describe()

Unnamed: 0,A,B
count,3.0,3.0
mean,2.0,22.0
std,1.0,11.0
min,1.0,11.0
25%,1.5,16.5
50%,2.0,22.0
75%,2.5,27.5
max,3.0,33.0


In [26]:
pdf.quantile([.10, .25, .50, .75, .90])

Unnamed: 0,A,B,C
0.1,1.2,13.2,0.0
0.25,1.5,16.5,0.0
0.5,2.0,22.0,0.0
0.75,2.5,27.5,0.5
0.9,2.8,30.8,0.8


In [151]:
#### Spark ####
sdf.describe().show()

+-------+---+----+
|summary|  A|   B|
+-------+---+----+
|  count|  3|   3|
|   mean|2.0|22.0|
| stddev|1.0|11.0|
|    min|  1|  11|
|    max|  3|  33|
+-------+---+----+



### PROBLEM! There is no build-in function in Spark for computing quantiles

# Aggregations

In [172]:
#### Pandas ####
pdf.groupby(["C"]).sum()

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4,44
True,2,22


In [184]:
pdf.groupby("C").agg({"A": "sum", "B": "max"})

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4,33
True,2,22


In [179]:
#### Spark ####
# one aggregation
sdf.groupby("C").sum().show()

+-----+------+------+
|    C|sum(A)|sum(B)|
+-----+------+------+
| true|     2|    22|
|false|     4|    44|
+-----+------+------+



In [181]:
# 2
# multiple aggregations in one shot
sdf.groupby("C").agg(F.sum("A"), F.max("B")).show()

+-----+------+------+
|    C|sum(A)|max(B)|
+-----+------+------+
| true|     2|    22|
|false|     4|    33|
+-----+------+------+



In [11]:
# 3
# multiple aggregations in one shot using expressions
exprs = []

for col in sdf.columns :
    exprs.append(F.min(col).alias("MIN of "+ col))
    exprs.append(F.max(col).alias("MAX of "+ col))
    
print(exprs)

sdf.groupBy("C").agg(*exprs).show()

[Column<(min(A),mode=Complete,isDistinct=false) AS MIN of A#19>, Column<(max(A),mode=Complete,isDistinct=false) AS MAX of A#20>, Column<(min(B),mode=Complete,isDistinct=false) AS MIN of B#21>, Column<(max(B),mode=Complete,isDistinct=false) AS MAX of B#22>, Column<(min(C),mode=Complete,isDistinct=false) AS MIN of C#23>, Column<(max(C),mode=Complete,isDistinct=false) AS MAX of C#24>]
+-----+--------+--------+--------+--------+--------+--------+
|    C|MIN of A|MAX of A|MIN of B|MAX of B|MIN of C|MAX of C|
+-----+--------+--------+--------+--------+--------+--------+
| true|       2|       2|      22|      22|    true|    true|
|false|       1|       3|      11|      33|   false|   false|
+-----+--------+--------+--------+--------+--------+--------+



# Complex operations & Windows

In [5]:
from pyspark.sql.window import Window

sdf2 = sqlCtx.createDataFrame([(1, 4), (1, 11), (1,5), (2, 6), (2, 12), (3, 0)], ["A", "B"])
pdf2 = sdf2.toPandas()
sdf2.show()
pdf2

+---+---+
|  A|  B|
+---+---+
|  1|  4|
|  1| 11|
|  1|  5|
|  2|  6|
|  2| 12|
|  3|  0|
+---+---+



Unnamed: 0,A,B
0,1,4
1,1,11
2,1,5
3,2,6
4,2,12
5,3,0


### Compute a difference between rows of the same column

In [193]:
#### Pandas ####
pdf2["diff"] = pdf2.B.diff()
pdf2

Unnamed: 0,A,B,diff
0,1,4,
1,1,5,1.0
2,2,6,1.0
3,2,6,0.0
4,3,0,-6.0


In [217]:
#### Spark ####
window_over_A = Window.partitionBy("A").orderBy("B")
sdf2.withColumn("diff", F.lead("B").over(window_over_A) - sdf2.B).show()

+---+---+----+
|  A|  B|diff|
+---+---+----+
|  1|  4|   1|
|  1|  5|   6|
|  1| 11|null|
|  2|  6|   6|
|  2| 12|null|
|  3|  0|null|
+---+---+----+



### Offset a column with a lag

In [213]:
pdf2 = pdf2.sort_values(by=["A", "B"])
pdf2["B_offset"] = pdf2.B.shift()
pdf2

Unnamed: 0,A,B,B_offset
0,1,4,
2,1,5,4.0
1,1,11,5.0
3,2,6,11.0
4,2,12,6.0
5,3,0,12.0


In [215]:
sdf2.withColumn("B_offset", F.lead("B").over(window_over_A)).show()

+---+---+--------+
|  A|  B|B_offset|
+---+---+--------+
|  1|  4|       5|
|  1|  5|      11|
|  1| 11|    null|
|  2|  6|      12|
|  2| 12|    null|
|  3|  0|    null|
+---+---+--------+

