In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

In [6]:
# Obtain the SparkSession for this notebook (an existing session is reused
# if one is already running), registered under the app name "test1".
spark = (
    SparkSession.builder
    .appName("test1")
    .getOrCreate()
)

In [7]:
# Field definitions for the demo schema: every column is nullable.
s = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("salary", DoubleType()),
    )
]


In [8]:
# Wrap the list of StructField objects into the schema used by createDataFrame.
schema = StructType(fields=s)

In [11]:
# Build a two-row DataFrame against the explicit schema.
# Note: a DoubleType column rejects a Python int, so 100 raises an error;
# the value must be passed as the float 100.0.
df = spark.createDataFrame(
    [("a", 1, 100.0), ("b", 2, 200.0)],
    schema,
)

In [13]:
# select() yields a new DataFrame holding only the requested column;
# nothing is printed beyond its repr until .show() is called.
df.select("name")

DataFrame[name: string]

In [14]:
# df["xxx"] differs from df.select("xxx"): select() returns a DataFrame,
# whereas direct indexing returns a Column object.
df["name"]

Column<b'name'>

In [16]:
# df.head() returns one row by default; pass n to request more.
# Asking for more rows than the data holds is not an error: all rows are returned.
df.head(10)

[Row(name='a', age=1, salary=100.0), Row(name='b', age=2, salary=200.0)]

In [17]:
# describe() returns a DataFrame of summary statistics (all columns typed as
# string); the cell output is only its repr, not the statistics themselves.
df.describe()

DataFrame[summary: string, name: string, age: string, salary: string]

In [18]:
# Print the basic summary statistics of the DataFrame
# (count, mean, stddev, min, max for each column).
df.describe().show()

+-------+----+------------------+-----------------+
|summary|name|               age|           salary|
+-------+----+------------------+-----------------+
|  count|   2|                 2|                2|
|   mean|null|               1.5|            150.0|
| stddev|null|0.7071067811865476|70.71067811865476|
|    min|   a|                 1|            100.0|
|    max|   b|                 2|            200.0|
+-------+----+------------------+-----------------+



In [19]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: double (nullable = true)



In [20]:
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.functions import lit

# Deal with missing data

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
# Get the session for the missing-data section; getOrCreate() hands back the
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("test1")
    .getOrCreate()
)

In [17]:
# Sample employee rows with deliberately missing (None) names and salaries,
# used below to demonstrate dropping and filling nulls.
df = spark.createDataFrame(
    data=[
        ('emp1', 'John', None),
        ('emp2', None, None),
        ('emp1', None, 345.0),
        ('emp1', 'Cindy', 456.0),
    ],
    schema=StructType([
        StructField("pos", StringType(), True),
        StructField("name", StringType(), True),
        StructField("salary", DoubleType(), True),
    ]),
)

In [18]:
# Display the rows as a text table; missing values are rendered as "null".
df.show()

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1| John|  null|
|emp2| null|  null|
|emp1| null| 345.0|
|emp1|Cindy| 456.0|
+----+-----+------+



In [19]:
# Drop every row that contains at least one null value.
# df1 is just another reference to df; dropna() returns a new DataFrame
# and leaves the original untouched.
df1 = df
df1.dropna().show()

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1|Cindy| 456.0|
+----+-----+------+



In [23]:
# Keep only the rows that have at least 2 non-null values (thresh=2);
# the row with a single non-null field is removed.
df2 = df
df2.dropna(thresh=2).show()

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1| John|  null|
|emp1| null| 345.0|
|emp1|Cindy| 456.0|
+----+-----+------+



In [24]:
# Drop rows only when the 'salary' column is null; nulls in other
# columns are ignored.
df3 = df
df3.dropna(subset=['salary']).show()

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1| null| 345.0|
|emp1|Cindy| 456.0|
+----+-----+------+



In [25]:
# Show the column types first: a string fill value is applied to string
# columns only, so the double column 'salary' keeps its nulls.
df.printSchema()
df.fillna('FILL VALUE').show()

root
 |-- pos: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+----+----------+------+
| pos|      name|salary|
+----+----------+------+
|emp1|      John|  null|
|emp2|FILL VALUE|  null|
|emp1|FILL VALUE| 345.0|
|emp1|     Cindy| 456.0|
+----+----------+------+



In [26]:
# Show the column types first: a numeric fill value is applied to numeric
# columns only, so the string column 'name' keeps its nulls.
df.printSchema()
df.fillna(0).show()

root
 |-- pos: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1| John|   0.0|
|emp2| null|   0.0|
|emp1| null| 345.0|
|emp1|Cindy| 456.0|
+----+-----+------+



In [27]:
# Replace missing values in the 'name' column only.
# The subset entry uses the schema's exact lower-case column name so the fill
# does not depend on Spark's case-insensitive column resolution.
df.printSchema()
df.na.fill('No name', subset=['name']).show()

root
 |-- pos: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: double (nullable = true)

+----+-------+------+
| pos|   name|salary|
+----+-------+------+
|emp1|   John|  null|
|emp2|No name|  null|
|emp1|No name| 345.0|
|emp1|  Cindy| 456.0|
+----+-------+------+



In [28]:
from pyspark.sql.functions import mean

In [33]:
# Average salary over the non-null entries (mean() skips nulls).
avg_df = df.select(mean(df['salary']).alias('avg'))
avg_df.show()
# show() only prints the table and returns None, so the numeric value has to
# be read from the collected row rather than from show()'s return value.
mean_val = avg_df.collect()[0]['avg']

+-----+
|  avg|
+-----+
|400.5|
+-----+



In [36]:
# Impute missing salaries with the column mean: the aggregate produces a
# single row whose first field is the mean of the non-null salaries.
df.fillna(
    df.select(mean(df['salary'])).first()[0],
    subset=['salary'],
).show()

+----+-----+------+
| pos| name|salary|
+----+-----+------+
|emp1| John| 400.5|
|emp2| null| 400.5|
|emp1| null| 345.0|
|emp1|Cindy| 456.0|
+----+-----+------+



# Spark with dates and timestamps

In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, 
                                   month, year, weekofyear, format_number, date_format)

In [38]:
# Session handle for the dates/timestamps section, requested under the
# app name 'dates_test'.
spark = (
    SparkSession.builder
    .appName('dates_test')
    .getOrCreate()
)

In [None]:
data = 