In [15]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

In [3]:
# create sparksession
ss = SparkSession \
    .builder \
    .appName("Manipulation DataFrame") \
    .master("local[2]") \
    .getOrCreate()

In [4]:
# load data
df = ss.read.json("./data/data.json")

## EDA

### 显示数据类型
显示数据类型的方法，可以使用 `printSchema()` 以及 `dtypes` 属性。前者是返回树形结构，后者是得到一个键值形式列表

In [5]:
# 显示 Schema
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- altitude: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- exact_location: long (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- indoor: long (nullable = true)
 |    |-- latitude: string (nullable = true)
 |    |-- longitude: string (nullable = true)
 |-- sampling_rate: string (nullable = true)
 |-- sensor: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- pin: string (nullable = true)
 |    |-- sensor_type: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- manufacturer: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- sensordatavalues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- value: string (nullable = true)
 |    |    |-- value_type: string (nullable = true)
 |-- timestamp: string (nullab

In [11]:
df.dtypes

[('id', 'bigint'),
 ('location',
  'struct<altitude:string,country:string,exact_location:bigint,id:bigint,indoor:bigint,latitude:string,longitude:string>'),
 ('sampling_rate', 'string'),
 ('sensor',
  'struct<id:bigint,pin:string,sensor_type:struct<id:bigint,manufacturer:string,name:string>>'),
 ('sensordatavalues',
  'array<struct<id:bigint,value:string,value_type:string>>'),
 ('timestamp', 'string')]

### 显示前几行数据
有多种方法显示前几行数据，`head()`以及特殊的方法 `first()`、`show()` 等。需要注意返回的结果类型不是完全相同的。

In [6]:
df.head(2)

[Row(id=5756852209, location=Row(altitude='104.9', country='UA', exact_location=0, id=22256, indoor=1, latitude='50.51', longitude='30.798'), sampling_rate=None, sensor=Row(id=36214, pin='7', sensor_type=Row(id=9, manufacturer='various', name='DHT22')), sensordatavalues=[Row(id=12224991603, value='10.00', value_type='temperature'), Row(id=12224991604, value='50.70', value_type='humidity')], timestamp='2019-12-13 11:10:02'),
 Row(id=5756852208, location=Row(altitude='111.8', country='GB', exact_location=1, id=21003, indoor=0, latitude='53.87869338867', longitude='-1.45841360092'), sampling_rate=None, sensor=Row(id=34792, pin='11', sensor_type=Row(id=17, manufacturer='Bosch', name='BME280')), sensordatavalues=[Row(id=12224991602, value='96357.16', value_type='pressure'), Row(id=12224991605, value='1.93', value_type='temperature'), Row(id=12224991606, value='100.00', value_type='humidity'), Row(id=None, value='97702.38', value_type='pressure_at_sealevel')], timestamp='2019-12-13 11:10:02'

In [7]:
df.first()

Row(id=5756852209, location=Row(altitude='104.9', country='UA', exact_location=0, id=22256, indoor=1, latitude='50.51', longitude='30.798'), sampling_rate=None, sensor=Row(id=36214, pin='7', sensor_type=Row(id=9, manufacturer='various', name='DHT22')), sensordatavalues=[Row(id=12224991603, value='10.00', value_type='temperature'), Row(id=12224991604, value='50.70', value_type='humidity')], timestamp='2019-12-13 11:10:02')

In [9]:
df.show(n=2)

+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|        id|            location|sampling_rate|              sensor|    sensordatavalues|          timestamp|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
|5756852209|[104.9, UA, 0, 22...|         null|[36214, 7, [9, va...|[[12224991603, 10...|2019-12-13 11:10:02|
|5756852208|[111.8, GB, 1, 21...|         null|[34792, 11, [17, ...|[[12224991602, 96...|2019-12-13 11:10:02|
+----------+--------------------+-------------+--------------------+--------------------+-------------------+
only showing top 2 rows



### 显示数据列名称
通过显示 `schema` 和 `dtypes` 可以显示列名称，需要直接得到列名称可以直接通过 `columns` 属性获取

In [13]:
df.columns

['id', 'location', 'sampling_rate', 'sensor', 'sensordatavalues', 'timestamp']

### 缺失值统计
DataFrame 没有自带的检测是否为缺失值方法，需要通过调用 `functions` 模块中的方法可以进行统计缺失值

In [18]:
from pyspark.sql import functions
from pyspark.sql import types

In [33]:
df.select(functions.sum(functions.isnan("sampling_rate").cast("integer")).alias("sampling_rate")).show()

+-------------+
|sampling_rate|
+-------------+
|            0|
+-------------+

