In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.types import *
# Create (or reuse) a single SparkSession for the notebook and take the
# SparkContext from it. Constructing SparkContext(...) directly raises when this
# cell is run a second time in the same kernel, and an app name given to the
# builder afterwards cannot rename a context that is already running, so one
# app name is set here, once.
spark = SparkSession.builder.appName("DataFrame_d2").getOrCreate()
sc = spark.sparkContext

In [3]:
# Staff rows as (id, name, age, job, salary); jane has neither age nor salary.
staff = [
    (1, 'mike', 30, 'finance', 24000),
    (2, 'lee', 34, 'develop', 36000),
    (3, 'allen', 36, 'manager', 40000),
    (4, 'jane', None, 'CFO', None),
]

# Every field is declared nullable; only salary uses LongType.
staff_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("job", StringType()),
        ("salary", LongType()),
    )
])

staff_df = spark.createDataFrame(staff, staff_schema)

In [4]:
# User rows as (id, name, city, region); user 2 has no name.
# The region values are non-ASCII, so they are written as unicode literals.
# The recorded crossJoin output shows them as mis-decoded bytes (e.g. 'æé³'),
# which is consistent with plain byte strings being handed to Spark under
# Python 2. On Python 3 the u prefix changes nothing.
user = [
    (1, 'mike', 'BeiJin', u'朝阳'),
    (2, None, 'ShangHai', u'徐汇'),
    (3, 'allen', 'GuangZhou', u'天河'),
    (4, 'jane', 'ShenZhen', u'福田'),
]

# All four fields are declared nullable.
user_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("region", StringType(), True),
])

user_df = spark.createDataFrame(user, user_schema)

In [7]:
# Cartesian product: every staff_df row paired with every user_df row.
# Both inputs have `id` and `name` columns, so the result carries each of them twice.
cross_data=staff_df.crossJoin(user_df)

In [8]:
# Print the cross-joined rows as a text table.
cross_data.show()

+---+-----+----+-------+------+---+-----+---------+------+
| id| name| age|    job|salary| id| name|     city|region|
+---+-----+----+-------+------+---+-----+---------+------+
|  1| mike|  30|finance| 24000|  1| mike|   BeiJin|  朝阳|
|  1| mike|  30|finance| 24000|  2| null| ShangHai|  徐汇|
|  1| mike|  30|finance| 24000|  3|allen|GuangZhou|  天河|
|  1| mike|  30|finance| 24000|  4| jane| ShenZhen|  福田|
|  2|  lee|  34|develop| 36000|  1| mike|   BeiJin|  朝阳|
|  2|  lee|  34|develop| 36000|  2| null| ShangHai|  徐汇|
|  2|  lee|  34|develop| 36000|  3|allen|GuangZhou|  天河|
|  2|  lee|  34|develop| 36000|  4| jane| ShenZhen|  福田|
|  3|allen|  36|manager| 40000|  1| mike|   BeiJin|  朝阳|
|  3|allen|  36|manager| 40000|  2| null| ShangHai|  徐汇|
|  3|allen|  36|manager| 40000|  3|allen|GuangZhou|  天河|
|  3|allen|  36|manager| 40000|  4| jane| ShenZhen|  福田|
|  4| jane|null|    CFO|  null|  1| mike|   BeiJin|  朝阳|
|  4| jane|null|    CFO|  null|  2| null| ShangHai|  徐汇|

In [12]:
# (id, name) pairs that occur in user_df but not in staff_df.
except_data = (
    user_df.select('id', 'name')
    .exceptAll(staff_df.select('id', 'name'))
)

In [13]:
# Print the (id, name) pairs left over after the exceptAll.
except_data.show()

+---+----+
| id|name|
+---+----+
|  2|null|
+---+----+



In [16]:
# (id, name) pairs that occur in both frames. User 2 does not qualify because
# its name is null in user_df but 'lee' in staff_df.
(
    user_df.select('id', 'name')
    .intersectAll(staff_df.select('id', 'name'))
    .show()
)

+---+-----+
| id| name|
+---+-----+
|  4| jane|
|  3|allen|
|  1| mike|
+---+-----+



In [18]:
# Left join on id: every staff row is kept, and city is taken from the user row
# with the same id. The columns are picked through their parent frames because
# `name` exists on both sides.
(
    staff_df
    .join(user_df, staff_df.id == user_df.id, 'left')
    .select(staff_df.name, staff_df.salary, user_df.city)
    .show()
)

+-----+------+---------+
| name|salary|     city|
+-----+------+---------+
| mike| 24000|   BeiJin|
|allen| 40000|GuangZhou|
| jane|  null| ShenZhen|
|  lee| 36000| ShangHai|
+-----+------+---------+



In [23]:
from pyspark.sql import functions as F

# Two spellings of the same aggregation: the dict form and the column-function
# form. Only the last expression of a cell is echoed, so the dict-form result
# used to be computed and then silently dropped. Both results are kept here and
# checked against each other before one of them is displayed.
min_salary_by_dict = staff_df.agg({'salary': 'min'}).collect()
min_salary_by_func = staff_df.agg(F.min(staff_df.salary)).collect()
assert min_salary_by_dict == min_salary_by_func, "dict and F.min aggregations disagree"
min_salary_by_func

[Row(min(salary)=24000)]

In [43]:
# Projection of staff_df holding only age and salary, each cast to double.
# The name `t` is kept because later cells refer to it.
t = staff_df.select(
    *(staff_df[numeric_col].cast(DoubleType()) for numeric_col in ('age', 'salary'))
)

In [46]:
# Correlation between age and salary on the double-cast projection `t`.
# NOTE(review): the recorded value (0.9714...) matches a computation over all
# 4 rows with jane's null age/salary counted as 0; the three complete rows alone
# give about 0.996. Confirm that including the null row is intended.
t.corr('age','salary')

0.9714027646697837

In [47]:
# Same correlation on the original integer/long columns; the recorded result is
# identical to the one obtained from the double-cast projection.
# NOTE(review): the recorded value matches a computation in which jane's null
# age/salary count as 0 (complete rows only give about 0.996) — confirm intent.
staff_df.corr('age','salary')

0.9714027646697837

In [48]:
# Number of rows in staff_df; the row with null age and salary is included.
staff_df.count()

4

In [49]:
# Sample covariance of age and salary on the original columns.
# NOTE(review): the recorded 294666.67 equals the sample covariance over all
# 4 rows with jane's nulls counted as 0; the three complete rows alone give
# about 25333.33. Confirm that including the null row is intended.
staff_df.cov('age','salary')

294666.6666666667

In [50]:
# Same covariance on the double-cast projection `t`; the recorded result is
# identical to the one from staff_df.
# NOTE(review): the recorded value matches a computation in which the null
# age/salary row counts as 0 — confirm intent.
t.cov('age','salary')

294666.6666666667

In [54]:
# Inspect what cube() hands back: a GroupedData object rather than a DataFrame.
type(staff_df.cube('age'))

pyspark.sql.group.GroupedData

In [57]:
# Print count / mean / stddev / min / max for every column. In the recorded
# output count is 3 for age and salary (the null row is not counted), and
# mean/stddev are null for the string columns.
staff_df.describe().show()

+-------+------------------+-----+------------------+-------+------------------+
|summary|                id| name|               age|    job|            salary|
+-------+------------------+-----+------------------+-------+------------------+
|  count|                 4|    4|                 3|      4|                 3|
|   mean|               2.5| null|33.333333333333336|   null|33333.333333333336|
| stddev|1.2909944487358056| null| 3.055050463303893|   null|  8326.66399786453|
|    min|                 1|allen|                30|    CFO|             24000|
|    max|                 4| mike|                36|manager|             40000|
+-------+------------------+-----+------------------+-------+------------------+



In [59]:
# groupBy() without keys treats the whole frame as one group, so avg() returns
# a single row with the averages of the numeric columns (id, age, salary).
staff_df.groupBy().avg().collect()

[Row(avg(id)=2.5, avg(age)=33.333333333333336, avg(salary)=33333.333333333336)]

In [64]:
# Same describe() summary as cell In [57], printed again.
staff_df.describe().show()
# Rows ordered by name, then by salary (ascending in the recorded output).
staff_df.sort('name','salary').show()

+-------+------------------+-----+------------------+-------+------------------+
|summary|                id| name|               age|    job|            salary|
+-------+------------------+-----+------------------+-------+------------------+
|  count|                 4|    4|                 3|      4|                 3|
|   mean|               2.5| null|33.333333333333336|   null|33333.333333333336|
| stddev|1.2909944487358056| null| 3.055050463303893|   null|  8326.66399786453|
|    min|                 1|allen|                30|    CFO|             24000|
|    max|                 4| mike|                36|manager|             40000|
+-------+------------------+-----+------------------+-------+------------------+

+---+-----+----+-------+------+
| id| name| age|    job|salary|
+---+-----+----+-------+------+
|  3|allen|  36|manager| 40000|
|  4| jane|null|    CFO|  null|
|  2|  lee|  34|develop| 36000|
|  1| mike|  30|finance| 24000|
+---+-----+----+-------+------+



In [65]:
# Like describe(), but only the requested statistics are computed. In the
# recorded output the 25% / 75% rows are null for the string columns.
staff_df.summary("count", "min", "25%", "75%", "max").show()

+-------+---+-----+---+-------+------+
|summary| id| name|age|    job|salary|
+-------+---+-----+---+-------+------+
|  count|  4|    4|  3|      4|     3|
|    min|  1|allen| 30|    CFO| 24000|
|    25%|  1| null| 30|   null| 24000|
|    75%|  3| null| 36|   null| 40000|
|    max|  4| mike| 36|manager| 40000|
+-------+---+-----+---+-------+------+

