In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession once and take the SparkContext from it, so a single
# context exists and the application name is configured in exactly one place.
# (Creating a SparkContext first and then asking the builder for a different
# appName leaves two conflicting names, because getOrCreate() reuses the
# context that already exists.)
spark = SparkSession.builder.appName("DataFrame_d1").getOrCreate()
sc = spark.sparkContext

In [23]:
# Sample records: (staff name, age, department, salary).
staff = [
    ("mike", 30, "finance", 24000),
    ("lee", 34, "develop", 36000),
    ("allen", 36, "manager", 40000),
]

In [24]:
# Turn the staff tuples into a DataFrame, naming the four columns explicitly.
df_staff = spark.createDataFrame(
    staff,
    schema=['staff', 'age', 'dept', 'salary'],
)

In [25]:
#from pyspark.sql import Column

In [26]:
# Attribute access on a DataFrame (df.<column name>) yields a Column object.
type(df_staff.staff)

pyspark.sql.column.Column

In [27]:
# Displaying a Column shows its expression (Column<salary>), not row values.
df_staff.salary

Column<salary>

In [28]:
# Project two columns, renaming `staff` to `name` in the result rows.
df_staff.select(
    df_staff['staff'].alias('name'),
    df_staff['dept'],
).collect()

[Row(name=u'mike', dept=u'finance'),
 Row(name=u'lee', dept=u'develop'),
 Row(name=u'allen', dept=u'manager')]

In [34]:
salary_asc=df_staff.select(df_staff.staff,df_staff.salary).orderBy(df_staff.staff.asc())

In [35]:
type(salary_asc)

pyspark.sql.dataframe.DataFrame

In [36]:
salary_asc.collect()

[Row(staff=u'allen', salary=40000),
 Row(staff=u'lee', salary=36000),
 Row(staff=u'mike', salary=24000)]

In [37]:
# Small DataFrame with a missing value in each column, used below to
# demonstrate null ordering.
df = spark.createDataFrame(
    [('Tom', 80), (None, 60), ('Alice', None)],
    schema=["name", "height"],
)

In [38]:
# asc_nulls_first(): ascending sort on name with the null name placed before
# the non-null ones.
df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()

[Row(name=None), Row(name=u'Alice'), Row(name=u'Tom')]

In [39]:
# Cast the numeric age column to string; the cast column keeps the name `age`.
d=df_staff.select(df_staff.staff,df_staff.age.cast('string'))

In [40]:
# Fetch the rows of d to inspect the cast result (ages come back as strings).
d.collect()

[Row(staff=u'mike', age=u'30'),
 Row(staff=u'lee', age=u'34'),
 Row(staff=u'allen', age=u'36')]

In [43]:
# contains('ee'): per-row boolean telling whether the staff name includes 'ee'.
df_staff.select(
    df_staff['staff'],
    df_staff['staff'].contains('ee'),
).collect()

[Row(staff=u'mike', contains(staff, ee)=False),
 Row(staff=u'lee', contains(staff, ee)=True),
 Row(staff=u'allen', contains(staff, ee)=False)]

In [49]:
# like('deve%'): SQL LIKE pattern match on dept, printed as a table by show().
df_staff.select(
    df_staff['staff'],
    df_staff['dept'].like('deve%'),
).show()

+-----+---------------+
|staff|dept LIKE deve%|
+-----+---------------+
| mike|          false|
|  lee|           true|
|allen|          false|
+-----+---------------+



In [52]:
# startswith('m'): per-row boolean telling whether the staff name begins with 'm'.
df_staff.select(
    df_staff['staff'],
    df_staff['salary'],
    df_staff['staff'].startswith('m'),
).collect()

[Row(staff=u'mike', salary=24000, startswith(staff, m)=True),
 Row(staff=u'lee', salary=36000, startswith(staff, m)=False),
 Row(staff=u'allen', salary=40000, startswith(staff, m)=False)]

In [54]:
# Use endswith('e') as a filter predicate: keep only rows whose staff name
# ends with 'e'.
name_ends_with_e = df_staff['staff'].endswith('e')
df_staff.filter(name_ends_with_e).collect()

[Row(staff=u'mike', age=30, dept=u'finance', salary=24000),
 Row(staff=u'lee', age=34, dept=u'develop', salary=36000)]

In [60]:
from pyspark.sql import Row
# Build a standalone Row from keyword arguments (no DataFrame involved).
row = Row( name='苹果',unit_price=20, amount=10)

In [61]:
# A Row built by hand is a pyspark.sql.types.Row instance.
type(row)

pyspark.sql.types.Row

In [62]:
# The repr lists the fields sorted by name (amount, name, unit_price) rather
# than in the order they were passed to Row(...).
row

Row(amount=10, name='\xe8\x8b\xb9\xe6\x9e\x9c', unit_price=20)

In [64]:
# Convert the Row into a mapping keyed by field name.
row_dict=row.asDict()

In [65]:
# asDict() returns a built-in dict.
type(row_dict)

dict

In [66]:
# Inspect the dict: it holds the same field/value pairs as the Row.
row_dict

{'amount': 10, 'name': '\xe8\x8b\xb9\xe6\x9e\x9c', 'unit_price': 20}

In [74]:
# Build a DataFrame from Row objects; column names come from the Row fields
# and one value is deliberately missing.
df = spark.createDataFrame([
    Row(no=1, value='foo'),
    Row(no=2, value=None),
    Row(no=2, value='fun'),
])

In [80]:
# Fetch every row as a list of Row objects; the missing value is None.
df.collect()

[Row(no=1, value=u'foo'), Row(no=2, value=None), Row(no=2, value=u'fun')]

In [72]:
# List the DataFrame's column names. (getItem() is a Column method for
# array/map columns; a DataFrame has no such attribute, so calling it here
# raises AttributeError on a fresh run.)
df.columns

['no', 'value']

In [77]:
# Take only the first row instead of collecting the whole DataFrame to the
# driver and indexing into the list. (first() returns None for an empty
# DataFrame rather than raising IndexError.)
row_1 = df.first()

In [78]:
# A row taken from a DataFrame is a pyspark.sql.types.Row as well.
type(row_1)

pyspark.sql.types.Row

In [79]:
# `in` on this Row tests field names: 'no' is one of row_1's fields.
'no' in row_1

True

In [81]:
# 'name' is not a field of row_1 (its fields are `no` and `value`).
'name' in row_1

False

In [84]:
from pyspark.sql import functions as F

In [96]:
# when(...).otherwise(...): conditional column, 1 where no > 1 and 0 elsewhere.
# No alias is given, so the result column is named after the CASE WHEN expression.
df.select(df.no,df.value,F.when(df.no > 1,1).otherwise(0)).collect()

[Row(no=1, value=u'foo', CASE WHEN (no > 1) THEN 1 ELSE 0 END=0),
 Row(no=2, value=None, CASE WHEN (no > 1) THEN 1 ELSE 0 END=1),
 Row(no=2, value=u'fun', CASE WHEN (no > 1) THEN 1 ELSE 0 END=1)]

In [98]:
# asDict() works the same way on a Row that came out of a DataFrame.
row_1.asDict()

{'no': 1, 'value': u'foo'}

In [106]:
# Distribute three (id, name, age, eyeColor) tuples as an RDD.
data = spark.sparkContext.parallelize([
    (123, 'Katie', 19, 'brown'),
    (234, 'Michael', 22, 'green'),
    (345, 'Simone', 23, 'blue'),
])

In [107]:
from pyspark.sql.types import *

In [105]:
# Explicit schema for the (id, name, age, eyeColor) records; every field is nullable.
schema = StructType([
    StructField("id", LongType(), nullable=True),
    StructField("name", StringType(), nullable=True),
    StructField("age", LongType(), nullable=True),
    StructField("eyeColor", StringType(), nullable=True),
])

In [108]:
df = spark.createDataFrame(data, schema)  # create the DataFrame with an explicitly specified schema

In [109]:
# Inspect the schema attached to df; it mirrors the StructType passed in.
df.schema

StructType(List(StructField(id,LongType,true),StructField(name,StringType,true),StructField(age,LongType,true),StructField(eyeColor,StringType,true)))

In [114]:
# One door record: colour, material and a list of three integer parameters.
d1 = [
    ("red", "wood", [100, 200, 20]),
]

In [115]:
# Schema for the door record: two string fields plus an array-of-int field.
# Nullability is left at the default for every field.
schema = (
    StructType()
    .add('door_color', StringType())
    .add('door_material', StringType())
    .add('door_param', ArrayType(IntegerType()))
)

In [116]:
df2 = spark.createDataFrame(d1, schema)  # create the DataFrame with an explicitly specified schema

In [117]:
# show() prints the DataFrame as a text table; the array column is rendered inline.
df2.show()

+----------+-------------+--------------+
|door_color|door_material|    door_param|
+----------+-------------+--------------+
|       red|         wood|[100, 200, 20]|
+----------+-------------+--------------+



In [119]:
# jsonValue(): the schema as nested plain Python dicts/lists
# (name, type, nullable and metadata per field).
schema.jsonValue()

{'fields': [{'metadata': {},
   'name': 'door_color',
   'nullable': True,
   'type': 'string'},
  {'metadata': {},
   'name': 'door_material',
   'nullable': True,
   'type': 'string'},
  {'metadata': {},
   'name': 'door_param',
   'nullable': True,
   'type': {'containsNull': True, 'elementType': 'integer', 'type': 'array'}}],
 'type': 'struct'}

In [120]:
# needConversion() returns True for this StructType.
# NOTE(review): presumably this flags types whose values must be converted
# between Python objects and Spark's internal representation — confirm in the docs.
schema.needConversion()

True

In [122]:
# One door record whose three dimensions are each stored as a single-entry dict.
d2 = [
    (
        "red",
        "wood",
        {"door_width": 100},
        {"door_height": 200},
        {"door_thickness": 20},
    ),
]

In [126]:
# Schema for the map-based door record: two nullable string fields followed by
# three non-nullable string->int maps whose values may not be null either.
schema = StructType([
    StructField('door_color', StringType()),
    StructField('door_material', StringType()),
    StructField("width", MapType(StringType(), IntegerType(), valueContainsNull=False), nullable=False),
    StructField("height", MapType(StringType(), IntegerType(), valueContainsNull=False), nullable=False),
    StructField("thickness", MapType(StringType(), IntegerType(), valueContainsNull=False), nullable=False),
])

In [127]:
df3 = spark.createDataFrame(d2, schema)  # create the DataFrame with an explicitly specified schema

In [129]:
# Fetch the single row; each map column comes back as a Python dict.
df3.collect()

[Row(door_color=u'red', door_material=u'wood', width={u'door_width': 100}, height={u'door_height': 200}, thickness={u'door_thickness': 20})]

In [131]:
# simpleString(): compact one-line rendering of the schema (struct<name:type,...>).
schema.simpleString()

'struct<door_color:string,door_material:string,width:map<string,int>,height:map<string,int>,thickness:map<string,int>>'

In [132]:
# Tree view of the schema, including the nullable / valueContainsNull flags.
df3.printSchema()

root
 |-- door_color: string (nullable = true)
 |-- door_material: string (nullable = true)
 |-- width: map (nullable = false)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)
 |-- height: map (nullable = false)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)
 |-- thickness: map (nullable = false)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = false)



In [133]:
# Current storage level of df3; every flag is False in the output below.
# NOTE(review): presumably this means df3 has not been cached/persisted — confirm.
df3.storageLevel

StorageLevel(False, False, False, False, 1)

In [134]:
# Column names as a plain list of strings, in schema order.
df3.columns

['door_color', 'door_material', 'width', 'height', 'thickness']

In [135]:
# (column name, type string) pairs for every column.
df3.dtypes

[('door_color', 'string'),
 ('door_material', 'string'),
 ('width', 'map<string,int>'),
 ('height', 'map<string,int>'),
 ('thickness', 'map<string,int>')]

In [139]:
# Collecting through the underlying RDD yields the same Row as df3.collect().
df3.rdd.collect()

[Row(door_color=u'red', door_material=u'wood', width={u'door_width': 100}, height={u'door_height': 200}, thickness={u'door_thickness': 20})]

In [140]:
# isStreaming is False for df3, which was built from a local list.
df3.isStreaming

False