In [4]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession (application name 'scratching');
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('scratching')
    .getOrCreate()
)

In [5]:
# NUMERIC TYPES IN SPARK
# df = spark.range(0,20,1,1)

# BIGINT (8-byte signed integer, from -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807)
# Spark type: `long` >> types.LongType
# df_bigint = df.selectExpr("cast(id as BIGINT) big_id")
# df_bigint.show()
# df_bigint.printSchema()

# TINYINT (1-byte signed integer, from -128 to 127)
# Spark type: `byte` >> types.ByteType
# df_tinyint = df.selectExpr("cast(id as TINYINT) tiny_id")
# df_tinyint.show()
# df_tinyint.printSchema()

# SMALLINT (2-byte signed integer, from -32,768 to 32,767)
# Spark type: `short` >> types.ShortType
# df_smallint = df.selectExpr("cast(id as SMALLINT) small_id")
# df_smallint.show()
# df_smallint.printSchema()

# INT (4-byte signed integer, from -2,147,483,648 to 2,147,483,647)
# Spark type: `integer` >> types.IntegerType
# df_int = df.selectExpr("cast(id as INT) int_id")
# df_int.show()
# df_int.printSchema()

# FLOAT (4-byte single precision floating point number)
# Spark type: `float` >> types.FloatType
# df_float = df.selectExpr("cast(id as FLOAT) float_id")
# df_float.show()
# df_float.printSchema()

# DOUBLE (8-byte double precision floating point number)
# Spark type: `double` >> types.DoubleType
# df_double = df.selectExpr("cast(id as DOUBLE) double_id")
# df_double.show()
# df_double.printSchema()

# DECIMAL (Hive 0.13.0 introduced user definable precision and scale)
# Spark type: `decimal(10,1)` >> types.DecimalType
# df_decimal = df.selectExpr("cast(id as DECIMAL(10,1)) decimal_id")
# df_decimal.show()
# df_decimal.printSchema()

# All numeric types together in one DataFrame
# All numeric types together in one DataFrame.
# Expose ids 1..9 (single partition) as the temp view "ids" so SQL can read it.
spark.range(start=1, end=10, step=1, numPartitions=1).createOrReplaceTempView("ids")

# Cast the same `id` column to every numeric SQL type, side by side.
df_numerics = spark.sql(
    """
    SELECT
        id,
        cast(id as BIGINT) big_id,
        cast(id as TINYINT) tiny_id,
        cast(id as SMALLINT) small_id,
        cast(id as INT) int_id,
        cast(id as FLOAT) float_id,
        cast(id as DOUBLE) double_id,
        cast(id as DECIMAL(10,1)) decimal_id
    FROM ids
    """
)

# Inspect the rows and the Spark type each cast produced.
df_numerics.show()
df_numerics.printSchema()

# Summary statistics, per-column totals, and the first three Row objects.
df_numerics.describe().show()
df_numerics.groupBy().sum().show()
print(df_numerics.take(3))

+---+------+-------+--------+------+--------+---------+----------+
| id|big_id|tiny_id|small_id|int_id|float_id|double_id|decimal_id|
+---+------+-------+--------+------+--------+---------+----------+
|  1|     1|      1|       1|     1|     1.0|      1.0|       1.0|
|  2|     2|      2|       2|     2|     2.0|      2.0|       2.0|
|  3|     3|      3|       3|     3|     3.0|      3.0|       3.0|
|  4|     4|      4|       4|     4|     4.0|      4.0|       4.0|
|  5|     5|      5|       5|     5|     5.0|      5.0|       5.0|
|  6|     6|      6|       6|     6|     6.0|      6.0|       6.0|
|  7|     7|      7|       7|     7|     7.0|      7.0|       7.0|
|  8|     8|      8|       8|     8|     8.0|      8.0|       8.0|
|  9|     9|      9|       9|     9|     9.0|      9.0|       9.0|
+---+------+-------+--------+------+--------+---------+----------+

root
 |-- id: long (nullable = false)
 |-- big_id: long (nullable = false)
 |-- tiny_id: byte (nullable = false)
 |-- small_id: 

In [10]:
# Median (0.5 quantile) of `id`; relativeError=0 asks for the exact quantile.
df_numerics.approxQuantile(col='id', probabilities=[0.5], relativeError=0)

[5.0]

In [11]:
# Correlation between `id` and its TINYINT cast (identical values in this data).
df_numerics.corr(col1='id', col2='tiny_id')

1.0

In [13]:
# List the column names of df_numerics, in SELECT order.
df_numerics.columns

['id',
 'big_id',
 'tiny_id',
 'small_id',
 'int_id',
 'float_id',
 'double_id',
 'decimal_id']

In [19]:
# Project df_numerics down in two steps, printing the intermediate column list.
df = df_numerics.select(['id', 'big_id'])
print(df.columns)
# Second projection keeps only `id`; the bare name echoes the DataFrame repr.
df = df.select(['id'])
df

['id', 'big_id']


DataFrame[id: bigint]

In [18]:
# Sample covariance of `id` with its BIGINT cast.
# NOTE(review): `df` is rebound to a plain float here, not a DataFrame.
df = df_numerics.cov(col1='id', col2='big_id')
df

7.5

In [23]:
# Convert the DataFrame to an RDD of JSON strings (one document per row).
# Only the handle is kept; nothing is collected or displayed in this cell.
rdd = df_numerics.toJSON()

In [26]:
# Show the column names of df_numerics.
df_numerics.columns

['id',
 'big_id',
 'tiny_id',
 'small_id',
 'int_id',
 'float_id',
 'double_id',
 'decimal_id']

In [29]:
# Obtain an iterator over all rows of df_numerics for consumption on the driver.
# The iterator is only stored here; this cell does not iterate it.
iter_df = df_numerics.toLocalIterator()

In [35]:
# Inspect the DataFrame's current storage level (its persistence settings).
df_numerics.storageLevel

StorageLevel(False, False, False, False, 1)

In [36]:
# Show the schema as a StructType (field name, Spark type, nullability).
df_numerics.schema

StructType(List(StructField(id,LongType,false),StructField(big_id,LongType,false),StructField(tiny_id,ByteType,false),StructField(small_id,ShortType,false),StructField(int_id,IntegerType,false),StructField(float_id,FloatType,false),StructField(double_id,DoubleType,false),StructField(decimal_id,DecimalType(10,1),true)))

In [39]:
# Mark df_numerics as non-persistent and keep the value unpersist() returns.
# NOTE(review): unpersist() returns the same DataFrame, so df_unp should alias df_numerics.
df_unp = df_numerics.unpersist()

In [41]:
# Print the rows of df_unp to confirm the data is still readable after unpersist().
df_unp.show()

+---+------+-------+--------+------+--------+---------+----------+
| id|big_id|tiny_id|small_id|int_id|float_id|double_id|decimal_id|
+---+------+-------+--------+------+--------+---------+----------+
|  1|     1|      1|       1|     1|     1.0|      1.0|       1.0|
|  2|     2|      2|       2|     2|     2.0|      2.0|       2.0|
|  3|     3|      3|       3|     3|     3.0|      3.0|       3.0|
|  4|     4|      4|       4|     4|     4.0|      4.0|       4.0|
|  5|     5|      5|       5|     5|     5.0|      5.0|       5.0|
|  6|     6|      6|       6|     6|     6.0|      6.0|       6.0|
|  7|     7|      7|       7|     7|     7.0|      7.0|       7.0|
|  8|     8|      8|       8|     8|     8.0|      8.0|       8.0|
|  9|     9|      9|       9|     9|     9.0|      9.0|       9.0|
+---+------+-------+--------+------+--------+---------+----------+



In [45]:
# Identifier of the RDD backing df_unp (unique within the SparkContext).
df_unp.rdd.id()

63

In [47]:
# Number of rows in df_unp.
df_unp.count()

9

In [48]:
# Whether df_unp is backed by a streaming source (this one is a batch DataFrame).
df_unp.isStreaming

False

In [52]:
# Frequent items for `id` and `small_id`, using the default support threshold.
df_numerics.freqItems(cols=['id', 'small_id']).show()

+--------------------+--------------------+
|        id_freqItems|  small_id_freqItems|
+--------------------+--------------------+
|[8, 2, 5, 4, 7, 1...|[8, 2, 5, 4, 7, 1...|
+--------------------+--------------------+



In [55]:
# Contingency table of `id` (rows) against `small_id` (columns).
df_numerics.crosstab(col1='id', col2='small_id').show()

+-----------+---+---+---+---+---+---+---+---+---+
|id_small_id|  1|  2|  3|  4|  5|  6|  7|  8|  9|
+-----------+---+---+---+---+---+---+---+---+---+
|          5|  0|  0|  0|  0|  1|  0|  0|  0|  0|
|          1|  1|  0|  0|  0|  0|  0|  0|  0|  0|
|          6|  0|  0|  0|  0|  0|  1|  0|  0|  0|
|          9|  0|  0|  0|  0|  0|  0|  0|  0|  1|
|          2|  0|  1|  0|  0|  0|  0|  0|  0|  0|
|          7|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|          3|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|          8|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|          4|  0|  0|  0|  1|  0|  0|  0|  0|  0|
+-----------+---+---+---+---+---+---+---+---+---+



In [56]:
# Mark df_numerics for caching at the default storage level.
# cache() returns the DataFrame, so the notebook echoes its repr.
df_numerics.cache()

DataFrame[id: bigint, big_id: bigint, tiny_id: tinyint, small_id: smallint, int_id: int, float_id: float, double_id: double, decimal_id: decimal(10,1)]