In [1]:
from pyspark.sql import SparkSession

# Build the notebook's SparkSession via the builder; getOrCreate() hands back
# the session used by every later cell as `spark`.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is needed at all.
spark = (
    SparkSession.builder
    .appName("Python data audit example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Load the bank dataset with Spark's built-in CSV reader. The first line of
# the file is treated as the header and column types are inferred from the
# data. (The previous version went through the legacy
# 'com.databricks.spark.csv' source name and set the header option twice.)
df = spark.read.csv("../data/bank.csv", header=True, inferSchema=True)

# Preview the first five rows; three columns are dropped from the preview
# only -- `df` itself still contains them.
df.drop('day', 'month', 'poutcome').show(5)

+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
|age|        job|marital|education|default|balance|housing|loan| contact|duration|campaign|pdays|previous|  y|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
| 30| unemployed|married|  primary|     no|   1787|     no|  no|cellular|      79|       1|   -1|       0| no|
| 33|   services|married|secondary|     no|   4789|    yes| yes|cellular|     220|       1|  339|       4| no|
| 35| management| single| tertiary|     no|   1350|    yes|  no|cellular|     185|       1|  330|       1| no|
| 30| management|married| tertiary|     no|   1476|    yes| yes| unknown|     199|       4|   -1|       0| no|
| 59|blue-collar|married|secondary|     no|      0|    yes|  no| unknown|     226|       1|   -1|       0| no|
+---+-----------+-------+---------+-------+-------+-------+----+--------+--------+--------+-----+--------+---+
only showing top 5 rows

In [3]:
# Print the column names, inferred types and nullability of the loaded frame.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [26]:
# Collect the Spark DataType object of every column, in schema order.
# NOTE(review): the AttributeError recorded for this cell does not appear to
# come from this line alone -- the cell may have contained more code when it
# was run; confirm against the original notebook.
types = [field.dataType for field in df.schema.fields]

AttributeError: 'IntegerType' object has no attribute 'encode'

In [29]:
# Display the schema as a list of (column name, type string) pairs.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [30]:
# Turn the (name, type) pairs into a two-column DataFrame so the schema can be
# displayed and queried like any other table.
out = spark.createDataFrame(df.dtypes, ['Names', 'Types'])

In [31]:
# Render the Names / Types table built from df.dtypes.
out.show()

+---------+------+
|    Names| Types|
+---------+------+
|      age|   int|
|      job|string|
|  marital|string|
|education|string|
|  default|string|
|  balance|   int|
|  housing|string|
|     loan|string|
|  contact|string|
|      day|   int|
|    month|string|
| duration|   int|
| campaign|   int|
|    pdays|   int|
| previous|   int|
| poutcome|string|
|        y|string|
+---------+------+



In [39]:
# Summary statistics (count, mean, stddev, min, max) for the `job` column only.
df.describe('job').show()

+-------+-------+
|summary|    job|
+-------+-------+
|  count|   4521|
|   mean|   null|
| stddev|   null|
|    min| admin.|
|    max|unknown|
+-------+-------+



In [47]:
# Import the functions module under an alias instead of pulling `min` and
# `max` into the notebook namespace, where they would shadow Python's builtins
# for every cell executed afterwards.
from pyspark.sql import functions as F

# Show mean / min / max one column at a time (one small table per column).
# As the recorded output shows, string columns report a null mean.
for col_name in df.columns:
    df.select(
        F.mean(col_name).alias('mean'),
        F.min(col_name).alias('min'),
        F.max(col_name).alias('max'),
    ).show()

+-----------------+---+---+
|             mean|min|max|
+-----------------+---+---+
|41.17009511170095| 19| 87|
+-----------------+---+---+

+----+------+-------+
|mean|   min|    max|
+----+------+-------+
|null|admin.|unknown|
+----+------+-------+

+----+--------+------+
|mean|     min|   max|
+----+--------+------+
|null|divorced|single|
+----+--------+------+

+----+-------+-------+
|mean|    min|    max|
+----+-------+-------+
|null|primary|unknown|
+----+-------+-------+

+----+---+---+
|mean|min|max|
+----+---+---+
|null| no|yes|
+----+---+---+

+------------------+-----+-----+
|              mean|  min|  max|
+------------------+-----+-----+
|1422.6578190665782|-3313|71188|
+------------------+-----+-----+

+----+---+---+
|mean|min|max|
+----+---+---+
|null| no|yes|
+----+---+---+

+----+---+---+
|mean|min|max|
+----+---+---+
|null| no|yes|
+----+---+---+

+----+--------+-------+
|mean|     min|    max|
+----+--------+-------+
|null|cellular|unknown|
+----+--------+-------+

+

In [38]:
# Contingency table: one row per distinct `age`, one count column per value of
# `default`.
df.stat.crosstab('age', 'default').show()

+-----------+---+---+
|age_default| no|yes|
+-----------+---+---+
|         69|  6|  0|
|         56| 72|  2|
|         42|138|  3|
|         24| 23|  1|
|         37|158|  3|
|         25| 43|  1|
|         52| 86|  0|
|         20|  3|  0|
|         46|119|  0|
|         57| 87|  4|
|         78|  3|  0|
|         29| 97|  0|
|         84|  1|  0|
|         61| 16|  0|
|         74|  3|  0|
|         60| 47|  0|
|         28|102|  1|
|         38|158|  1|
|         70|  7|  0|
|         21|  7|  0|
+-----------+---+---+
only showing top 20 rows



In [49]:
# Synthetic 100-row frame for the frequent-items example: even-indexed rows
# are the constant (1, 2, 3), odd-indexed rows vary with the index.
# NOTE: this rebinds `df`, replacing the bank DataFrame loaded earlier.
df = spark.createDataFrame(
    [(idx, 2 * idx, idx % 4) if idx % 2 else (1, 2, 3) for idx in range(100)],
    ["a", "b", "c"],
)
df.show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  1|  2|  1|
|  1|  2|  3|
|  3|  6|  3|
|  1|  2|  3|
|  5| 10|  1|
|  1|  2|  3|
|  7| 14|  3|
|  1|  2|  3|
|  9| 18|  1|
|  1|  2|  3|
| 11| 22|  3|
|  1|  2|  3|
| 13| 26|  1|
|  1|  2|  3|
| 15| 30|  3|
|  1|  2|  3|
| 17| 34|  1|
|  1|  2|  3|
| 19| 38|  3|
+---+---+---+
only showing top 20 rows



In [50]:
# Using the synthetic DataFrame above, ask for the items in each of the
# columns a, b and c that occur with a support of 0.4 (40% of rows).
# The recorded output lists 11 for column `a` even though it occurs once, so
# treat the result as candidates rather than exact counts.
freq = df.stat.freqItems(cols=["a", "b", "c"], support=0.4)
freq.show()

+-----------+-----------+-----------+
|a_freqItems|b_freqItems|c_freqItems|
+-----------+-----------+-----------+
|    [11, 1]|    [2, 22]|     [1, 3]|
+-----------+-----------+-----------+

