In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf

sparkSession = SparkSession.builder.appName('DF test') \
                            .config(conf=SparkConf() \
                                   .setMaster('yarn')) \
                                    .enableHiveSupport() \
                                    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/30 17:45:18 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
df = sparkSession.read.parquet('/Datasets/health')
df.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           1|           1|03/06/2020|     97|        1| 72|      97|       2|   2|     2|      2|           1|            2|             2|      1|            1|      2|                   5| 97|  2|
|    2|           1|           2|09/06/2020|      1|        2| 55|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|

                                                                                

#### pyspark.sql.DataFrame.agg
Aggregate on the entire DataFrame without groups (shorthand for df.groupBy().agg()).

In [6]:
df.agg({'AGE':'max'}).show()

+--------+
|max(AGE)|
+--------+
|     121|
+--------+



In [7]:
df.agg({'AGE':'avg'}).show()

+------------------+
|          avg(AGE)|
+------------------+
|41.794102472403026|
+------------------+



In [8]:
df.agg(F.count(df.AGE)).show()

+----------+
|count(AGE)|
+----------+
|   1048575|
+----------+



#### pyspark.sql.DataFrame.alias
This DF alias is useful while doing join operation, bcoz in both DF sometime we have same names of the column

In [10]:
df = df.alias('Patient')

In [11]:
df.select('Patient.AGE', 'Patient.SEX').show(5)

+---+---+
|AGE|SEX|
+---+---+
| 72|  2|
| 55|  2|
| 68|  2|
| 24|  2|
| 30|  2|
+---+---+
only showing top 5 rows



In [12]:
df.select('AGE', 'SEX').show(5)

+---+---+
|AGE|SEX|
+---+---+
| 72|  2|
| 55|  2|
| 68|  2|
| 24|  2|
| 30|  2|
+---+---+
only showing top 5 rows




#### pyspark.sql.DataFrame.approxQuantile

Can be a single column name, or a list of names for multiple columns.

a list of quantile probabilities Each number must belong to [0, 1]. For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
        relativeErrorfloat
The relative target precision to achieve (>= 0). If set to zero, the exact quantiles are computed, which could be very expensive. Note that values greater than 1 are accepted but give the same result as 1.

the approximate quantiles at the given probabilities. If the input col is a string, the output is a list of floats. If the input col is a list or tuple of strings, the output is also a list, but each element in it is a list of floats, i.e., the output is a list of list of floats.



In [14]:
df.approxQuantile('AGE',[0.50],0)

                                                                                

[40.0]

In [16]:
df.approxQuantile('AGE',[0.25],0)

                                                                                

[30.0]

In [17]:
df.approxQuantile('AGE',[0.75],0)

                                                                                

[53.0]

#### pyspark.sql.DataFrame.cache
Persists the DataFrame with the default storage level (MEMORY_AND_DISK).

In [20]:
df.cache()

22/12/29 14:35:13 WARN CacheManager: Asked to cache already cached data.


DataFrame[USMER: int, MEDICAL_UNIT: int, PATIENT_TYPE: int, DATE_DIED: string, INTUBED: int, PNEUMONIA: int, AGE: int, PREGNANT: int, DIABETES: int, COPD: int, ASTHMA: int, INMSUPR: int, HIPERTENSION: int, OTHER_DISEASE: int, CARDIOVASCULAR: int, OBESITY: int, RENAL_CHRONIC: int, TOBACCO: int, CLASIFFICATION_FINAL: int, ICU: int, SEX: int]

#### pyspark.sql.DataFrame.checkpoint
Returns a checkpointed version of this DataFrame. Checkpointing can be used to truncate the logical plan of this DataFrame, which is especially useful in iterative algorithms where the plan may grow exponentially. It will be saved to files inside the checkpoint directory set with SparkContext.setCheckpointDir().

In [22]:
sparkSession.sparkContext.setCheckpointDir('/Spark_checkpointing_dir/')

22/12/29 14:36:37 WARN SparkContext: Spark is not running in local mode, therefore the checkpoint directory must not be on the local filesystem. Directory '/Spark_checkpointing_dir/' appears to be on the local filesystem.


In [23]:
df.checkpoint()

                                                                                

DataFrame[USMER: int, MEDICAL_UNIT: int, PATIENT_TYPE: int, DATE_DIED: string, INTUBED: int, PNEUMONIA: int, AGE: int, PREGNANT: int, DIABETES: int, COPD: int, ASTHMA: int, INMSUPR: int, HIPERTENSION: int, OTHER_DISEASE: int, CARDIOVASCULAR: int, OBESITY: int, RENAL_CHRONIC: int, TOBACCO: int, CLASIFFICATION_FINAL: int, ICU: int, SEX: int]

#### coalesce()

RDD coalesce() is used only to reduce the number of partitions. This is an optimized or improved version of repartition() where the movement of the data across the partitions is lower using coalesce.

Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions. If a larger number of partitions is requested, it will stay at the current number of partitions.

However, if you’re doing a drastic coalesce, e.g. to numPartitions = 1, this may result in your computation taking place on fewer nodes than you like (e.g. one node in the case of numPartitions = 1). To avoid this, you can call repartition(). This will add a shuffle step, but means the current upstream partitions will be executed in parallel (per whatever the current partitioning is).

In [25]:
df.rdd.getNumPartitions()

2

In [26]:
df.rdd.coalesce(1).getNumPartitions()

1

#### pyspark.sql.DataFrame.colRegex¶
Selects column based on the column name specified as a regex and returns it as 

In [33]:
df.select(df.colRegex("`SE+.`")).show()

+---+
|SEX|
+---+
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
|  2|
+---+
only showing top 20 rows



#### pyspark.sql.DataFrame.collect
Returns all the records as a list of Row.

In [35]:
df.collect()

                                                                                

[Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='03/06/2020', INTUBED=97, PNEUMONIA=1, AGE=72, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=1, RENAL_CHRONIC=1, TOBACCO=2, CLASIFFICATION_FINAL=5, ICU=97, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=2, DATE_DIED='09/06/2020', INTUBED=1, PNEUMONIA=2, AGE=55, PREGNANT=97, DIABETES=1, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=2, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=2, RENAL_CHRONIC=2, TOBACCO=2, CLASIFFICATION_FINAL=3, ICU=2, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='21/06/2020', INTUBED=97, PNEUMONIA=2, AGE=68, PREGNANT=97, DIABETES=1, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=2, RENAL_CHRONIC=2, TOBACCO=2, CLASIFFICATION_FINAL=3, ICU=97, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=2, DATE_DIED='9999-99-99', INTUBED=2, PNEUMONIA=2, AGE=24, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA

#### pyspark.sql.DataFrame.columns
Returns all column names as a list.

In [36]:
df.columns

['USMER',
 'MEDICAL_UNIT',
 'PATIENT_TYPE',
 'DATE_DIED',
 'INTUBED',
 'PNEUMONIA',
 'AGE',
 'PREGNANT',
 'DIABETES',
 'COPD',
 'ASTHMA',
 'INMSUPR',
 'HIPERTENSION',
 'OTHER_DISEASE',
 'CARDIOVASCULAR',
 'OBESITY',
 'RENAL_CHRONIC',
 'TOBACCO',
 'CLASIFFICATION_FINAL',
 'ICU',
 'SEX']

#### pyspark.sql.DataFrame.corr
Calculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. DataFrame.corr() and DataFrameStatFunctions.corr() are aliases of each other.

In [40]:
df.corr('DIABETES','HIPERTENSION')

0.8350178504029976

In [41]:
df.corr('AGE','HIPERTENSION')

-0.00496418462374755

we can see +ve correlation for Diabetes and hypertension

#### pyspark.sql.DataFrame.count¶
Returns the number of rows in this DataFrame.

In [42]:
df.count()

1048575

#### pyspark.sql.DataFrame.cov¶
Calculate the sample covariance for the given columns, specified by their names, as a double value. DataFrame.cov() and DataFrameStatFunctions.cov() are aliases.

In [43]:
df.cov('AGE','HIPERTENSION')

-0.4394981469164739

In [44]:
df.cov('DIABETES','HIPERTENSION')

23.71741610231534

#### pyspark.sql.DataFrame.createGlobalTempView¶
Creates a global temporary view with this DataFrame.

    The lifetime of this temporary view is tied to this Spark application. throws TempTableAlreadyExistsException, if the view name already exists in the catalog.

In [52]:
#df.createGlobalTempView('global_tempview_patient')

In [51]:
sparkSession.sql("select AGE, SEX from global_temp.global_tempview_patient;").show()

+---+---+
|AGE|SEX|
+---+---+
| 72|  2|
| 55|  2|
| 68|  2|
| 24|  2|
| 30|  2|
| 55|  2|
| 61|  2|
| 54|  2|
| 59|  2|
| 30|  2|
| 45|  2|
| 24|  2|
| 32|  2|
| 49|  2|
| 39|  2|
| 27|  2|
| 45|  2|
| 25|  2|
| 37|  2|
| 45|  2|
+---+---+
only showing top 20 rows



In [54]:
sparkSession.sql("show tables from global_temp;").show(truncate=False)

+-----------+-----------------------+-----------+
|namespace  |tableName              |isTemporary|
+-----------+-----------------------+-----------+
|global_temp|global_tempview_patient|true       |
+-----------+-----------------------+-----------+



In [62]:
sparkSession.catalog.dropGlobalTempView("global_tempview_patient")

True

#### pyspark.sql.DataFrame.createOrReplaceGlobalTempView¶

Creates or replaces a global temporary view using the given name.

The lifetime of this temporary view is tied to this Spark application.

In [56]:
df.createOrReplaceGlobalTempView('global_tempview_patient')

In [57]:
sparkSession.sql("select AGE, SEX from global_temp.global_tempview_patient;").show(5)

+---+---+
|AGE|SEX|
+---+---+
| 72|  2|
| 55|  2|
| 68|  2|
| 24|  2|
| 30|  2|
+---+---+
only showing top 5 rows



#### pyspark.sql.DataFrame.createOrReplaceTempView
Creates or replaces a local temporary view with this DataFrame.

The lifetime of this temporary table is tied to the SparkSession that was used to create this DataFrame.

In [60]:
df.createOrReplaceTempView("patient12")

In [61]:
sparkSession.sql("select AGE, SEX from patient12;").show(5)

+---+---+
|AGE|SEX|
+---+---+
| 72|  2|
| 55|  2|
| 68|  2|
| 24|  2|
| 30|  2|
+---+---+
only showing top 5 rows



#### pyspark.sql.DataFrame.createTempView

Creates a local temporary view with this DataFrame.

The lifetime of this temporary table is tied to the SparkSession that was used to create this DataFrame. throws TempTableAlreadyExistsException, if the view name already exists in the catalog.

In [64]:
df.createTempView("patient12")

In [66]:
sparkSession.catalog.dropTempView("patient12")

True

#### pyspark.sql.DataFrame.crossJoin
As the number of fields is growing in each industry, in each Data sources. It is almost impossible to store all the variables in single Data table. So ideally we received Data tables in multiple files. In these situation, whenever there is a need to bring variables together in one table, merge or join is helpful. Cross join creates a table with cartesian product of observation between two tables. For each row of table 1, a mapping takes place with each row of table 2. The below article discusses how to Cross join Dataframes in Pyspark.

In [73]:
df1 = sparkSession.createDataFrame([('nilesh', 24),('pawan', 22)],schema=['Name','Age'])
df2 = sparkSession.createDataFrame([('math',1),('bio',2),('history',3)], schema=['Subject', 'Id'])

In [74]:
df2.show()

+-------+---+
|Subject| Id|
+-------+---+
|   math|  1|
|    bio|  2|
|history|  3|
+-------+---+



In [75]:
df1.show()

+------+---+
|  Name|Age|
+------+---+
|nilesh| 24|
| pawan| 22|
+------+---+



In [76]:
df1.crossJoin(df2).show()

+------+---+-------+---+
|  Name|Age|Subject| Id|
+------+---+-------+---+
|nilesh| 24|   math|  1|
|nilesh| 24|    bio|  2|
|nilesh| 24|history|  3|
| pawan| 22|   math|  1|
| pawan| 22|    bio|  2|
| pawan| 22|history|  3|
+------+---+-------+---+



#### pyspark.sql.DataFrame.crosstab
Computes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. The first column of each row will be the distinct values of col1 and the column names will be the distinct values of col2. The name of the first column will be col1_

col2. Pairs that have no occurrences will have zero as their counts. DataFrame.crosstab() and DataFrameStatFunctions.crosstab() are aliases.

In [79]:
df.crosstab('DIABETES',"HIPERTENSION").show()

+---------------------+-----+------+----+
|DIABETES_HIPERTENSION|    1|     2|  98|
+---------------------+-----+------+----+
|                    2|96545|823398| 305|
|                   98|  178|   463|2697|
|                    1|66006| 58881| 102|
+---------------------+-----+------+----+



#### pyspark.sql.DataFrame.cube
Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregations on them.

https://stackoverflow.com/questions/37975227/what-is-the-difference-between-cube-rollup-and-groupby-operators

In [82]:
df1.cube('Name','Age').count().show()

+------+----+-----+
|  Name| Age|count|
+------+----+-----+
|nilesh|null|    1|
|  null|null|    2|
|nilesh|  24|    1|
|  null|  24|    1|
| pawan|null|    1|
| pawan|  22|    1|
|  null|  22|    1|
+------+----+-----+



#### pyspark.sql.DataFrame.describe¶
This include count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns.

In [83]:
df.describe('AGE','SEX','DIABETES').show()

+-------+------------------+-------------------+------------------+
|summary|               AGE|                SEX|          DIABETES|
+-------+------------------+-------------------+------------------+
|  count|           1048575|            1048575|           1048575|
|   mean|41.794102472403026| 1.4992594711870872|2.1864044059795438|
| stddev|16.907389199431208|0.49999969003548544|  5.42424178788836|
|    min|                 0|                  1|                 1|
|    max|               121|                  2|                98|
+-------+------------------+-------------------+------------------+



In [84]:
df.describe().show()

                                                                                

+-------+------------------+-----------------+------------------+----------+-----------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-----------------+------------------+------------------+--------------------+-----------------+-------------------+
|summary|             USMER|     MEDICAL_UNIT|      PATIENT_TYPE| DATE_DIED|          INTUBED|         PNEUMONIA|               AGE|         PREGNANT|          DIABETES|              COPD|            ASTHMA|           INMSUPR|      HIPERTENSION|    OTHER_DISEASE|    CARDIOVASCULAR|          OBESITY|     RENAL_CHRONIC|           TOBACCO|CLASIFFICATION_FINAL|              ICU|                SEX|
+-------+------------------+-----------------+------------------+----------+-----------------+------------------+------------------+-----------------+------------------+------------------+----------------

#### pyspark.sql.DataFrame.distinct
Returns a new DataFrame containing the distinct rows in this DataFrame.

In [86]:
df.distinct().show()



+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           2|           1|9999-99-99|     97|        2| 85|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   7| 97|  2|
|    2|           2|           1|9999-99-99|     97|        2| 71|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

                                                                                

In [87]:
df.distinct().count()

                                                                                

236526

#### pyspark.sql.DataFrame.drop
Returns a new DataFrame that drops the specified column. This is a no-op if schema doesn’t contain the given column name(s).

In [88]:
df.columns

['USMER',
 'MEDICAL_UNIT',
 'PATIENT_TYPE',
 'DATE_DIED',
 'INTUBED',
 'PNEUMONIA',
 'AGE',
 'PREGNANT',
 'DIABETES',
 'COPD',
 'ASTHMA',
 'INMSUPR',
 'HIPERTENSION',
 'OTHER_DISEASE',
 'CARDIOVASCULAR',
 'OBESITY',
 'RENAL_CHRONIC',
 'TOBACCO',
 'CLASIFFICATION_FINAL',
 'ICU',
 'SEX']

In [89]:
df = df.drop('COPD')

In [90]:
df.columns

['USMER',
 'MEDICAL_UNIT',
 'PATIENT_TYPE',
 'DATE_DIED',
 'INTUBED',
 'PNEUMONIA',
 'AGE',
 'PREGNANT',
 'DIABETES',
 'ASTHMA',
 'INMSUPR',
 'HIPERTENSION',
 'OTHER_DISEASE',
 'CARDIOVASCULAR',
 'OBESITY',
 'RENAL_CHRONIC',
 'TOBACCO',
 'CLASIFFICATION_FINAL',
 'ICU',
 'SEX']

#### pyspark.sql.DataFrame.dropDuplicates¶
Return a new DataFrame with duplicate rows removed, optionally only considering certain columns.

For a static batch DataFrame, it just drops duplicate rows. For a streaming DataFrame, it will keep all data across triggers as intermediate state to drop duplicates rows. You can use withWatermark() to limit how late the duplicate data can be and system will accordingly limit the state. In addition, too late data older than watermark will be dropped to avoid any possibility of duplicates.

drop_duplicates() is an alias for dropDuplicates().

In [93]:
from pyspark.sql import Row

df = sparkSession.sparkContext.parallelize([ \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=10, height=80)]).toDF()

df.show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [94]:
df.dropDuplicates().show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [95]:
df.dropDuplicates(['name', 'height']).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
+-----+---+------+



#### pyspark.sql.DataFrame.dropna
Returns a new DataFrame omitting rows with null values. DataFrame.dropna() and DataFrameNaFunctions.drop() are aliases of each other.


        how str, optional

            ‘any’ or ‘all’. If ‘any’, drop a row if it contains any nulls. If ‘all’, drop a row only if all its values are null.
        thresh: int, optional

            default None If specified, drop rows that have less than thresh non-null values. This overwrites the how parameter.
        subsetstr, tuple or list, optional

            optional list of column names to consider.



In [98]:
df = sparkSession.sparkContext.parallelize([ \
    Row(name='Alice', age=5, height=80), \
    Row(name='Alice', age=None, height=80), \
    Row(name='Alice', age=10, height=80),
    Row(name='BOB', age=None, height=80), \
    Row(name=None, age=None, height=None)]).toDF()

df.show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
|Alice|null|    80|
|Alice|  10|    80|
|  BOB|null|    80|
| null|null|  null|
+-----+----+------+



In [99]:
df.dropna().show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [101]:
df.dropna(how='all').show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
|Alice|null|    80|
|Alice|  10|    80|
|  BOB|null|    80|
+-----+----+------+



In [102]:
df.dropna(how='any').show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



#### pyspark.sql.DataFrame.dtypes
Returns all column names and their data types as a list.

In [106]:
df.dtypes

[('name', 'string'), ('age', 'bigint'), ('height', 'bigint')]

In [107]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- height: long (nullable = true)



#### pyspark.sql.DataFrame.explain
Prints the (logical and physical) plans to the console for debugging purpose.

    Parameters

        extended bool, optional

            default False. If False, prints only the physical plan. When this is a string without specifying the mode, it works as the mode is specified.
        modestr, optional

            specifies the expected output format of plans.

                simple: Print only a physical plan.

                extended: Print both logical and physical plans.

                codegen: Print a physical plan and generated codes if they are available.

                cost: Print a logical plan and statistics if they are available.

                formatted: Split explain output into two sections: a physical plan outline and node details.

            

In [108]:
df.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[name#10975,age#10976L,height#10977L]




In [109]:
df.explain(mode='simple')

== Physical Plan ==
*(1) Scan ExistingRDD[name#10975,age#10976L,height#10977L]




In [110]:
df.explain(mode='extended')

== Parsed Logical Plan ==
LogicalRDD [name#10975, age#10976L, height#10977L], false

== Analyzed Logical Plan ==
name: string, age: bigint, height: bigint
LogicalRDD [name#10975, age#10976L, height#10977L], false

== Optimized Logical Plan ==
LogicalRDD [name#10975, age#10976L, height#10977L], false

== Physical Plan ==
*(1) Scan ExistingRDD[name#10975,age#10976L,height#10977L]



In [111]:
df.explain(mode='codegen')

Found 1 WholeStageCodegen subtrees.
== Subtree 1 / 1 (maxMethodCodeSize:252; maxConstantPoolSize:115(0.18% used); numInnerClasses:0) ==
*(1) Scan ExistingRDD[name#10975,age#10976L,height#10977L]

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage1(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=1
/* 006 */ final class GeneratedIteratorForCodegenStage1 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator rdd_input_0;
/* 010 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] rdd_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[1];
/* 011 */
/* 012 */   public GeneratedIteratorForCodegenStage1(Object[] references) {
/* 013 */     this.references = references;
/* 014 */   }
/* 01

#### pyspark.sql.DataFrame.fillna

In [112]:
df.show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
|Alice|null|    80|
|Alice|  10|    80|
|  BOB|null|    80|
| null|null|  null|
+-----+----+------+



In [117]:
df.fillna(value='no name').show()

+-------+----+------+
|   name| age|height|
+-------+----+------+
|  Alice|   5|    80|
|  Alice|null|    80|
|  Alice|  10|    80|
|    BOB|null|    80|
|no name|null|  null|
+-------+----+------+



In [115]:
df.fillna(value=50).show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 50|    80|
|Alice| 10|    80|
|  BOB| 50|    80|
| null| 50|    50|
+-----+---+------+



In [118]:
df.na.fill({'name':'No Name', 'age':555, 'height':100}).show()

+-------+---+------+
|   name|age|height|
+-------+---+------+
|  Alice|  5|    80|
|  Alice|555|    80|
|  Alice| 10|    80|
|    BOB|555|    80|
|No Name|555|   100|
+-------+---+------+



In [119]:
df.na.drop(how='any').show()

+-----+---+------+
| name|age|height|
+-----+---+------+
|Alice|  5|    80|
|Alice| 10|    80|
+-----+---+------+



In [120]:
df.na.drop(how='all').show()

+-----+----+------+
| name| age|height|
+-----+----+------+
|Alice|   5|    80|
|Alice|null|    80|
|Alice|  10|    80|
|  BOB|null|    80|
+-----+----+------+



#### pyspark.sql.DataFrame.filter
Filters rows using the given condition.

    where() is an alias for filter().

In [122]:
df.select('*').filter(df.AGE >50).show()

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           1|           1|03/06/2020|     97|        1| 72|      97|       2|   2|     2|      2|           1|            2|             2|      1|            1|      2|                   5| 97|  2|
|    2|           1|           2|09/06/2020|      1|        2| 55|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|

In [123]:
df.filter(df.SEX == 2).count()

523511

In [124]:
df.filter(df.SEX != 2).count()

525064

In [125]:
df.where(df.AGE > 30).count()

767388

In [127]:
df.filter((df.SEX == 2)&(df.AGE > 30)).count()

386763

In [128]:
df.filter((df.SEX == 2)|(df.AGE > 30)).count()

904136

#### pyspark.sql.DataFrame.first

In [130]:
df.first()

Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='03/06/2020', INTUBED=97, PNEUMONIA=1, AGE=72, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=1, RENAL_CHRONIC=1, TOBACCO=2, CLASIFFICATION_FINAL=5, ICU=97, SEX=2)

#### pyspark.sql.DataFrame.foreach¶
Applies the f function to all Row of this DataFrame.

    This is a shorthand for df.rdd.foreach()

In [132]:
accum = sparkSession.sparkContext.accumulator(0)
df.foreach(lambda data: accum.add(data.AGE))

                                                                                

In [134]:
accum.value

43824251

#### pyspark.sql.DataFrame.groupBy
Groups the DataFrame using the specified columns, so we can run aggregation on them. See GroupedData for all the available aggregate functions.

    groupby() is an alias for groupBy().

In [3]:
data = sparkSession.read.csv('/Datasets/alcohol.csv', header=True, inferSchema=True)
data.show(2)

                                                                                

+---------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|  country|total_consumption|recorded_consumption|unrecorded_consumption|beer_percentage|wine_percentage|spirits_percentage|other_percentage|2020_projection|2025_projection|
+---------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|  Estonia|             16.9|                15.8|                   1.1|           32.7|            7.4|              50.3|             9.6|           11.5|           11.9|
|Lithuania|             15.0|                13.8|                   1.2|           43.6|            7.3|              37.1|            12.1|           14.4|           13.9|
+---------+-----------------+--------------------+----------------------+---------------+---------------+------------------+------

In [4]:
data.groupBy('country').count().show()

[Stage 5:>                                                          (0 + 1) / 1]

+-----------+-----+
|    country|count|
+-----------+-----+
|       Chad|    1|
|     Russia|    1|
|   Paraguay|    1|
|      Yemen|    1|
|    Senegal|    1|
|     Sweden|    1|
|   Kiribati|    1|
|     Guyana|    1|
|Philippines|    1|
|    Eritrea|    1|
|      Tonga|    1|
|   Djibouti|    1|
|  Singapore|    1|
|   Malaysia|    1|
|       Fiji|    1|
|     Turkey|    1|
|     Malawi|    1|
|       Iraq|    1|
|    Germany|    1|
|    Comoros|    1|
+-----------+-----+
only showing top 20 rows



                                                                                

In [6]:
df.columns

['USMER',
 'MEDICAL_UNIT',
 'PATIENT_TYPE',
 'DATE_DIED',
 'INTUBED',
 'PNEUMONIA',
 'AGE',
 'PREGNANT',
 'DIABETES',
 'COPD',
 'ASTHMA',
 'INMSUPR',
 'HIPERTENSION',
 'OTHER_DISEASE',
 'CARDIOVASCULAR',
 'OBESITY',
 'RENAL_CHRONIC',
 'TOBACCO',
 'CLASIFFICATION_FINAL',
 'ICU',
 'SEX']

In [7]:
df.groupBy('SEX').count().show()

+---+------+
|SEX| count|
+---+------+
|  1|525064|
|  2|523511|
+---+------+



In [12]:
df.groupBy('SEX').agg({'AGE':'max'}).show()



+---+--------+
|SEX|max(AGE)|
+---+--------+
|  1|     121|
|  2|     120|
+---+--------+



                                                                                

In [13]:
df.groupBy('SEX').agg({'AGE':'min'}).show()

+---+--------+
|SEX|min(AGE)|
+---+--------+
|  1|       0|
|  2|       0|
+---+--------+



In [14]:
df.groupBy('SEX').agg({'AGE':'avg'}).show()

+---+------------------+
|SEX|          avg(AGE)|
+---+------------------+
|  1|41.315197766367525|
|  2|42.274427853473945|
+---+------------------+



In [15]:
df.groupBy('SEX','DIABETES').count().show()



+---+--------+------+
|SEX|DIABETES| count|
+---+--------+------+
|  2|       2|457318|
|  1|       2|462930|
|  1|       1| 60745|
|  2|      98|  1949|
|  2|       1| 64244|
|  1|      98|  1389|
+---+--------+------+



In [19]:
df.groupBy('SEX','DIABETES').agg({'AGE':'max'}).show()

+---+--------+--------+
|SEX|DIABETES|max(AGE)|
+---+--------+--------+
|  2|       2|     120|
|  1|       2|     121|
|  1|       1|     108|
|  2|      98|     101|
|  2|       1|     120|
|  1|      98|     102|
+---+--------+--------+



In [20]:
df.groupBy('SEX','DIABETES').agg({'AGE':'avg'}).show()

+---+--------+------------------+
|SEX|DIABETES|          avg(AGE)|
+---+--------+------------------+
|  2|       2| 40.16971997603418|
|  1|       2| 39.25069664960145|
|  1|       1| 56.88099432052021|
|  2|      98|49.321190354027706|
|  2|       1| 57.04291451341759|
|  1|      98| 48.64074874010079|
+---+--------+------------------+



#### pyspark.sql.DataFrame.head
Returns the first n rows.

In [21]:
df.head()

Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='03/06/2020', INTUBED=97, PNEUMONIA=1, AGE=72, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=1, RENAL_CHRONIC=1, TOBACCO=2, CLASIFFICATION_FINAL=5, ICU=97, SEX=2)

In [22]:
df.head(5)

[Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='03/06/2020', INTUBED=97, PNEUMONIA=1, AGE=72, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=1, RENAL_CHRONIC=1, TOBACCO=2, CLASIFFICATION_FINAL=5, ICU=97, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=2, DATE_DIED='09/06/2020', INTUBED=1, PNEUMONIA=2, AGE=55, PREGNANT=97, DIABETES=1, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=2, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=2, RENAL_CHRONIC=2, TOBACCO=2, CLASIFFICATION_FINAL=3, ICU=2, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=1, DATE_DIED='21/06/2020', INTUBED=97, PNEUMONIA=2, AGE=68, PREGNANT=97, DIABETES=1, COPD=2, ASTHMA=2, INMSUPR=2, HIPERTENSION=1, OTHER_DISEASE=2, CARDIOVASCULAR=2, OBESITY=2, RENAL_CHRONIC=2, TOBACCO=2, CLASIFFICATION_FINAL=3, ICU=97, SEX=2),
 Row(USMER=2, MEDICAL_UNIT=1, PATIENT_TYPE=2, DATE_DIED='9999-99-99', INTUBED=2, PNEUMONIA=2, AGE=24, PREGNANT=97, DIABETES=2, COPD=2, ASTHMA

#### pyspark.sql.DataFrame.intersect
Return a new DataFrame containing rows only in both this DataFrame and another DataFrame.
This is equivalent to INTERSECT in SQL.

In [34]:
df1 = sparkSession.createDataFrame([('nilesh',23),('pawan', 21),('ramesh',23),('ram', 21)], schema=['Name','Age'])
df2 = sparkSession.createDataFrame([('rakesh',24),('pawan', 21),('nilesh',23),('raj', 21)], schema=['Name','Age'])

In [35]:
df1.intersect(df2).show()

+------+---+
|  Name|Age|
+------+---+
| pawan| 21|
|nilesh| 23|
+------+---+



#### pyspark.sql.DataFrame.intersectAll
Return a new DataFrame containing rows in both this DataFrame and another DataFrame while preserving duplicates.
This is equivalent to INTERSECT ALL in SQL. As standard in SQL, this function resolves columns by position (not by name).

In [36]:
df1.intersectAll(df2).show()

+------+---+
|  Name|Age|
+------+---+
| pawan| 21|
|nilesh| 23|
+------+---+



#### pyspark.sql.DataFrame.isEmpty

In [37]:
df1.isEmpty()

False

#### pyspark.sql.DataFrame.join

PySpark Join is used to combine two DataFrames and by chaining these you can join multiple DataFrames; it supports all basic join type operations available in traditional SQL like INNER, LEFT OUTER, RIGHT OUTER, LEFT ANTI, LEFT SEMI, CROSS, SELF JOIN. PySpark Joins are wider transformations that involve data shuffling across the network.

PySpark SQL Joins comes with more optimization by default (thanks to DataFrames) however still there would be some performance issues to consider while using. 



join() operation takes parameters as below and returns DataFrame.

    param other: Right side of the join
    param on: a string for the join column name
    param how: default inner. Must be one of inner, cross, outer,full, full_outer, left, left_outer, right, right_outer,left_semi, and left_anti.

You can also write Join expression by adding where() and filter() methods on DataFrame and can have Join on multiple columns. 

#### PySpark Inner Join DataFrame

Inner join is the default join in PySpark and it’s mostly used. This joins two datasets on key columns, where keys don’t match the rows get dropped from both datasets (emp & dept). 

In [4]:

emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

empDF = sparkSession.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
empDF.show(truncate=False)

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = sparkSession.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)


root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



                                                                                

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [42]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'inner').show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [43]:
empDF.join(other=deptDF, on='dept_id', how='inner').show() # keys names are different, need same keys name

AnalysisException: USING column `dept_id` cannot be resolved on the left side of the join. The left-side columns: [emp_id, name, superior_emp_id, year_joined, emp_dept_id, gender, salary]

#### PySpark Full Outer Join

Outer a.k.a full, fullouter join returns all rows from both datasets, where join expression doesn’t match it returns null on respective record columns. 

In [44]:
empDF.join(deptDF, empDF.emp_dept_id==deptDF.dept_id, 'outer').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [45]:
empDF.join(deptDF, empDF.emp_dept_id==deptDF.dept_id, 'fullouter').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [46]:
empDF.join(deptDF, empDF.emp_dept_id==deptDF.dept_id, 'full').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



#### PySpark Left Outer Join

Left a.k.a Leftouter join returns all rows from the left dataset regardless of match found on the right dataset when join expression doesn’t match, it assigns null for that record and drops records from right where match not found. 

In [47]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'left').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [48]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'leftouter').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



#### Right Outer Join

Right a.k.a Rightouter join is opposite of left join, here it returns all rows from the right dataset regardless of math found on the left dataset, when join expression doesn’t match, it assigns null for that record and drops records from left where match not found. 

In [49]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'right').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [50]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'rightouter').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



#### Left Semi Join

leftsemi join is similar to inner join difference being leftsemi join returns all columns from the left dataset and ignores all columns from the right dataset. In other words, this join returns columns from the only left dataset for the records match in the right dataset on join expression, records not matched on join expression are ignored from both left and right datasets.

The same result can be achieved using select on the result of the inner join however, using this join would be efficient. 

In [51]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'leftsemi').show()

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|
|     3|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|
|     2|    Rose|              1|       2010|         20|     M|  4000|
|     5|   Brown|              2|       2010|         40|      |    -1|
+------+--------+---------------+-----------+-----------+------+------+



#### Left Anti Join

leftanti join does the exact opposite of the leftsemi, leftanti join returns only columns from the left dataset for non-matched records.

In [53]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, 'leftanti').show()

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|     6|Brown|              2|       2010|         50|      |    -1|
+------+-----+---------------+-----------+-----------+------+------+



#### PySpark Self Join

Joins are not complete without a self join, Though there is no self-join type available, we can use any of the above-explained join types to join DataFrame to itself. below example use inner self join.

In [54]:
empDF.join(empDF, on='emp_id', how='inner').show()

+------+--------+---------------+-----------+-----------+------+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|   Smith|             -1|       2018|         10|     M|  3000|
|     2|    Rose|              1|       2010|         20|     M|  4000|    Rose|              1|       2010|         20|     M|  4000|
|     3|Williams|              1|       2010|         10|     M|  1000|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|   Jones|              2|       2005|         10|     F|  2000|
|     5|   Brown|              2|       2010|         4

#### pyspark.sql.DataFrame.limit¶
Limits the result count to the number specified.

In [56]:
df.limit(2).show()

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           1|           1|03/06/2020|     97|        1| 72|      97|       2|   2|     2|      2|           1|            2|             2|      1|            1|      2|                   5| 97|  2|
|    2|           1|           2|09/06/2020|      1|        2| 55|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|

#### pyspark.sql.DataFrame.orderBy

In [58]:
df.sort(df.AGE).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    1|           2|           2|9999-99-99|      2|        2|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   7|  2|  2|
|    1|           3|           2|9999-99-99|      2|        1|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|



In [60]:
df.sort(df.AGE.desc()).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|          12|           1|9999-99-99|     97|        2|121|       2|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   6| 97|  1|
|    2|          12|           1|9999-99-99|     97|        2|120|      98|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

In [62]:
df.sort('AGE', ascending=True).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    1|           2|           2|9999-99-99|      2|        2|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   7|  2|  2|
|    1|           3|           2|9999-99-99|      2|        1|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|



In [63]:
df.sort('AGE', ascending=False).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|          12|           1|9999-99-99|     97|        2|121|       2|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   6| 97|  1|
|    2|          12|           1|9999-99-99|     97|        2|120|      98|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

In [64]:
df.orderBy(df.AGE).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    1|           2|           2|9999-99-99|      2|        2|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   7|  2|  2|
|    1|           3|           2|9999-99-99|      2|        1|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

In [66]:
df.orderBy(df.AGE.desc()).show(5)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|          12|           1|9999-99-99|     97|        2|121|       2|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   6| 97|  1|
|    2|          12|           1|9999-99-99|     97|        2|120|      98|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

In [67]:
data.columns

['country',
 'total_consumption',
 'recorded_consumption',
 'unrecorded_consumption',
 'beer_percentage',
 'wine_percentage',
 'spirits_percentage',
 'other_percentage',
 '2020_projection',
 '2025_projection']

In [68]:
data.orderBy(data.recorded_consumption, ascedning=False).show(5)

+-----------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|    country|total_consumption|recorded_consumption|unrecorded_consumption|beer_percentage|wine_percentage|spirits_percentage|other_percentage|2020_projection|2025_projection|
+-----------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|Afghanistan|              0.2|                 0.0|                   0.2|           null|           null|              null|            null|            0.2|            0.2|
|      Libya|              0.0|                 0.0|                   0.0|           null|           null|              null|            null|            0.0|            0.0|
|      Yemen|              0.1|                 0.0|                   0.0|           89.5|            0.0|             

In [69]:
data.orderBy(data.recorded_consumption, ascedning=True).show(5)

+-----------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|    country|total_consumption|recorded_consumption|unrecorded_consumption|beer_percentage|wine_percentage|spirits_percentage|other_percentage|2020_projection|2025_projection|
+-----------+-----------------+--------------------+----------------------+---------------+---------------+------------------+----------------+---------------+---------------+
|Afghanistan|              0.2|                 0.0|                   0.2|           null|           null|              null|            null|            0.2|            0.2|
|      Libya|              0.0|                 0.0|                   0.0|           null|           null|              null|            null|            0.0|            0.0|
|      Yemen|              0.1|                 0.0|                   0.0|           89.5|            0.0|             

#### pyspark.sql.DataFrame.persist¶
Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed. This can only be used to assign a new storage level if the DataFrame does not have a storage level set yet. If no storage level is specified defaults to (MEMORY_AND_DISK_DESER)

In [70]:
from pyspark.storagelevel import StorageLevel

In [72]:
df = df.persist(storageLevel=StorageLevel.MEMORY_ONLY)

22/12/30 15:34:55 WARN CacheManager: Asked to cache already cached data.


In [73]:
df = df.persist(storageLevel=StorageLevel.MEMORY_ONLY_2)

22/12/30 15:35:18 WARN CacheManager: Asked to cache already cached data.


#### pyspark.sql.DataFrame.printSchema
Prints out the schema in the tree format.

In [74]:
df.printSchema()

root
 |-- USMER: integer (nullable = true)
 |-- MEDICAL_UNIT: integer (nullable = true)
 |-- PATIENT_TYPE: integer (nullable = true)
 |-- DATE_DIED: string (nullable = true)
 |-- INTUBED: integer (nullable = true)
 |-- PNEUMONIA: integer (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- PREGNANT: integer (nullable = true)
 |-- DIABETES: integer (nullable = true)
 |-- COPD: integer (nullable = true)
 |-- ASTHMA: integer (nullable = true)
 |-- INMSUPR: integer (nullable = true)
 |-- HIPERTENSION: integer (nullable = true)
 |-- OTHER_DISEASE: integer (nullable = true)
 |-- CARDIOVASCULAR: integer (nullable = true)
 |-- OBESITY: integer (nullable = true)
 |-- RENAL_CHRONIC: integer (nullable = true)
 |-- TOBACCO: integer (nullable = true)
 |-- CLASIFFICATION_FINAL: integer (nullable = true)
 |-- ICU: integer (nullable = true)
 |-- SEX: integer (nullable = true)



In [75]:
data.printSchema()

root
 |-- country: string (nullable = true)
 |-- total_consumption: double (nullable = true)
 |-- recorded_consumption: double (nullable = true)
 |-- unrecorded_consumption: double (nullable = true)
 |-- beer_percentage: double (nullable = true)
 |-- wine_percentage: double (nullable = true)
 |-- spirits_percentage: double (nullable = true)
 |-- other_percentage: double (nullable = true)
 |-- 2020_projection: double (nullable = true)
 |-- 2025_projection: double (nullable = true)



#### pyspark.sql.DataFrame.randomSplit

    Randomly splits this DataFrame with the provided weights.

    Parameters

        weights list

            list of doubles as weights with which to split the DataFrame. Weights will be normalized if they don’t sum up to 1.0.
        seedint, optional

            The seed for sampling.



In [76]:
df4, df5 = df.randomSplit(weights=[0.10,0.90], seed=26)

In [78]:
df4.show(2)

[Stage 170:>                                                        (0 + 1) / 1]

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    1|           2|           1|9999-99-99|     97|        2|  0|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   6| 97|  2|
|    1|           2|           2|9999-99-99|      2|        1| 10|       2|       2|   2|     2|      2|           2|            1|             1|      2|            2|      2|

                                                                                

In [79]:
df5.show(3)

[Stage 171:>                                                        (0 + 1) / 1]

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    1|           2|           1|9999-99-99|     97|        2|  1|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   6| 97|  2|
|    1|           2|           1|9999-99-99|     97|        2|  2|      97|       2|   2|     2|      2|           2|            2|             2|      2|            2|      2|

                                                                                

In [80]:
df4.count()

                                                                                

104801

In [81]:
df5.count()

                                                                                

943774

#### pyspark.sql.DataFrame.repartition
Returns a new DataFrame partitioned by the given partitioning expressions. The resulting DataFrame is hash partitioned.


        numPartitionsint

            can be an int to specify the target number of partitions or a Column. If it is a Column, it will be used as the first partitioning column. If not specified, the default number of partitions is used.
        colsstr or Column

            partitioning columns.

            Changed in version 1.6: Added optional arguments to specify the partitioning columns. Also made numPartitions optional if partitioning columns are specified.



In [82]:
df = df.repartition(10, df.AGE)

In [83]:
df.rdd.getNumPartitions()

10

In [86]:
df = df.repartition(df.SEX)
df.rdd.getNumPartitions()



2

In [87]:
df = df.repartition(5)
df.rdd.getNumPartitions()



5

In [88]:
df = df.repartition(df.DIABETES)
df.rdd.getNumPartitions()



2

#### pyspark.sql.DataFrame.replace

    Returns a new DataFrame replacing a value with another value. DataFrame.replace() and DataFrameNaFunctions.replace() are aliases of each other. Values to_replace and value must have the same type and can only be numerics, booleans, or strings. Value can have None. When replacing, the new value will be cast to the type of the existing column. For numeric replacements all values to be replaced should have unique floating point representation. In case of conflicts (for example with {42: -1, 42.0: 1}) and arbitrary replacement will be used.

In [89]:
df1.show()

+------+---+
|  Name|Age|
+------+---+
|nilesh| 23|
| pawan| 21|
|ramesh| 23|
|   ram| 21|
+------+---+



                                                                                

In [90]:
df1.replace(to_replace='nilesh', value='purvi').show()

+------+---+
|  Name|Age|
+------+---+
| purvi| 23|
| pawan| 21|
|ramesh| 23|
|   ram| 21|
+------+---+



In [91]:
df1.replace(23, 10).show()

+------+---+
|  Name|Age|
+------+---+
|nilesh| 10|
| pawan| 21|
|ramesh| 10|
|   ram| 21|
+------+---+



#### pyspark.sql.DataFrame.sample¶
Returns a sampled subset of this DataFrame.

In [92]:
df.sample(fraction=0.10, seed=26).show()

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           1|           2|09/06/2020|      1|        2| 55|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   3|  2|  2|
|    1|           3|           2|03/05/2020|      2|        1| 60|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|



#### pyspark.sql.DataFrame.schema¶
Returns the schema of this DataFrame as a pyspark.sql.types.StructType.

In [94]:
df.schema

StructType([StructField('USMER', IntegerType(), True), StructField('MEDICAL_UNIT', IntegerType(), True), StructField('PATIENT_TYPE', IntegerType(), True), StructField('DATE_DIED', StringType(), True), StructField('INTUBED', IntegerType(), True), StructField('PNEUMONIA', IntegerType(), True), StructField('AGE', IntegerType(), True), StructField('PREGNANT', IntegerType(), True), StructField('DIABETES', IntegerType(), True), StructField('COPD', IntegerType(), True), StructField('ASTHMA', IntegerType(), True), StructField('INMSUPR', IntegerType(), True), StructField('HIPERTENSION', IntegerType(), True), StructField('OTHER_DISEASE', IntegerType(), True), StructField('CARDIOVASCULAR', IntegerType(), True), StructField('OBESITY', IntegerType(), True), StructField('RENAL_CHRONIC', IntegerType(), True), StructField('TOBACCO', IntegerType(), True), StructField('CLASIFFICATION_FINAL', IntegerType(), True), StructField('ICU', IntegerType(), True), StructField('SEX', IntegerType(), True)])

In [95]:
df1.schema

StructType([StructField('Name', StringType(), True), StructField('Age', LongType(), True)])

#### pyspark.sql.DataFrame.select
Projects a set of expressions and returns a new DataFrame.

In [96]:
df.select('*').show(3)



+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE| DATE_DIED|INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|    2|           1|           2|09/06/2020|      1|        2| 55|      97|       1|   2|     2|      2|           2|            2|             2|      2|            2|      2|                   3|  2|  2|
|    2|           1|           1|21/06/2020|     97|        2| 68|      97|       1|   2|     2|      2|           1|            2|             2|      2|            2|      2|

                                                                                

In [97]:
df.select(df.AGE, df.SEX).show(6)

+---+---+
|AGE|SEX|
+---+---+
| 55|  2|
| 68|  2|
| 59|  2|
| 30|  2|
| 62|  2|
| 56|  2|
+---+---+
only showing top 6 rows



In [98]:
df.select('AGE','SEX').show(2)

+---+---+
|AGE|SEX|
+---+---+
| 55|  2|
| 68|  2|
+---+---+
only showing top 2 rows



#### pyspark.sql.DataFrame.show
Prints the first n rows to the console.

In [110]:
df.show(5)

Py4JJavaError: An error occurred while calling o598.showString.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.lang.Thread.run(Thread.java:750)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:120)
	at org.apache.spark.SparkContext.defaultParallelism(SparkContext.scala:2524)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.$anonfun$apply$1(CoalesceShufflePartitions.scala:60)
	at scala.runtime.java8.JFunction0$mcI$sp.apply(JFunction0$mcI$sp.java:23)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.apply(CoalesceShufflePartitions.scala:57)
	at org.apache.spark.sql.execution.adaptive.CoalesceShufflePartitions.apply(CoalesceShufflePartitions.scala:33)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$optimizeQueryStage$1(AdaptiveSparkPlanExec.scala:155)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.optimizeQueryStage(AdaptiveSparkPlanExec.scala:154)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.newQueryStage(AdaptiveSparkPlanExec.scala:530)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:491)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:521)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:521)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$createQueryStages$2(AdaptiveSparkPlanExec.scala:521)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.IterableLike.foreach(IterableLike.scala:74)
	at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
	at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.createQueryStages(AdaptiveSparkPlanExec.scala:521)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:235)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:230)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:372)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:345)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at sun.reflect.GeneratedMethodAccessor71.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [101]:
df.show(n=5, truncate=False)

+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|USMER|MEDICAL_UNIT|PATIENT_TYPE|DATE_DIED |INTUBED|PNEUMONIA|AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|SEX|
+-----+------------+------------+----------+-------+---------+---+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+---+
|2    |1           |2           |09/06/2020|1      |2        |55 |97      |1       |2   |2     |2      |2           |2            |2             |2      |2            |2      |3                   |2  |2  |
|2    |1           |1           |21/06/2020|97     |2        |68 |97      |1       |2   |2     |2      |1           |2            |2             |2      |2            |2      |



In [5]:
df.show(n=5, vertical=True)

-RECORD 0--------------------------
 USMER                | 2          
 MEDICAL_UNIT         | 1          
 PATIENT_TYPE         | 1          
 DATE_DIED            | 03/06/2020 
 INTUBED              | 97         
 PNEUMONIA            | 1          
 AGE                  | 72         
 PREGNANT             | 97         
 DIABETES             | 2          
 COPD                 | 2          
 ASTHMA               | 2          
 INMSUPR              | 2          
 HIPERTENSION         | 1          
 OTHER_DISEASE        | 2          
 CARDIOVASCULAR       | 2          
 OBESITY              | 1          
 RENAL_CHRONIC        | 1          
 TOBACCO              | 2          
 CLASIFFICATION_FINAL | 5          
 ICU                  | 97         
 SEX                  | 2          
-RECORD 1--------------------------
 USMER                | 2          
 MEDICAL_UNIT         | 1          
 PATIENT_TYPE         | 2          
 DATE_DIED            | 09/06/2020 
 INTUBED              | 1   

#### pyspark.sql.DataFrame.stat

In [6]:
df.stat.corr('AGE','DIABETES')

0.004421690181632975

#### pyspark.sql.DataFrame.subtract

In [10]:
df1 = sparkSession.createDataFrame([('nilesh',24),('pawan', 21),('nilesh3',24)])
df2 = sparkSession.createDataFrame([('nilesh2',25),('pawan2', 22),('nilesh3',24)])

In [11]:
df1.subtract(df2).show()



+------+---+
|    _1| _2|
+------+---+
| pawan| 21|
|nilesh| 24|
+------+---+



#### pyspark.sql.DataFrame.summary¶
Computes specified statistics for numeric and string columns. Available statistics are: - count - mean - stddev - min - max - arbitrary approximate percentiles specified as a percentage (e.g., 75%)

    If no statistics are given, this function computes count, mean, stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max.

In [3]:
df = sparkSession.createDataFrame([("Bob", 13, 40.3, 150.5), ("Alice", 12, 37.8, 142.3), ("Tom", 11, 44.1, 142.2)],["name", "age", "weight", "height"])

df.select("age", "weight", "height").summary().show()



+-------+----+------------------+-----------------+
|summary| age|            weight|           height|
+-------+----+------------------+-----------------+
|  count|   3|                 3|                3|
|   mean|12.0|40.733333333333334|            145.0|
| stddev| 1.0| 3.172275734127371|4.763402145525822|
|    min|  11|              37.8|            142.2|
|    25%|  11|              37.8|            142.2|
|    50%|  12|              40.3|            142.3|
|    75%|  13|              44.1|            150.5|
|    max|  13|              44.1|            150.5|
+-------+----+------------------+-----------------+



                                                                                

In [14]:
df.select("age", "weight", "height").summary('max','min').show()

+-------+---+------+------+
|summary|age|weight|height|
+-------+---+------+------+
|    max| 13|  44.1| 150.5|
|    min| 11|  37.8| 142.2|
+-------+---+------+------+



In [15]:
df.select("age", "weight", "height").describe().show()

+-------+----+------------------+-----------------+
|summary| age|            weight|           height|
+-------+----+------------------+-----------------+
|  count|   3|                 3|                3|
|   mean|12.0|40.733333333333334|            145.0|
| stddev| 1.0| 3.172275734127371|4.763402145525822|
|    min|  11|              37.8|            142.2|
|    max|  13|              44.1|            150.5|
+-------+----+------------------+-----------------+



#### pyspark.sql.DataFrame.tail¶
Returns the last num rows as a list of Row.

Running tail requires moving data into the application’s driver process, and doing so with a very large num can crash the driver process with OutOfMemoryError.

In [17]:
df.tail(2)

[Row(name='Alice', age=12, weight=37.8, height=142.3),
 Row(name='Tom', age=11, weight=44.1, height=142.2)]

In [18]:
df.head(2)

[Row(name='Bob', age=13, weight=40.3, height=150.5),
 Row(name='Alice', age=12, weight=37.8, height=142.3)]

##### pyspark.sql.DataFrame.take¶
Returns the first num rows as a list of Row.

In [19]:
df.take(4)

[Row(name='Bob', age=13, weight=40.3, height=150.5),
 Row(name='Alice', age=12, weight=37.8, height=142.3),
 Row(name='Tom', age=11, weight=44.1, height=142.2)]

In [4]:
df.take(2)

[Row(name='Bob', age=13, weight=40.3, height=150.5),
 Row(name='Alice', age=12, weight=37.8, height=142.3)]

#### pyspark.sql.DataFrame.toDF

In [7]:
df.select('age').toDF('AGE').show()

+---+
|AGE|
+---+
| 13|
| 12|
| 11|
+---+



#### pyspark.sql.DataFrame.toJSON
Converts a DataFrame into a RDD of string.

Each row is turned into a JSON document as one element in the returned RDD.

In [9]:
df.toJSON().first()

                                                                                

'{"name":"Bob","age":13,"weight":40.3,"height":150.5}'

In [11]:
df.toJSON().collect()

['{"name":"Bob","age":13,"weight":40.3,"height":150.5}',
 '{"name":"Alice","age":12,"weight":37.8,"height":142.3}',
 '{"name":"Tom","age":11,"weight":44.1,"height":142.2}']

#### pyspark.sql.DataFrame.toPandas

    Returns the contents of this DataFrame as Pandas pandas.DataFrame.

    This is only available if Pandas is installed and available.

    Notes: This method should only be used if the resulting Pandas pandas.DataFrame is expected to be small, as all the data is loaded into the driver’s memory.

In [12]:
df.toPandas()

Unnamed: 0,name,age,weight,height
0,Bob,13,40.3,150.5
1,Alice,12,37.8,142.3
2,Tom,11,44.1,142.2


#### pyspark.sql.DataFrame.transform

In [23]:
def lowerf(df1):
    return df1.select(df1.age+10)

df.transform(lowerf).show()

+----------+
|(age + 10)|
+----------+
|        23|
|        22|
|        21|
+----------+



In [25]:
df.transform(lambda data: data.select(data.height-10)).show()

+-------------+
|(height - 10)|
+-------------+
|        140.5|
|        132.3|
|        132.2|
+-------------+




#### PySpark Union and UnionAll Explained

PySpark union() and unionAll() transformations are used to merge two or more DataFrame’s of the same schema or structure. In this PySpark article, I will explain both union transformations with PySpark examples.

Dataframe union() – union() method of the DataFrame is used to merge two DataFrame’s of the same structure/schema. If schemas are not the same it returns an error.

DataFrame unionAll() – unionAll() is deprecated since Spark “2.0.0” version and replaced with union().

Note: In other SQL languages, Union eliminates the duplicates but UnionAll merges two datasets including duplicate records. But, in PySpark both behave the same and recommend using DataFrame duplicate() function to remove duplicate rows.

In [26]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
df = sparkSession.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [27]:
simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

df2 = sparkSession.createDataFrame(data = simpleData2, schema = columns2)

df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [28]:
df.union(df2).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [29]:
df.unionAll(df2).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [30]:
df.union(df2).distinct().show()



+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
+-------------+----------+-----+------+---+-----+



                                                                                

#### pyspark.sql.DataFrame.unionByName

    Returns a new DataFrame containing union of rows in this and another DataFrame.

    This is different from both UNION ALL and UNION DISTINCT in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by distinct().

    Examples

    The difference between this function and union() is that this function resolves columns by name (not by position):

In [32]:
df1 = sparkSession.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = sparkSession.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
df1.unionByName(df2).show()

+----+----+----+
|col0|col1|col2|
+----+----+----+
|   1|   2|   3|
|   6|   4|   5|
+----+----+----+



When the parameter allowMissingColumns is True, the set of column names in this and other DataFrame can differ; missing columns will be filled with null. Further, the missing columns of this DataFrame will be added at the end in the schema of the union result:

In [34]:
df1 = sparkSession.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
df2 = sparkSession.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
df1.unionByName(df2, allowMissingColumns=True).show()

+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
|   1|   2|   3|null|
|null|   4|   5|   6|
+----+----+----+----+



#### pyspark.sql.DataFrame.unpersist
Marks the DataFrame as non-persistent, and remove all blocks for it from memory and disk.

In [35]:
df.unpersist()

DataFrame[employee_name: string, department: string, state: string, salary: bigint, age: bigint, bonus: bigint]

#### pyspark.sql.DataFrame.withColumn

Returns a new DataFrame by adding a column or replacing the existing column that has the same name.

The column expression must be an expression over this DataFrame; attempting to add a column from some other DataFrame will raise an error.

    Parameters

        col Name str

            string, name of the new column.
        col Column

            a Column expression for the new column.

Notes: This method introduces a projection internally. Therefore, calling it multiple times, for instance, via loops in order to add multiple columns can generate big plans which can cause performance issues and even StackOverflowException. To avoid this, use select() with the multiple columns at once.

In [36]:
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



In [37]:
df.withColumn('total_salary', df.salary+df.bonus).show()

+-------------+----------+-----+------+---+-----+------------+
|employee_name|department|state|salary|age|bonus|total_salary|
+-------------+----------+-----+------+---+-----+------------+
|        James|     Sales|   NY| 90000| 34|10000|      100000|
|      Michael|     Sales|   NY| 86000| 56|20000|      106000|
|       Robert|     Sales|   CA| 81000| 30|23000|      104000|
|        Maria|   Finance|   CA| 90000| 24|23000|      113000|
+-------------+----------+-----+------+---+-----+------------+



In [39]:
df.withColumns({'net_income':df.salary+df.bonus-5000, 'total_salary': df.salary+df.bonus}).show()

+-------------+----------+-----+------+---+-----+----------+------------+
|employee_name|department|state|salary|age|bonus|net_income|total_salary|
+-------------+----------+-----+------+---+-----+----------+------------+
|        James|     Sales|   NY| 90000| 34|10000|     95000|      100000|
|      Michael|     Sales|   NY| 86000| 56|20000|    101000|      106000|
|       Robert|     Sales|   CA| 81000| 30|23000|     99000|      104000|
|        Maria|   Finance|   CA| 90000| 24|23000|    108000|      113000|
+-------------+----------+-----+------+---+-----+----------+------------+



#### pyspark.sql.DataFrame.withColumnRenamed
Returns a new DataFrame by renaming an existing column. This is a no-op if schema doesn’t contain the given column name.

In [40]:
df.withColumnRenamed('department','Dept').show()

+-------------+-------+-----+------+---+-----+
|employee_name|   Dept|state|salary|age|bonus|
+-------------+-------+-----+------+---+-----+
|        James|  Sales|   NY| 90000| 34|10000|
|      Michael|  Sales|   NY| 86000| 56|20000|
|       Robert|  Sales|   CA| 81000| 30|23000|
|        Maria|Finance|   CA| 90000| 24|23000|
+-------------+-------+-----+------+---+-----+

