In [93]:
import copy
import findspark
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, MapType
from pyspark.sql.functions import col, lit, explode, split , array, array_contains, udf, map_values, when, count, min, max, avg

findspark.init()

In [2]:
spark = SparkSession.builder.appName('Spark SQL').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/27 09:29:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
left_df = spark.createDataFrame(
    data=[(1,'ali'),(2, 'akram'),(3, 'oveys'),(4, 'ala'),(5, 'omid'),(6, 'mobin')], schema=['id', 'name']
)

right_df = spark.createDataFrame(
        data=[(8,'mehdi'),(9, 'simin'),(3, 'oveys'),(4, 'ala'),(10, 'aida')], schema=['id', 'name']
)

In [4]:
# options are inner, outer, left, right
left_df.join(right_df, on=['id'], how='left').show()

                                                                                

+---+-----+-----+
| id| name| name|
+---+-----+-----+
|  1|  ali| NULL|
|  2|akram| NULL|
|  3|oveys|oveys|
|  4|  ala|  ala|
|  5| omid| NULL|
|  6|mobin| NULL|
+---+-----+-----+



In [5]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
])

rdd1 = spark.sparkContext.parallelize([(1,'a'),(2, 'b')])
rdd2 = spark.sparkContext.parallelize([(1,'d'),(3, 'b')])
rdd_res = rdd1.union(rdd2)

spark.createDataFrame(rdd_res, schema).show()



+---+----+
| id|name|
+---+----+
|  1|   a|
|  2|   b|
|  1|   d|
|  3|   b|
+---+----+



In [6]:
titanic_shcema = StructType()\
    .add('pid', 'integer') \
    .add('survived', 'integer') \
    .add('class', 'integer') \
    .add('name', 'string') \
    .add('sex', 'string') \
    .add('age', 'integer') \
    .add('sib', 'integer') \
    .add('parch', 'integer') \
    .add('tikcet', 'string') \
    .add('fare', 'float') \
    .add('cabin', 'string') \
    .add('embarked', 'string') \



titanic_df = spark.read.option('header', 'true').schema(titanic_shcema).csv('titanic.csv')
titanic_df.show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|pid|survived|class|                name|   sex| age|sib|parch|          tikcet|   fare|cabin|embarked|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|  1|    0|       A/5 21171|   7.25| NULL|       S|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|  1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|  0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|  1|    0|          113803|   53.1| C123|       S|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|  0|    0|          373450|   8.05| NULL|       S|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|  0|    0|          330877| 8.4583| NULL|       Q|
|  7|       0|    1|McCarthy, Mr. Tim...|  male|  54|  0|    0| 

24/03/27 09:29:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


In [7]:
titanic_1 = titanic_df.alias('titanic_1')

titanic_1.show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|pid|survived|class|                name|   sex| age|sib|parch|          tikcet|   fare|cabin|embarked|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|  1|    0|       A/5 21171|   7.25| NULL|       S|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|  1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|  0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|  1|    0|          113803|   53.1| C123|       S|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|  0|    0|          373450|   8.05| NULL|       S|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|  0|    0|          330877| 8.4583| NULL|       Q|
|  7|       0|    1|McCarthy, Mr. Tim...|  male|  54|  0|    0| 

24/03/27 09:29:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


In [8]:
titanic_1.select('cabin').show()

+-----+
|cabin|
+-----+
| NULL|
|  C85|
| NULL|
| C123|
| NULL|
| NULL|
|  E46|
| NULL|
| NULL|
| NULL|
|   G6|
| C103|
| NULL|
| NULL|
| NULL|
| NULL|
| NULL|
| NULL|
| NULL|
| NULL|
+-----+
only showing top 20 rows



In [9]:
titanic_1.filter(titanic_1.age > 35).count()

210

In [10]:
titanic_1.select('name', 'age', 'survived').filter('survived = 0').show()

+--------------------+----+--------+
|                name| age|survived|
+--------------------+----+--------+
|Braund, Mr. Owen ...|  22|       0|
|Allen, Mr. Willia...|  35|       0|
|    Moran, Mr. James|NULL|       0|
|McCarthy, Mr. Tim...|  54|       0|
|Palsson, Master. ...|   2|       0|
|Saundercock, Mr. ...|  20|       0|
|Andersson, Mr. An...|  39|       0|
|Vestrom, Miss. Hu...|  14|       0|
|Rice, Master. Eugene|   2|       0|
|Vander Planke, Mr...|  31|       0|
|Fynney, Mr. Joseph J|  35|       0|
|Palsson, Miss. To...|   8|       0|
|Emir, Mr. Farred ...|NULL|       0|
|Fortune, Mr. Char...|  19|       0|
| Todoroff, Mr. Lalio|NULL|       0|
|Uruchurtu, Don. M...|  40|       0|
|Wheadon, Mr. Edwa...|  66|       0|
|Meyer, Mr. Edgar ...|  28|       0|
|Holverson, Mr. Al...|  42|       0|
|Cann, Mr. Ernest ...|  21|       0|
+--------------------+----+--------+
only showing top 20 rows



In [11]:
titanic_1.select('name', 'survived').where('age < 30 and survived=1').show()

+--------------------+--------+
|                name|survived|
+--------------------+--------+
|Heikkinen, Miss. ...|       1|
|Johnson, Mrs. Osc...|       1|
|Nasser, Mrs. Nich...|       1|
|Sandstrom, Miss. ...|       1|
|"McGowan, Miss. A...|       1|
|Sloper, Mr. Willi...|       1|
|Nicola-Yarred, Mi...|       1|
|Laroche, Miss. Si...|       1|
|Devaney, Miss. Ma...|       1|
|Faunthorpe, Mrs. ...|       1|
|   Rugg, Miss. Emily|       1|
|West, Miss. Const...|       1|
|Nye, Mrs. (Elizab...|       1|
|Andersson, Miss. ...|       1|
|Sheerlinck, Mr. J...|       1|
| Ilett, Miss. Bertha|       1|
|Fortune, Miss. Ma...|       1|
|Greenfield, Mr. W...|       1|
|Salkjelsvik, Miss...|       1|
|Nicola-Yarred, Ma...|       1|
+--------------------+--------+
only showing top 20 rows



In [12]:
titanic_1.sort(['age', 'name'], ascending=[False, True]).show()

+---+--------+-----+--------------------+------+---+---+-----+-----------+-------+-----------+--------+
|pid|survived|class|                name|   sex|age|sib|parch|     tikcet|   fare|      cabin|embarked|
+---+--------+-----+--------------------+------+---+---+-----+-----------+-------+-----------+--------+
|631|       1|    1|Barkworth, Mr. Al...|  male| 80|  0|    0|      27042|   30.0|        A23|       S|
|852|       0|    3| Svensson, Mr. Johan|  male| 74|  0|    0|     347060|  7.775|       NULL|       S|
|494|       0|    1|Artagaveytia, Mr....|  male| 71|  0|    0|   PC 17609|49.5042|       NULL|       C|
| 97|       0|    1|Goldschmidt, Mr. ...|  male| 71|  0|    0|   PC 17754|34.6542|         A5|       C|
|746|       0|    1|Crosby, Capt. Edw...|  male| 70|  1|    1|  WE/P 5735|   71.0|        B22|       S|
|673|       0|    2|Mitchell, Mr. Hen...|  male| 70|  0|    0| C.A. 24580|   10.5|       NULL|       S|
| 34|       0|    2|Wheadon, Mr. Edwa...|  male| 66|  0|    0| C

24/03/27 09:29:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


In [13]:
titanic_1.describe().show()

24/03/27 09:29:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/03/27 09:29:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv
[Stage 18:>                                                         (0 + 1) / 1]

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|              pid|           survived|             class|                name|   sex|               age|               sib|              parch|            tikcet|             fare|cabin|embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               689|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL|29.847605224963715|0.5230078563411896|0.38159371492704824|260318.54916792738|32.204208

                                                                                

In [14]:
titanic_1.columns

['pid',
 'survived',
 'class',
 'name',
 'sex',
 'age',
 'sib',
 'parch',
 'tikcet',
 'fare',
 'cabin',
 'embarked']

In [16]:
titanic_1.printSchema()

root
 |-- pid: integer (nullable = true)
 |-- survived: integer (nullable = true)
 |-- class: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- sib: integer (nullable = true)
 |-- parch: integer (nullable = true)
 |-- tikcet: string (nullable = true)
 |-- fare: float (nullable = true)
 |-- cabin: string (nullable = true)
 |-- embarked: string (nullable = true)



In [19]:
help(titanic_1.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Returns
    -------
    :class:`DataFrame`
        DataFrame with new or replaced column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to 

# withColumn()

##### It's a transformation operation which is used for 
##### 1. adding a new column
##### 2. changing the dtype of an existing column
##### 3. changing the value of an existing column

In [26]:
# change dtype
titanic_1.withColumn(colName='sib', col=titanic_1.sib.cast('float')).show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|pid|survived|class|                name|   sex| age|sib|parch|          tikcet|   fare|cabin|embarked|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|1.0|    0|       A/5 21171|   7.25| NULL|       S|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|1.0|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|0.0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|1.0|    0|          113803|   53.1| C123|       S|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|0.0|    0|          373450|   8.05| NULL|       S|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|0.0|    0|          330877| 8.4583| NULL|       Q|
|  7|       0|    1|McCarthy, Mr. Tim...|  male|  54|0.0|    0| 

24/03/27 09:45:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


In [27]:
# data manipulation
titanic_1.withColumn('sib', titanic_1.sib * 1).show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|pid|survived|class|                name|   sex| age|sib|parch|          tikcet|   fare|cabin|embarked|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|  1|    0|       A/5 21171|   7.25| NULL|       S|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|  1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|  0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|  1|    0|          113803|   53.1| C123|       S|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|  0|    0|          373450|   8.05| NULL|       S|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|  0|    0|          330877| 8.4583| NULL|       Q|
|  7|       0|    1|McCarthy, Mr. Tim...|  male|  54|  0|    0| 

24/03/27 09:45:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


In [28]:
# creating new col
titanic_1.withColumn(colName='nationality', col=lit('XYZ')).show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+-----------+
|pid|survived|class|                name|   sex| age|sib|parch|          tikcet|   fare|cabin|embarked|nationality|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+-----------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|  1|    0|       A/5 21171|   7.25| NULL|       S|        XYZ|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|  1|    0|        PC 17599|71.2833|  C85|       C|        XYZ|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|  0|    0|STON/O2. 3101282|  7.925| NULL|       S|        XYZ|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|  1|    0|          113803|   53.1| C123|       S|        XYZ|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|  0|    0|          373450|   8.05| NULL|       S|        XYZ|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|  0|    0|         

24/03/27 09:45:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


# withColumnRename()

In [31]:
titanic_1.withColumnRenamed(existing='tikcet', new='pnr').show()

+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|pid|survived|class|                name|   sex| age|sib|parch|             pnr|   fare|cabin|embarked|
+---+--------+-----+--------------------+------+----+---+-----+----------------+-------+-----+--------+
|  1|       0|    3|Braund, Mr. Owen ...|  male|  22|  1|    0|       A/5 21171|   7.25| NULL|       S|
|  2|       1|    1|Cumings, Mrs. Joh...|female|  38|  1|    0|        PC 17599|71.2833|  C85|       C|
|  3|       1|    3|Heikkinen, Miss. ...|female|  26|  0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|  4|       1|    1|Futrelle, Mrs. Ja...|female|  35|  1|    0|          113803|   53.1| C123|       S|
|  5|       0|    3|Allen, Mr. Willia...|  male|  35|  0|    0|          373450|   8.05| NULL|       S|
|  6|       0|    3|    Moran, Mr. James|  male|NULL|  0|    0|          330877| 8.4583| NULL|       Q|
|  7|       0|    1|McCarthy, Mr. Tim...|  male|  54|  0|    0| 

24/03/27 09:50:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
 Schema: pid, survived, class, name, sex, age, sib, parch, tikcet, fare, cabin, embarked
Expected: pid but found: PassengerId
CSV file: file:///Users/hso/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/src/titanic.csv


# explode()

In [33]:
data = [(1, 'Oveys', ['java', 'python']), (2, 'Mohammad', ['Java', 'Python', 'Docker']), (3, 'Ala',['ObjectiveC', 'Swift', 'swiftUI', 'iOS'])]
devs = spark.createDataFrame(data, ['id', 'name', 'skills'])
devs.show()
devs.printSchema()

+---+--------+--------------------+
| id|    name|              skills|
+---+--------+--------------------+
|  1|   Oveys|      [java, python]|
|  2|Mohammad|[Java, Python, Do...|
|  3|     Ala|[ObjectiveC, Swif...|
+---+--------+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [36]:
devs.withColumn('skills', explode(devs.skills)).show()

+---+--------+----------+
| id|    name|    skills|
+---+--------+----------+
|  1|   Oveys|      java|
|  1|   Oveys|    python|
|  2|Mohammad|      Java|
|  2|Mohammad|    Python|
|  2|Mohammad|    Docker|
|  3|     Ala|ObjectiveC|
|  3|     Ala|     Swift|
|  3|     Ala|   swiftUI|
|  3|     Ala|       iOS|
+---+--------+----------+



# split()

In [38]:
data = [(1, 'Oveys', 'java,python'), (2, 'Mohammad', 'Java,Python,Docker'), (3, 'Ala','ObjectiveC,Swift,swiftUI,iOS')]
devs = spark.createDataFrame(data, ['id', 'name', 'skills'])
devs.show()
devs.printSchema()

+---+--------+--------------------+
| id|    name|              skills|
+---+--------+--------------------+
|  1|   Oveys|         java,python|
|  2|Mohammad|  Java,Python,Docker|
|  3|     Ala|ObjectiveC,Swift,...|
+---+--------+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: string (nullable = true)



In [39]:
devs.withColumn('skills', split('skills', ',')).show()

+---+--------+--------------------+
| id|    name|              skills|
+---+--------+--------------------+
|  1|   Oveys|      [java, python]|
|  2|Mohammad|[Java, Python, Do...|
|  3|     Ala|[ObjectiveC, Swif...|
+---+--------+--------------------+



# array()

In [43]:
data = [(1, 'Oveys', 'java','python'), (2, 'Mohammad', 'Java','Docker'), (3, 'Ala','ObjectiveC','Swift')]
devs = spark.createDataFrame(data, ['id', 'name', 'skill_1', 'skill_2'])
devs.show()
devs.printSchema()

+---+--------+----------+-------+
| id|    name|   skill_1|skill_2|
+---+--------+----------+-------+
|  1|   Oveys|      java| python|
|  2|Mohammad|      Java| Docker|
|  3|     Ala|ObjectiveC|  Swift|
+---+--------+----------+-------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skill_1: string (nullable = true)
 |-- skill_2: string (nullable = true)



In [44]:
devs.withColumn('skills', array('skill_1', 'skill_2')).show()

+---+--------+----------+-------+-------------------+
| id|    name|   skill_1|skill_2|             skills|
+---+--------+----------+-------+-------------------+
|  1|   Oveys|      java| python|     [java, python]|
|  2|Mohammad|      Java| Docker|     [Java, Docker]|
|  3|     Ala|ObjectiveC|  Swift|[ObjectiveC, Swift]|
+---+--------+----------+-------+-------------------+



# array_contains()

In [45]:
data = [(1, 'Oveys', ['java', 'python']), (2, 'Mohammad', ['Java', 'Python', 'Docker']), (3, 'Ala',['ObjectiveC', 'Swift', 'swiftUI', 'iOS'])]
devs = spark.createDataFrame(data, ['id', 'name', 'skills'])
devs.show()
devs.printSchema()

+---+--------+--------------------+
| id|    name|              skills|
+---+--------+--------------------+
|  1|   Oveys|      [java, python]|
|  2|Mohammad|[Java, Python, Do...|
|  3|     Ala|[ObjectiveC, Swif...|
+---+--------+--------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [50]:
def lower_skills(skills):
    return [skill.lower() for skill in skills]


lower_skills_udf = udf(lower_skills, ArrayType(StringType()))

devs.withColumn('skills', lower_skills_udf('skills')).withColumn('pythonist', array_contains('skills', 'python')).show()

                                                                                

+---+--------+--------------------+---------+
| id|    name|              skills|pythonist|
+---+--------+--------------------+---------+
|  1|   Oveys|      [java, python]|     true|
|  2|Mohammad|[java, python, do...|     true|
|  3|     Ala|[objectivec, swif...|    false|
+---+--------+--------------------+---------+



# MapType() and explode() on it:

In [53]:
data = [(1, 'Oveys', {'age':31, 'height':175}), (2, 'Mohammad', {'age':32, 'height':165})]
devs = spark.createDataFrame(data, ['id', 'name', 'properties'])
devs.show(truncate=False)
devs.printSchema()

+---+--------+--------------------------+
|id |name    |properties                |
+---+--------+--------------------------+
|1  |Oveys   |{age -> 31, height -> 175}|
|2  |Mohammad|{age -> 32, height -> 165}|
+---+--------+--------------------------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: long (valueContainsNull = true)



In [54]:
devs.select('id', 'name', 'properties', explode('properties')).show(truncate=False)

+---+--------+--------------------------+------+-----+
|id |name    |properties                |key   |value|
+---+--------+--------------------------+------+-----+
|1  |Oveys   |{age -> 31, height -> 175}|age   |31   |
|1  |Oveys   |{age -> 31, height -> 175}|height|175  |
|2  |Mohammad|{age -> 32, height -> 165}|age   |32   |
|2  |Mohammad|{age -> 32, height -> 165}|height|165  |
+---+--------+--------------------------+------+-----+



In [57]:
devs.withColumn('values', map_values('properties')).show(truncate=False)

+---+--------+--------------------------+---------+
|id |name    |properties                |values   |
+---+--------+--------------------------+---------+
|1  |Oveys   |{age -> 31, height -> 175}|[31, 175]|
|2  |Mohammad|{age -> 32, height -> 165}|[32, 165]|
+---+--------+--------------------------+---------+



# when() otherwise()

In [58]:
data = [(1, 'Oveys', 'm'), (2, 'Mohammad', 'm'), (3, 'Ala','f')]
devs = spark.createDataFrame(data, ['id', 'name', 'gender'])
devs.show()
devs.printSchema()

+---+--------+------+
| id|    name|gender|
+---+--------+------+
|  1|   Oveys|     m|
|  2|Mohammad|     m|
|  3|     Ala|     f|
+---+--------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)



In [62]:
devs.select(
    'id',
    'name',
    when(devs.gender=='m', value=True) \
    .when(devs.gender=='f', value=False) \
    .otherwise(value=None).alias('sex')
).show()

+---+--------+-----+
| id|    name|  sex|
+---+--------+-----+
|  1|   Oveys| true|
|  2|Mohammad| true|
|  3|     Ala|false|
+---+--------+-----+



In [63]:
devs.withColumn(
    'gender',
    when(devs.gender=='m', value=True) \
    .when(devs.gender=='f', value=False) \
    .otherwise(value=None).alias('sex')
).show()

+---+--------+------+
| id|    name|gender|
+---+--------+------+
|  1|   Oveys|  true|
|  2|Mohammad|  true|
|  3|     Ala| false|
+---+--------+------+



# column's functions

In [64]:
data = [(1, 'Oveys', 4000), (2, 'Mohammad', 5000), (3, 'Ala',5500)]
devs = spark.createDataFrame(data, ['id', 'name', 'salary'])
devs.show()
devs.printSchema()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|   Oveys|  4000|
|  2|Mohammad|  5000|
|  3|     Ala|  5500|
+---+--------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [66]:
# alias()
devs.select(devs.id.alias('emp_id'), devs.name, devs.salary).show()

+------+--------+------+
|emp_id|    name|salary|
+------+--------+------+
|     1|   Oveys|  4000|
|     2|Mohammad|  5000|
|     3|     Ala|  5500|
+------+--------+------+



In [67]:
# asc() and desc()
devs.sort(devs.salary.asc()).show()
devs.sort(devs.salary.desc()).show()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|   Oveys|  4000|
|  2|Mohammad|  5000|
|  3|     Ala|  5500|
+---+--------+------+

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  3|     Ala|  5500|
|  2|Mohammad|  5000|
|  1|   Oveys|  4000|
+---+--------+------+



In [68]:
# cast()
devs.select('id', 'name', devs.salary.cast('float')).show()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|   Oveys|4000.0|
|  2|Mohammad|5000.0|
|  3|     Ala|5500.0|
+---+--------+------+



In [69]:
# like()
devs.filter(devs.name.like('A%')).show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  3| Ala|  5500|
+---+----+------+



# filter() and where()

In [70]:
data = [(1, 'Oveys', 4000), (2, 'Mohammad', 5000), (3, 'Ala',5500)]
devs = spark.createDataFrame(data, ['id', 'name', 'salary'])
devs.show()
devs.printSchema()

+---+--------+------+
| id|    name|salary|
+---+--------+------+
|  1|   Oveys|  4000|
|  2|Mohammad|  5000|
|  3|     Ala|  5500|
+---+--------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [71]:
devs.filter('salary == 4000').show()

+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Oveys|  4000|
+---+-----+------+



In [72]:
devs.filter(devs.salary == 4000).show()


+---+-----+------+
| id| name|salary|
+---+-----+------+
|  1|Oveys|  4000|
+---+-----+------+



In [74]:
devs.where((devs.salary >= 5000) & (devs.name.like('A%'))).show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  3| Ala|  5500|
+---+----+------+



# distinct() & drop_duplicates()

In [79]:
data = [(1, 'Oveys','M', 4000), (2, 'Mohammad', 'M', 5000), (2, 'Mohammad', 'M', 5000), (3, 'Ala', 'F', 5500)]
devs = spark.createDataFrame(data, ['id', 'name', 'gender', 'salary'])
devs.show()
devs.printSchema()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|   Oveys|     M|  4000|
|  2|Mohammad|     M|  5000|
|  2|Mohammad|     M|  5000|
|  3|     Ala|     F|  5500|
+---+--------+------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [80]:
devs.distinct().show()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|   Oveys|     M|  4000|
|  2|Mohammad|     M|  5000|
|  3|     Ala|     F|  5500|
+---+--------+------+------+



In [81]:
# drops if there is a duplicate in rows

devs.drop_duplicates().show()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|   Oveys|     M|  4000|
|  2|Mohammad|     M|  5000|
|  3|     Ala|     F|  5500|
+---+--------+------+------+



In [82]:
# drops if there is a duplicate in gender column

devs.drop_duplicates(subset=['gender']).show()

+---+-----+------+------+
| id| name|gender|salary|
+---+-----+------+------+
|  3|  Ala|     F|  5500|
|  1|Oveys|     M|  4000|
+---+-----+------+------+



# union()

In [86]:
data = [(1, 'Oveys','M', 4000), (2, 'Mohammad', 'M', 5000), (2, 'Mohammad', 'M', 5000), (3, 'Ala', 'F', 5500)]
devs1 = spark.createDataFrame(data, ['id', 'name', 'gender', 'salary'])
devs.show()
devs.printSchema()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|   Oveys|     M|  4000|
|  2|Mohammad|     M|  5000|
|  2|Mohammad|     M|  5000|
|  3|     Ala|     F|  5500|
+---+--------+------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [87]:
data = [(1, 'Ali','M', 4000), (2, 'Jafar', 'M', 5000), (2, 'Mohammad', 'M', 5000), (2, 'Abas', 'M', 5000), (3, 'Mona', 'F', 5500)]
devs2 = spark.createDataFrame(data, ['id', 'name', 'gender', 'salary'])
devs2.show()
devs2.printSchema()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|     Ali|     M|  4000|
|  2|   Jafar|     M|  5000|
|  2|Mohammad|     M|  5000|
|  2|    Abas|     M|  5000|
|  3|    Mona|     F|  5500|
+---+--------+------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [89]:
devs1.union(devs2).distinct().show()



+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|   Oveys|     M|  4000|
|  2|Mohammad|     M|  5000|
|  3|     Ala|     F|  5500|
|  1|     Ali|     M|  4000|
|  2|   Jafar|     M|  5000|
|  2|    Abas|     M|  5000|
|  3|    Mona|     F|  5500|
+---+--------+------+------+



                                                                                

# groupBy() and agg()

In [92]:
data = [
    (1, 'Ali','M', 4000),
    (2, 'Jafar', 'M', 5000),
    (7, 'Mohammad', 'M', 5700),
    (2, 'Abas', 'M', 6500),
    (3, 'Mona', 'F', 6200)]
devs = spark.createDataFrame(data, ['id', 'name', 'gender', 'salary'])
devs.show()
devs.printSchema()

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|     Ali|     M|  4000|
|  2|   Jafar|     M|  5000|
|  7|Mohammad|     M|  5700|
|  2|    Abas|     M|  6500|
|  3|    Mona|     F|  6200|
+---+--------+------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [94]:
devs.groupby(devs.gender).agg(
    count('*').alias('emp_count'),
    min('salary').alias('min_salary'),
    max('salary').alias('max_salary'),
    avg('salary').alias('avg_salary')
).show()

+------+---------+----------+----------+----------+
|gender|emp_count|min_salary|max_salary|avg_salary|
+------+---------+----------+----------+----------+
|     M|        4|      4000|      6500|    5300.0|
|     F|        1|      6200|      6200|    6200.0|
+------+---------+----------+----------+----------+



# unionByName()

In [95]:
data = [(1, 'Ali',20, 4000), (2, 'Jafar', 21, 5000), (2, 'Mohammad', 23, 5000), (2, 'Abas', 45, 5000), (3, 'Mona', 19, 5500)]
devs1 = spark.createDataFrame(data, ['id', 'name', 'age', 'salary'])
devs1.show()
devs1.printSchema()

data = [(1, 'Ali','M', 4000), (2, 'Jafar', 'M', 5000), (2, 'Mohammad', 'M', 5000), (2, 'Abas', 'M', 5000), (3, 'Mona', 'F', 5500)]
devs2 = spark.createDataFrame(data, ['id', 'name', 'gender', 'salary'])
devs2.show()
devs2.printSchema()

+---+--------+---+------+
| id|    name|age|salary|
+---+--------+---+------+
|  1|     Ali| 20|  4000|
|  2|   Jafar| 21|  5000|
|  2|Mohammad| 23|  5000|
|  2|    Abas| 45|  5000|
|  3|    Mona| 19|  5500|
+---+--------+---+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)

+---+--------+------+------+
| id|    name|gender|salary|
+---+--------+------+------+
|  1|     Ali|     M|  4000|
|  2|   Jafar|     M|  5000|
|  2|Mohammad|     M|  5000|
|  2|    Abas|     M|  5000|
|  3|    Mona|     F|  5500|
+---+--------+------+------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [98]:
devs1.unionByName(devs2, allowMissingColumns=True).show()

+---+--------+----+------+------+
| id|    name| age|salary|gender|
+---+--------+----+------+------+
|  1|     Ali|  20|  4000|  NULL|
|  2|   Jafar|  21|  5000|  NULL|
|  2|Mohammad|  23|  5000|  NULL|
|  2|    Abas|  45|  5000|  NULL|
|  3|    Mona|  19|  5500|  NULL|
|  1|     Ali|NULL|  4000|     M|
|  2|   Jafar|NULL|  5000|     M|
|  2|Mohammad|NULL|  5000|     M|
|  2|    Abas|NULL|  5000|     M|
|  3|    Mona|NULL|  5500|     F|
+---+--------+----+------+------+



# self join

In [105]:
data = [(1, 'Ali',0), (2, 'Jafar', 1), (3, 'Mohammad', 1), (4, 'Abas', 3), (5, 'Mona', 2)]
devs = spark.createDataFrame(data, ['id', 'name', 'manager'])
devs.show()
devs.printSchema()

+---+--------+-------+
| id|    name|manager|
+---+--------+-------+
|  1|     Ali|      0|
|  2|   Jafar|      1|
|  3|Mohammad|      1|
|  4|    Abas|      3|
|  5|    Mona|      2|
+---+--------+-------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- manager: long (nullable = true)



In [107]:
devs.alias('emp').join(
    devs.alias('mngr'),
    col('emp.manager') == col('mngr.id'),
    how='left'
).show()

[Stage 199:>                                                        (0 + 8) / 8]

+---+--------+-------+----+--------+-------+
| id|    name|manager|  id|    name|manager|
+---+--------+-------+----+--------+-------+
|  1|     Ali|      0|NULL|    NULL|   NULL|
|  2|   Jafar|      1|   1|     Ali|      0|
|  3|Mohammad|      1|   1|     Ali|      0|
|  4|    Abas|      3|   3|Mohammad|      1|
|  5|    Mona|      2|   2|   Jafar|      1|
+---+--------+-------+----+--------+-------+



                                                                                

# pivot()

In [116]:
data = [(1, 'Ali','M', 'IT'), (2, 'Jafar', 'F', 'HR'), (3, 'Mohammad', 'M', 'HR'), (4, 'Abas', 'F', 'IT'), (5, 'Mona', 'T', 'FIN')]
devs = spark.createDataFrame(data, ['id', 'name', 'gender', 'department'])
devs.show()
devs.printSchema()

+---+--------+------+----------+
| id|    name|gender|department|
+---+--------+------+----------+
|  1|     Ali|     M|        IT|
|  2|   Jafar|     F|        HR|
|  3|Mohammad|     M|        HR|
|  4|    Abas|     F|        IT|
|  5|    Mona|     T|       FIN|
+---+--------+------+----------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- department: string (nullable = true)



In [117]:
devs.groupBy('department').pivot('gender').count().show()

+----------+----+----+----+
|department|   F|   M|   T|
+----------+----+----+----+
|        HR|   1|   1|NULL|
|       FIN|NULL|NULL|   1|
|        IT|   1|   1|NULL|
+----------+----+----+----+



In [119]:
# we can choose which value of the pivots can be showed as columns
devs.groupBy('department').pivot('gender', ['M', 'F']).count().show()


+----------+----+----+
|department|   M|   F|
+----------+----+----+
|        HR|   1|   1|
|       FIN|NULL|NULL|
|        IT|   1|   1|
+----------+----+----+



24/03/27 17:46:08 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1748627 ms exceeds timeout 120000 ms
24/03/27 17:46:08 WARN SparkContext: Killing executors is not supported by current scheduler.
24/03/27 17:46:08 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [34]:
# titanic_1.fillna(value='GG', subset='cabin').show()