# PySpark DataFrame Manipulation

### Imports

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# encoding categorical data
import pyspark.sql.functions as F 
from pyspark.sql import functions as f
from pyspark.sql import types as t

### Start the PySpark Session

In [2]:
# Reuse the already-running SparkContext when there is one, so that re-running
# this cell in a live kernel does not fail on a second context being created.
sc = SparkContext.getOrCreate()

In [3]:
# Build (or reuse) the SparkSession that the DataFrame cells below rely on.
# NOTE(review): the config key/value looks like a placeholder - confirm it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

### Let's make a PySpark DataFrame from the Titanic CSV file

In [4]:
# Read the Titanic passengers: the first row supplies the column names and
# the column types are inferred from the data.
df = spark.read.csv(
    "titanic.csv",
    header=True,
    inferSchema=True,
)

### Look at the First 5 Rows

In [5]:
# Preview the first five passengers.
df.show(n=5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

### Describe the Dataset

In [6]:
# Summary statistics for every column, printed as a table.
summary_df = df.describe()
summary_df.show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                null|  null| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

*From the original assignment in DS 1.1:*

VARIABLE DESCRIPTIONS:
survival        Survival
                (0 = No; 1 = Yes)
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
name            Name
sex             Sex
age             Age
sibsp           Number of Siblings/Spouses Aboard
parch           Number of Parents/Children Aboard
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)

In [7]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



### How many of the Age values are empty (or null)?

For this question, I used the values provided by the table in the `df.describe().show()` cell above.

In [8]:
# Compute the counts from the DataFrame itself rather than hard-coding numbers
# read off the describe() table, so the answer stays correct if the data changes.
# (The describe() table above shows the same figures: 891 rows, 714 Age values.)
row_counts = df.select(
    F.count('*').alias('rows'),   # every row, null or not
    F.count('Age').alias('ages'),  # count() on a column skips nulls
).first()
num_rows_overall = row_counts['rows']
num_values_in_age = row_counts['ages']
num_null_in_age = num_rows_overall - num_values_in_age
print(f'The number of null values in the Age column: {num_null_in_age}.')

The number of null values in the Age column: 177.


### Create a new column, Gender, that is 0 when Sex is female and 1 when Sex is male

In [9]:
# Disabled first attempt: the whole cell is a bare string, so none of it runs
# and the cell simply echoes the text.
# NOTE(review): as written it would not work if re-enabled - the UDF wrapper is
# stored in `new_gender_values`, but the plain `sex_to_gender` function is what
# gets applied to the column; the UDF is also declared with StringType although
# the mapping holds the integers 1 and 0.
"""def sex_to_gender(x):
    '''Return a binary representation of gender in the dataset'''
    # define the mapping between categorical and integer values
    genders = sc.broadcast(dict([('male', 1), ('female',0)]))
    return genders.value[x]

new_gender_values = F.udf(sex_to_gender, t.StringType())

df.withColumn('Gender', sex_to_gender(F.col('Sex'))).show(truncate=False)"""

"def sex_to_gender(x):\n    '''Return a binary representation of gender in the dataset'''\n    # define the mapping between categorical and integer values\n    genders = sc.broadcast(dict([('male', 1), ('female',0)]))\n    return genders.value[x]\n\nnew_gender_values = F.udf(sex_to_gender, t.StringType())\n\ndf.withColumn('Gender', sex_to_gender(F.col('Sex'))).show(truncate=False)"

In [10]:
# First, make a copy of the Sex column under the name Gender. The values are
# still the original 'male'/'female' strings at this point, not yet 0/1.
new_df = df.withColumn("Gender", df["Sex"])

In [11]:
# Disabled experiment, kept for reference only.
# NOTE(review): it maps 'male' to 2, not the 1 the section heading asks for, and
# `.rdd` is presumably only available on a DataFrame, not on a Column - confirm.
# new_df.Gender.rdd.map(lambda x: 2 if x == 'male' else 0).collect()

In [12]:
# Disabled attempt: the cell is a bare string, so it only echoes its own text.
# NOTE(review): `binary_encode_sex` compares a single value against 'male',
# while applyInPandas presumably hands the function a whole pandas DataFrame per
# group and expects a schema with column types - confirm before re-enabling.
"""def binary_encode_sex(value):
    if value == 'male':
        return 1
    return 0  # presumably the sex is female

new_df.groupby("Gender").applyInPandas(
    binary_encode_sex, schema="Gender").show()  """

'def binary_encode_sex(value):\n    if value == \'male\':\n        return 1\n    return 0  # presumably the sex is female\n\nnew_df.groupby("Gender").applyInPandas(\n    binary_encode_sex, schema="Gender").show()  '

In [13]:
# Disabled attempt: the cell is a bare string, so it only echoes its own text.
# The quoted code collects the distinct Sex values and builds one 0/1 indicator
# expression per value, aliased with that value. The loop variable `male` holds
# each distinct value in turn, not only 'male'. The select that would attach
# the indicators to df is itself commented out inside the string.
"""# getting the sex data
sex_data = df.select('Sex').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [F.when(F.col('Sex') == male,1).otherwise(0)\
            .alias(str(male)) for male in sex_data]
# df = df.select(exprs+df.columns)"""

"# getting the sex data\nsex_data = df.select('Sex').distinct().rdd.flatMap(lambda x:x).collect()\nexprs = [F.when(F.col('Sex') == male,1).otherwise(0)            .alias(str(male)) for male in sex_data]\n# df = df.select(exprs+df.columns)"

In [14]:
# Quoted out: the cell echoes the text 'exprs' rather than evaluating a variable.
"""exprs"""

'exprs'