### PySpark Intro

In [None]:
#!pip3 install pyspark

In [1]:
import pyspark

In [75]:
import pandas as pd
pd.read_csv('data.csv')

Unnamed: 0,Name,Age,Experience
0,Shri,29,2
1,Kris,22,3
2,Adarsh,22,4


In [76]:
from pyspark.sql import SparkSession

In [77]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [78]:
spark

In [79]:
df_pySpark = spark.read.csv('data.csv')

In [80]:
df_pySpark

DataFrame[_c0: string, _c1: string, _c2: string]

In [81]:
df_pySpark.show()

+------+---+----------+
|   _c0|_c1|       _c2|
+------+---+----------+
|  Name|Age|Experience|
|  Shri| 29|         2|
|  Kris| 22|         3|
|Adarsh| 22|         4|
+------+---+----------+



In [82]:
df_pySpark = spark.read.option('header', 'true').csv('data.csv')

In [83]:
df_pySpark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|  Shri| 29|         2|
|  Kris| 22|         3|
|Adarsh| 22|         4|
+------+---+----------+



In [84]:
type(df_pySpark)

pyspark.sql.dataframe.DataFrame

In [85]:
df_pySpark.head()

Row(Name='Shri', Age='29', Experience='2')

In [86]:
# Check Schema
df_pySpark.printSchema

<bound method DataFrame.printSchema of DataFrame[Name: string, Age: string, Experience: string]>

### PySpark DataFrames Part I

In [87]:
# inferSchema asks Spark to detect each column's data type; as seen above,
# without it every column is read with the string data type by default.
df_pySpark = spark.read.option('header', 'true').csv('data.csv', inferSchema = True)

In [88]:
df_pySpark = spark.read.csv('data.csv', header = True, inferSchema = True) # The other way of specifying the same

In [89]:
df_pySpark.printSchema # As you can see now the age and experience as the type int

<bound method DataFrame.printSchema of DataFrame[Name: string, Age: int, Experience: int]>

In [90]:
type(df_pySpark)

pyspark.sql.dataframe.DataFrame

In [91]:
df_pySpark.columns

['Name', 'Age', 'Experience']

In [92]:
#Selecting a particular column

df_pySpark.select('Name')

DataFrame[Name: string]

In [93]:
df_pySpark.select('Name').show()

+------+
|  Name|
+------+
|  Shri|
|  Kris|
|Adarsh|
+------+



In [94]:
# Selecting multiple columns

df_pySpark.select(['Name', 'Age'])

DataFrame[Name: string, Age: int]

In [95]:
df_pySpark.select(['Name','Age']).show()

+------+---+
|  Name|Age|
+------+---+
|  Shri| 29|
|  Kris| 22|
|Adarsh| 22|
+------+---+



In [96]:
df_pySpark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [97]:
df_pySpark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [98]:
df_pySpark.describe().show()

+-------+------+------------------+----------+
|summary|  Name|               Age|Experience|
+-------+------+------------------+----------+
|  count|     3|                 3|         3|
|   mean|  NULL|24.333333333333332|       3.0|
| stddev|  NULL| 4.041451884327381|       1.0|
|    min|Adarsh|                22|         2|
|    max|  Shri|                29|         4|
+-------+------+------------------+----------+



In [99]:
df_pySpark = df_pySpark.withColumn('Experience After 2 Years', df_pySpark['Experience']+2)

In [100]:
df_pySpark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience After 2 Years|
+------+---+----------+------------------------+
|  Shri| 29|         2|                       4|
|  Kris| 22|         3|                       5|
|Adarsh| 22|         4|                       6|
+------+---+----------+------------------------+



In [101]:
# post reassigning the variable 
df_pySpark.show()

+------+---+----------+------------------------+
|  Name|Age|Experience|Experience After 2 Years|
+------+---+----------+------------------------+
|  Shri| 29|         2|                       4|
|  Kris| 22|         3|                       5|
|Adarsh| 22|         4|                       6|
+------+---+----------+------------------------+



In [103]:
df_pySpark = df_pySpark.drop('Experience After 2 Years') # Deleting a column

In [104]:
df_pySpark.show()

+------+---+----------+
|  Name|Age|Experience|
+------+---+----------+
|  Shri| 29|         2|
|  Kris| 22|         3|
|Adarsh| 22|         4|
+------+---+----------+



In [105]:
df_pySpark.withColumnRenamed('Name','Username').show()

+--------+---+----------+
|Username|Age|Experience|
+--------+---+----------+
|    Shri| 29|         2|
|    Kris| 22|         3|
|  Adarsh| 22|         4|
+--------+---+----------+



### Handling Missing Values

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [4]:
df = spark.read.csv('data.csv', header = True, inferSchema = True)

In [5]:
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Shri|  29|         2|  NULL|
|  Kris|  22|         3| 12000|
|Adarsh|  22|         4| 81000|
| Anand|NULL|        12| 10101|
|  NULL|  43|      NULL| 12243|
|  NULL|  21|         4|  8425|
+------+----+----------+------+



In [6]:
df.drop('Name')

DataFrame[Age: int, Experience: int, Salary: int]

In [7]:
df

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [8]:
# Dropping NAN values

In [9]:
df.na.drop().show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Kris| 22|         3| 12000|
|Adarsh| 22|         4| 81000|
+------+---+----------+------+



In [10]:
df

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

In [11]:
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Shri|  29|         2|  NULL|
|  Kris|  22|         3| 12000|
|Adarsh|  22|         4| 81000|
| Anand|NULL|        12| 10101|
|  NULL|  43|      NULL| 12243|
|  NULL|  21|         4|  8425|
+------+----+----------+------+



In [12]:
df.na.drop(how = "any").show() # Here it deletes a row if even one value is null

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Kris| 22|         3| 12000|
|Adarsh| 22|         4| 81000|
+------+---+----------+------+



In [13]:
df.na.drop(how = "all").show() # Drops a row only when every value in it is null; no row here is entirely null, so all rows are kept

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Shri|  29|         2|  NULL|
|  Kris|  22|         3| 12000|
|Adarsh|  22|         4| 81000|
| Anand|NULL|        12| 10101|
|  NULL|  43|      NULL| 12243|
|  NULL|  21|         4|  8425|
+------+----+----------+------+



In [16]:
## Threshold values
# thresh = n keeps only the rows that have at least n non-null values.
# NOTE: per the PySpark docs, thresh overrides how when both are given, so how = "any" has no effect here.

df.na.drop(how = "any", thresh = 3).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Shri|  29|         2|  NULL|
|  Kris|  22|         3| 12000|
|Adarsh|  22|         4| 81000|
| Anand|NULL|        12| 10101|
|  NULL|  21|         4|  8425|
+------+----+----------+------+



In [19]:
## Subsetting
# Drop the rows that have a null in any of the columns listed in subset;
# nulls in the other columns (e.g. Name, Salary) are ignored.
df.na.drop(how = "any", subset = ['Age', 'Experience']).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Shri| 29|         2|  NULL|
|  Kris| 22|         3| 12000|
|Adarsh| 22|         4| 81000|
|  NULL| 21|         4|  8425|
+------+---+----------+------+



In [20]:
### Filling the Missing Values
# you can fill the values of na by following method like it takes
# 2 paramets i.e.,(value, subset (optional - of type list))

df.na.fill("Missing values", [ 'Experience', 'Age' ]).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
|  Shri|  29|         2|  NULL|
|  Kris|  22|         3| 12000|
|Adarsh|  22|         4| 81000|
| Anand|NULL|        12| 10101|
|  NULL|  43|      NULL| 12243|
|  NULL|  21|         4|  8425|
+------+----+----------+------+



In [32]:
from pyspark.ml.feature import Imputer


# Build an Imputer that replaces nulls in each numeric column with that column's
# most frequent value ("mode") and writes the result to a new "<column>_imputed" column.
numeric_cols = ['Age', 'Experience', 'Salary']
imputed_cols = [f"{col_name}_imputed" for col_name in numeric_cols]

imputer = Imputer(inputCols = numeric_cols, outputCols = imputed_cols)
imputer = imputer.setStrategy("mode")


# Similarly you can try it for mean and median

In [33]:
# Add imputation cols to df

imputer.fit(df).transform(df).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
|  Shri|  29|         2|  NULL|         29|                 2|          8425|
|  Kris|  22|         3| 12000|         22|                 3|         12000|
|Adarsh|  22|         4| 81000|         22|                 4|         81000|
| Anand|NULL|        12| 10101|         22|                12|         10101|
|  NULL|  43|      NULL| 12243|         43|                 4|         12243|
|  NULL|  21|         4|  8425|         21|                 4|          8425|
+------+----+----------+------+-----------+------------------+--------------+



### Filter Operations

In [34]:
df = spark.read.csv('data.csv', header = True, inferSchema = True)

In [35]:
df.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|  Shri| 29|         2| 12411|
|  Kris| 22|         3| 12000|
|Adarsh| 22|         4| 81000|
| Anand| 44|        12| 10101|
| Karan| 43|         2| 12243|
| Kunal| 21|         4|  8425|
+------+---+----------+------+



In [37]:
### Select the people whose salary is less than or equal to 10K (condition given as a SQL expression string)

df.filter("Salary <= 10000").show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Kunal| 21|         4|  8425|
+-----+---+----------+------+



In [38]:
df.filter("Salary <= 10000").select(['Name','Age']).show()

+-----+---+
| Name|Age|
+-----+---+
|Kunal| 21|
+-----+---+



In [39]:
df.filter(df["Salary"] <= 10000).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Kunal| 21|         4|  8425|
+-----+---+----------+------+



In [40]:
# Keep the rows whose salary lies in the inclusive range [10000, 30000].
# Column conditions are combined with & (each one built separately here for readability).
salary_at_least_10k = df["Salary"] >= 10000
salary_at_most_30k = df["Salary"] <= 30000
df.filter(salary_at_least_10k & salary_at_most_30k).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
| Shri| 29|         2| 12411|
| Kris| 22|         3| 12000|
|Anand| 44|        12| 10101|
|Karan| 43|         2| 12243|
+-----+---+----------+------+



In [42]:
df.filter(~(df["Salary"] <= 20000)).show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
|Adarsh| 22|         4| 81000|
+------+---+----------+------+

