PySpark is the Python interface for Apache Spark. It is used for large-scale data processing.

In [1]:
import pyspark

In [3]:
import pandas as pd

In [4]:
# Load the sample CSV with pandas to preview the raw data
pd.read_csv('sample.csv')

Unnamed: 0,Name,age,Experience,Salary
0,sourav,31,10,30000
1,sunil,30,8,25000
2,soyel,29,4,20000
3,souvik,24,3,20000
4,apurba,21,1,15000
5,pk,23,2,18000


In [5]:
# Confirm that pandas returns its own DataFrame type
type(pd.read_csv('sample.csv'))

pandas.core.frame.DataFrame

In [6]:
from pyspark.sql import SparkSession

In [7]:
# Start a Spark session named "intro", or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("intro")
    .getOrCreate()
)

In [8]:
# Display the session object created above
spark

In [9]:
# Read the CSV with default options: the header row is not used for column
# names, so columns get default names (_c0, _c1, ...) and are typed as strings
df_pyspark = spark.read.csv('sample.csv')

In [10]:
# Show the DataFrame's column names and types
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string]

In [11]:
# Re-read the file, treating the first row as column names
df_pyspark = spark.read.option('header','true').csv('sample.csv')

In [12]:
# Columns now carry the header names; all values are still strings
df_pyspark

DataFrame[Name: string, age: string, Experience: string, Salary: string]

In [13]:
# This is a Spark DataFrame, not a pandas one
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
# Print the schema as a tree: column name, type and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



In [15]:
# Display the rows as a text table
df_pyspark.show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|sourav| 31|        10| 30000|
| sunil| 30|         8| 25000|
| soyel| 29|         4| 20000|
|souvik| 24|         3| 20000|
|apurba| 21|         1| 15000|
|    pk| 23|         2| 18000|
+------+---+----------+------+



In [17]:
# Fetch the first 4 rows as a list of Row objects
df_pyspark.head(4)

[Row(Name='sourav', age='31', Experience='10', Salary='30000'),
 Row(Name='sunil', age='30', Experience='8', Salary='25000'),
 Row(Name='soyel', age='29', Experience='4', Salary='20000'),
 Row(Name='souvik', age='24', Experience='3', Salary='20000')]

In [18]:
# Project only the Name and Experience columns and display them
df_pyspark.select('Name', 'Experience').show()

+------+----------+
|  Name|Experience|
+------+----------+
|sourav|        10|
| sunil|         8|
| soyel|         4|
|souvik|         3|
|apurba|         1|
|    pk|         2|
+------+----------+



In [19]:
# Indexing by name returns a Column expression, not the column's values
df_pyspark['Name']

Column<'Name'>

In [21]:
# List (column name, type) pairs; every column is still a string here
df_pyspark.dtypes

[('Name', 'string'),
 ('age', 'string'),
 ('Experience', 'string'),
 ('Salary', 'string')]

In [22]:
# Re-read with the header row as column names and let Spark infer column types
df_pyspark = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .csv('sample.csv')
)

In [23]:
# age, Experience and Salary are now inferred as int
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [24]:
# Schema view confirms the inferred integer columns
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [26]:
# Same read expressed with keyword arguments instead of .option() calls
df_pyspark = spark.read.csv(
    'sample.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|sourav| 31|        10| 30000|
| sunil| 30|         8| 25000|
| soyel| 29|         4| 20000|
|souvik| 24|         3| 20000|
|apurba| 21|         1| 15000|
|    pk| 23|         2| 18000|
+------+---+----------+------+



In [27]:
# Check the schema of the re-read DataFrame
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [28]:

# Project only the Name and Experience columns and display them
df_pyspark.select('Name', 'Experience').show()

+------+----------+
|  Name|Experience|
+------+----------+
|sourav|        10|
| sunil|         8|
| soyel|         4|
|souvik|         3|
|apurba|         1|
|    pk|         2|
+------+----------+



In [29]:
# List (column name, type) pairs for the current DataFrame
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [30]:
# Add a derived column holding each row's Experience plus 2
df_pyspark = df_pyspark.withColumn(
    'Experience After 2 year',
    df_pyspark['Experience'] + 2,
)

In [31]:
# Show the DataFrame with the new column
df_pyspark.show()

+------+---+----------+------+-----------------------+
|  Name|age|Experience|Salary|Experience After 2 year|
+------+---+----------+------+-----------------------+
|sourav| 31|        10| 30000|                     12|
| sunil| 30|         8| 25000|                     10|
| soyel| 29|         4| 20000|                      6|
|souvik| 24|         3| 20000|                      5|
|apurba| 21|         1| 15000|                      3|
|    pk| 23|         2| 18000|                      4|
+------+---+----------+------+-----------------------+



In [32]:
# Drop the derived column and keep the result under the same name
df_pyspark = df_pyspark.drop('Experience After 2 year')

In [33]:
# Confirm the column was removed
df_pyspark.show()

+------+---+----------+------+
|  Name|age|Experience|Salary|
+------+---+----------+------+
|sourav| 31|        10| 30000|
| sunil| 30|         8| 25000|
| soyel| 29|         4| 20000|
|souvik| 24|         3| 20000|
|apurba| 21|         1| 15000|
|    pk| 23|         2| 18000|
+------+---+----------+------+



In [34]:
# Rename the 'Name' column to 'New Name'.
# withColumnRenamed() returns a new DataFrame, while show() only prints and
# returns None. Chaining show() onto the assignment would leave df_pyspark
# set to None, so assign first and display in a separate step.
df_pyspark = df_pyspark.withColumnRenamed('Name', 'New Name')
df_pyspark.show()

+--------+---+----------+------+
|New Name|age|Experience|Salary|
+--------+---+----------+------+
|  sourav| 31|        10| 30000|
|   sunil| 30|         8| 25000|
|   soyel| 29|         4| 20000|
|  souvik| 24|         3| 20000|
|  apurba| 21|         1| 15000|
|      pk| 23|         2| 18000|
+--------+---+----------+------+

