In [1]:
import pyspark

In [3]:
import pandas as  pd

In [4]:
# Load test1.csv into a pandas DataFrame and preview its first rows.
df = pd.read_csv("test1.csv")
df.head()

Unnamed: 0,name,age
0,sai,19
1,kumar,18
2,krish,29


#### Read the DataFrame in PySpark

In [5]:
from pyspark.sql import SparkSession
# Build a SparkSession with the application name 'practise';
# getOrCreate() reuses an existing session if one is already running.
spark = SparkSession.builder.appName('practise').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/25 19:31:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Display the SparkSession object created above.
spark

In [9]:
# Read the CSV with default options: the header line is treated as a data row
# and the columns get the default names _c0 and _c1.
df_spark = spark.read.csv("test1.csv")
df_spark.show()

+-----+---+
|  _c0|_c1|
+-----+---+
| name|age|
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+



In [10]:
# Use the first line of the file as the column names (name, age)
# instead of treating it as a data row.
df_spark = spark.read.option('header','true').csv('test1.csv')

In [11]:
# Evaluating the DataFrame displays only its schema (column names and types), not its rows.
df_spark

DataFrame[name: string, age: string]

In [12]:
# Check the Python type of the object returned by spark.read.
type(df_spark)

pyspark.sql.dataframe.DataFrame

#### Check the data types of the columns

In [13]:
df_spark.printSchema() 
# Here age is reported as a string:
# when the schema is not inferred, every column is read as a string by default.

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)



In [14]:
# inferSchema=True asks Spark to infer each column's type from the data
# instead of reading every column as a string.
df_spark = spark.read.option('header','true').csv('test1.csv',inferSchema =True) 

In [15]:
# With inferSchema enabled, age is now reported as an integer.
df_spark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [16]:
# Another way to get the same result: pass header and inferSchema
# as keyword arguments to csv() instead of chaining option().
df_spark = spark.read.csv('test1.csv',header=True,inferSchema= True)
df_spark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [21]:
# dtypes gives a list of (column name, type name) tuples.
df_spark.dtypes

[('name', 'string'), ('age', 'int')]

In [18]:
# head(2) returns the first two rows as a list of Row objects.
df_spark.head(2)

[Row(name='sai', age=19), Row(name='kumar', age=18)]

#### Select the specified columns

In [19]:
# select() returns a new DataFrame containing only the requested column.
df_spark.select('name').show()

+-----+
| name|
+-----+
|  sai|
|kumar|
|krish|
+-----+



In [20]:
# Select several columns at once by passing each column name as its own argument.
df_spark.select('name', 'age').show()

+-----+---+
| name|age|
+-----+---+
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+



In [22]:
df_spark.describe().show()
# mean and stddev are null for `name` because describe() also includes
# string (categorical) columns, for which those statistics are undefined.

+-------+-----+-----------------+
|summary| name|              age|
+-------+-----+-----------------+
|  count|    3|                3|
|   mean| null|             22.0|
| stddev| null|6.082762530298219|
|    min|krish|               18|
|    max|  sai|               29|
+-------+-----+-----------------+



#### Adding columns to the DataFrame

In [27]:
# Add a derived column holding each person's age plus two.
# withColumn returns a new DataFrame, so the result is reassigned.
df_spark = df_spark.withColumn('age after 2 years', df_spark.age + 2)

In [28]:
# Show the DataFrame, which now includes the 'age after 2 years' column.
df_spark.show()

+-----+---+-----------------+
| name|age|age after 2 years|
+-----+---+-----------------+
|  sai| 19|               21|
|kumar| 18|               20|
|krish| 29|               31|
+-----+---+-----------------+



#### Drop the columns

In [31]:
# drop() returns a new DataFrame without the given column,
# so the result is reassigned to df_spark.
df_spark = df_spark.drop('age after 2 years')

In [32]:
# Confirm that only name and age remain.
df_spark.show()

+-----+---+
| name|age|
+-----+---+
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+



#### Rename the column

In [33]:
# Rename the 'name' column to 'New name' and keep the result in df_spark.
df_spark = df_spark.withColumnRenamed('name','New name')

In [34]:
# Show the DataFrame with the renamed column.
df_spark.show()

+--------+---+
|New name|age|
+--------+---+
|     sai| 19|
|   kumar| 18|
|   krish| 29|
+--------+---+



#### Handling Missing Values

In [41]:
# Read the CSV again into a separate DataFrame, using the header row for
# column names and inferring the column types.
df_spark2 = spark.read.csv('test1.csv',header=True,inferSchema=True)

In [42]:
# Show the freshly loaded rows.
df_spark2.show()

+-----+---+
| name|age|
+-----+---+
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+



In [43]:
# Drop the 'name' column; the result is only displayed, not assigned.
df_spark2.drop('name').show()

+---+
|age|
+---+
| 19|
| 18|
| 29|
+---+



In [44]:
# df_spark2 still has both columns: drop() returned a new DataFrame
# and did not modify df_spark2.
df_spark2.show()

+-----+---+
| name|age|
+-----+---+
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+



In [45]:
# na.drop() removes rows that contain null values; this data has none,
# so all three rows are kept.
df_spark2.na.drop().show()

+-----+---+
| name|age|
+-----+---+
|  sai| 19|
|kumar| 18|
|krish| 29|
+-----+---+

