In [1]:
# findspark.init() locates the Spark installation and makes PySpark importable,
# so it has to run *before* pyspark is imported; calling it afterwards (as the
# original cell did) is too late to help when pyspark is not already on the path.
import findspark
findspark.init()

import pyspark

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('Renaming data in Pandas df')
    .getOrCreate()
)
spark  # last expression of the cell: display the session object

In [3]:
# Handle to the SparkContext, used to build the RDD in the next cell.
sc = SparkContext.getOrCreate()

# Three rows of three names each; these become the DataFrame rows later on.
data = [
    ['Raj', 'Ravi', 'Sona'],
    ['Mona', 'Ram', 'Chinnu'],
    ['Bannu', 'Vinith', 'Lucky'],
]

In [4]:
# Turn the in-memory list of rows into an RDD.
rdd = sc.parallelize(data)

In [5]:
# Pull every element of the RDD back as a Python list to inspect its contents.
rdd.collect()

[['Raj', 'Ravi', 'Sona'],
 ['Mona', 'Ram', 'Chinnu'],
 ['Bannu', 'Vinith', 'Lucky']]

In [6]:
# Column labels, one per position in each row of the RDD.
columns = ['CSE', 'ECE', 'EEE']
df = spark.createDataFrame(rdd, schema=columns)

In [7]:
# Bare expression: displays the DataFrame's column names and types, not its rows.
df

DataFrame[CSE: string, ECE: string, EEE: string]

In [8]:
# Print the DataFrame rows as a text table.
df.show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [21]:
# Confirm that `df` is a Spark DataFrame.
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [33]:
# Renaming columns
# 1) withColumnRenamed(existing_name, new_name)
#    The renamed frame is only displayed here; `df` is not reassigned.
df.withColumnRenamed('CSE', 'Computer Science').show()

+----------------+------+------+
|Computer Science|   ECE|   EEE|
+----------------+------+------+
|             Raj|  Ravi|  Sona|
|            Mona|   Ram|Chinnu|
|           Bannu|Vinith| Lucky|
+----------------+------+------+



In [34]:
# Renaming multiple columns: chain one withColumnRenamed() call per column.
(
    df
    .withColumnRenamed('CSE', 'CS')
    .withColumnRenamed('EEE', 'EE')
    .show()
)

+-----+------+------+
|   CS|   ECE|    EE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [36]:
# 2) selectExpr(exp): rename inside a SQL expression with `<column> as <alias>`
df.selectExpr(
    'CSE',
    'ECE as ElectronicsandCommunication',
    'EEE',
).show()

+-----+---------------------------+------+
|  CSE|ElectronicsandCommunication|   EEE|
+-----+---------------------------+------+
|  Raj|                       Ravi|  Sona|
| Mona|                        Ram|Chinnu|
|Bannu|                     Vinith| Lucky|
+-----+---------------------------+------+



In [37]:
# 3) select(col): rename with col(...).alias(...)
from pyspark.sql.functions import col

df.select(
    col('CSE').alias('Computer Science'),
    col('ECE'),
    col('EEE'),
).show()


+----------------+------+------+
|Computer Science|   ECE|   EEE|
+----------------+------+------+
|             Raj|  Ravi|  Sona|
|            Mona|   Ram|Chinnu|
|           Bannu|Vinith| Lucky|
+----------------+------+------+



In [38]:
# 4) toDF(*columns): replace every column name at once, by position.
# The list gets its own name instead of `li`, because `li` is rebound further
# down to the values used by the isin() filters; sharing the name would let a
# re-run of this cell silently change what those filters match.
new_column_names = ['Computer Science', 'ECE', 'Electrical']
df.toDF(*new_column_names).show()

+----------------+------+----------+
|Computer Science|   ECE|Electrical|
+----------------+------+----------+
|             Raj|  Ravi|      Sona|
|            Mona|   Ram|    Chinnu|
|           Bannu|Vinith|     Lucky|
+----------------+------+----------+



In [22]:
# Convert the Spark DataFrame into a pandas DataFrame.
df1 = df.toPandas()

In [25]:

# Check the type of the converted object, then display it.
print(type(df1))
df1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,CSE,ECE,EEE
0,Raj,Ravi,Sona
1,Mona,Ram,Chinnu
2,Bannu,Vinith,Lucky


In [30]:
# Renaming column names in Pandas
# 1) using rename
# rename() returns a new frame, so that result is displayed directly instead of
# passing inplace=True. `df1` keeps its original columns, which makes this cell
# give the same output every time it is run.
df1.rename(columns={'CSE': 'MECH'})

Unnamed: 0,MECH,ECE,EEE
0,Raj,Ravi,Sona
1,Mona,Ram,Chinnu
2,Bannu,Vinith,Lucky


In [31]:
# 2) using columns
# Assigning to `.columns` relabels every column by position and modifies the
# frame it is applied to, so the assignment is done on a copy and `df1` (already
# displayed by earlier cells) is left untouched.
df1_relabelled = df1.copy()
df1_relabelled.columns = ['Computer Science', 'ECE', 'Electrical']
df1_relabelled

Unnamed: 0,Computer Science,ECE,Electrical
0,Raj,Ravi,Sona
1,Mona,Ram,Chinnu
2,Bannu,Vinith,Lucky


In [43]:
#### Select

# select all columns: '*' stands for every column of the DataFrame
df.select('*').show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [48]:
# Same result, unpacking the `columns` list that was used to build `df`.
df.select(*columns).show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [44]:
# Select only the CSE and EEE columns, by name.
df.select('CSE', 'EEE').show()

+-----+------+
|  CSE|   EEE|
+-----+------+
|  Raj|  Sona|
| Mona|Chinnu|
|Bannu| Lucky|
+-----+------+



In [46]:
# Select using attribute access on the DataFrame (df.<column>).
df.select(df.CSE, df.EEE).show()

+-----+------+
|  CSE|   EEE|
+-----+------+
|  Raj|  Sona|
| Mona|Chinnu|
|Bannu| Lucky|
+-----+------+



In [47]:
# Select using item access on the DataFrame (df['<column>']).
df.select(df['CSE'], df['EEE']).show()

+-----+------+
|  CSE|   EEE|
+-----+------+
|  Raj|  Sona|
| Mona|Chinnu|
|Bannu| Lucky|
+-----+------+



In [45]:
# Select with col(): col() objects and plain column-name strings are mixed here.
from pyspark.sql.functions import col

df.select(col('CSE'), 'ECE', 'EEE').show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [51]:
# Select Columns by Index
# `df.columns` is a list of column names, so slicing it picks columns by
# position: this selects the first 2 columns, and show(2) prints the top 2 rows.
df.select(df.columns[:2]).show(2)

+----+----+
| CSE| ECE|
+----+----+
| Raj|Ravi|
|Mona| Ram|
+----+----+
only showing top 2 rows



In [52]:
#### Filtering
# filter()
# 1) equals condition: keep rows whose CSE value is 'Mona'
df.filter(df.CSE == 'Mona').show()

+----+---+------+
| CSE|ECE|   EEE|
+----+---+------+
|Mona|Ram|Chinnu|
+----+---+------+



In [55]:
# Not-equal condition, written two equivalent ways: `!=` and negating `==` with `~`.
df.filter(df.CSE != 'Mona').show()
df.filter(~(df.CSE == 'Mona')).show()

+-----+------+-----+
|  CSE|   ECE|  EEE|
+-----+------+-----+
|  Raj|  Ravi| Sona|
|Bannu|Vinith|Lucky|
+-----+------+-----+

+-----+------+-----+
|  CSE|   ECE|  EEE|
+-----+------+-----+
|  Raj|  Ravi| Sona|
|Bannu|Vinith|Lucky|
+-----+------+-----+



In [56]:
# 2) using col(): refer to the column by name instead of through `df`
df.filter(col('EEE') == 'Sona').show()

+---+----+----+
|CSE| ECE| EEE|
+---+----+----+
|Raj|Ravi|Sona|
+---+----+----+



In [57]:
# 3) filter() with a SQL expression string instead of a Column condition
# equal
df.filter(" EEE =='Sona'").show()

+---+----+----+
|CSE| ECE| EEE|
+---+----+----+
|Raj|Ravi|Sona|
+---+----+----+



In [58]:
# not equal: `!=` and `<>` are both accepted in the expression string and
# return the same rows
df.filter("EEE!='Sona'").show()
df.filter("EEE <> 'Sona'").show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+



In [61]:
# 4) Filter with Multiple Conditions
# Each comparison sits in its own parentheses and the two are combined with `&`.
df.filter((df.CSE == 'Mona') & (df.ECE == 'Ram')).show()

+----+---+------+
| CSE|ECE|   EEE|
+----+---+------+
|Mona|Ram|Chinnu|
+----+---+------+



In [62]:
# 5) Filter Based on List Values

# IS IN: keep rows whose ECE value appears in the list.
# Of the three candidates only 'Vinith' occurs in the ECE column.
li = ['Mona', 'Vinith', 'XYZ']
df.filter(df.ECE.isin(li)).show()

+-----+------+-----+
|  CSE|   ECE|  EEE|
+-----+------+-----+
|Bannu|Vinith|Lucky|
+-----+------+-----+



In [64]:
# 6) NOT IS IN: keep rows whose ECE value is absent from `li`.
# Two equivalent spellings: negate with `~`, or compare the isin() result to False.
df.filter(~df.ECE.isin(li)).show()
df.filter(df.ECE.isin(li) == False).show()

+----+----+------+
| CSE| ECE|   EEE|
+----+----+------+
| Raj|Ravi|  Sona|
|Mona| Ram|Chinnu|
+----+----+------+

+----+----+------+
| CSE| ECE|   EEE|
+----+----+------+
| Raj|Ravi|  Sona|
|Mona| Ram|Chinnu|
+----+----+------+



In [68]:
# 7) Filter Based on Starts With, Ends With, Contains
# startswith: keep rows whose ECE value begins with "R"
df.filter(df.ECE.startswith("R")).show()

+----+----+------+
| CSE| ECE|   EEE|
+----+----+------+
| Raj|Ravi|  Sona|
|Mona| Ram|Chinnu|
+----+----+------+



In [73]:
# endswith: keep rows whose EEE value ends with "u"
df.filter(df.EEE.endswith("u")).show()

+----+---+------+
| CSE|ECE|   EEE|
+----+---+------+
|Mona|Ram|Chinnu|
+----+---+------+



In [75]:
# contains: keep rows whose EEE value has "nn" anywhere inside it
df.filter(df.EEE.contains("nn")).show()

+----+---+------+
| CSE|ECE|   EEE|
+----+---+------+
|Mona|Ram|Chinnu|
+----+---+------+



In [76]:
# 8) like: SQL LIKE pattern; 'S%' matches EEE values that start with "S"
df.filter(df.EEE.like('S%')).show()

+---+----+----+
|CSE| ECE| EEE|
+---+----+----+
|Raj|Ravi|Sona|
+---+----+----+



In [77]:
# '%on%' matches EEE values that contain "on" anywhere
df.filter(df.EEE.like('%on%')).show()

+---+----+----+
|CSE| ECE| EEE|
+---+----+----+
|Raj|Ravi|Sona|
+---+----+----+



In [79]:
# Using where() in place of filter(): same condition, same rows returned
df.where(df.EEE.like('%on%')).show()

+---+----+----+
|CSE| ECE| EEE|
+---+----+----+
|Raj|Ravi|Sona|
+---+----+----+



In [80]:
############ Manipulating

# 1) schema: print each column's name, type and nullability
df.printSchema()

root
 |-- CSE: string (nullable = true)
 |-- ECE: string (nullable = true)
 |-- EEE: string (nullable = true)



In [81]:
# 2) Show only the top 2 records
df.show(2)

+----+----+------+
| CSE| ECE|   EEE|
+----+----+------+
| Raj|Ravi|  Sona|
|Mona| Ram|Chinnu|
+----+----+------+
only showing top 2 rows



In [83]:
# 3) Count the number of rows in the DataFrame
df.count()

3

In [86]:
# 4) Number of columns: length of the list of column names
len(df.columns)

3

In [87]:
# 5) Names of the columns, as a list of strings
df.columns

['CSE', 'ECE', 'EEE']

In [88]:
# 6) describe() gives summary statistics (count, mean, stddev, min, max).
# All three columns are strings here, so mean and stddev come back as NULL.
df.describe().show()

+-------+-----+------+------+
|summary|  CSE|   ECE|   EEE|
+-------+-----+------+------+
|  count|    3|     3|     3|
|   mean| NULL|  NULL|  NULL|
| stddev| NULL|  NULL|  NULL|
|    min|Bannu|   Ram|Chinnu|
|    max|  Raj|Vinith|  Sona|
+-------+-----+------+------+



In [89]:
# 7) select() operations -- see the "Select" section earlier in this notebook

In [90]:
# 8) Number of distinct values in the ECE column
df.select('ECE').distinct().count()

3

In [91]:
# 9) Adding new columns
from pyspark.sql.functions import lit

# lit(90) supplies the same constant for every row of the new "Marks" column.
df.withColumn('Marks', lit(90)).show()

+-----+------+------+-----+
|  CSE|   ECE|   EEE|Marks|
+-----+------+------+-----+
|  Raj|  Ravi|  Sona|   90|
| Mona|   Ram|Chinnu|   90|
|Bannu|Vinith| Lucky|   90|
+-----+------+------+-----+



In [92]:
# `df` still has only its original three columns: the withColumn() result in
# the previous cell was displayed but never assigned back to `df`.
df.show()

+-----+------+------+
|  CSE|   ECE|   EEE|
+-----+------+------+
|  Raj|  Ravi|  Sona|
| Mona|   Ram|Chinnu|
|Bannu|Vinith| Lucky|
+-----+------+------+

