In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=65486271055a24182e9791398d8e7aa5cd488d69655cdec8e8353ad503771e9e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
spark = SparkSession.builder.appName('DF example').getOrCreate()
pyspark.sql.session.SparkSession

pyspark.sql.session.SparkSession

In [3]:
columns = ['currency', 'value']
inputdata = [('Euro',90),('Pound',100),('Yuan',11),('Yen',2)]
rdd = spark.sparkContext.parallelize(inputdata)
rdd.collect()

[('Euro', 90), ('Pound', 100), ('Yuan', 11), ('Yen', 2)]

In [4]:
df = rdd.toDF()
df.show()

+-----+---+
|   _1| _2|
+-----+---+
| Euro| 90|
|Pound|100|
| Yuan| 11|
|  Yen|  2|
+-----+---+



#change column name

In [5]:
df.withColumnRenamed("_1", "Currency").show()

+--------+---+
|Currency| _2|
+--------+---+
|    Euro| 90|
|   Pound|100|
|    Yuan| 11|
|     Yen|  2|
+--------+---+



#create dataframe with column names from RDD

In [6]:
df1 = spark.createDataFrame(rdd).toDF(*columns)
df1.show()

+--------+-----+
|currency|value|
+--------+-----+
|    Euro|   90|
|   Pound|  100|
|    Yuan|   11|
|     Yen|    2|
+--------+-----+



In [7]:
df1.count()

4

In [8]:
df1.columns

['currency', 'value']

In [9]:
df1.printSchema()

root
 |-- currency: string (nullable = true)
 |-- value: long (nullable = true)



#create DataFrame directly from python list without using RDD

In [10]:
df2 = spark.createDataFrame(data=inputdata, schema = columns)
df2.show()

+--------+-----+
|currency|value|
+--------+-----+
|    Euro|   90|
|   Pound|  100|
|    Yuan|   11|
|     Yen|    2|
+--------+-----+



#create dataframe from datasource(CSV)

In [None]:
csvDF = spark.read.csv('Social_Network_Ads')

In [None]:
csvDF.show()
csv1 = spark.read.csv("Social_Network_Ads",header= True)
csv1.printSchema()

#create dataframe using schema

In [None]:
from pyspark.sql.types import *

In [None]:
sch = "`Name` STRING, `RegNo` INT, `Marks` INT `Grade` DOUBLE "
csv2  = spark.read.csv("Social_Network_Ads", header= True, schema=sch)

In [None]:
csv2.printSchema

Inferschema

In [None]:
csv3 = spark.read.csv('', inferSchema= )
csv3 .printSchema()

#Date and Time

In [14]:
import pyspark
spark =SparkSession.builder\
.appName('Date Time')\
.getOrCreate()

In [15]:
from pyspark.sql.functions import *
df = spark.range(5).withColumn('Today', current_date())\
.withColumn('Now', current_timestamp())

In [16]:
df.show(truncate= False)

+---+----------+--------------------------+
|id |Today     |Now                       |
+---+----------+--------------------------+
|0  |2023-10-12|2023-10-12 06:56:53.800433|
|1  |2023-10-12|2023-10-12 06:56:53.800433|
|2  |2023-10-12|2023-10-12 06:56:53.800433|
|3  |2023-10-12|2023-10-12 06:56:53.800433|
|4  |2023-10-12|2023-10-12 06:56:53.800433|
+---+----------+--------------------------+



In [17]:
df = df.withColumn('New Date', date_add('Today', 5))
df.show(truncate=False)

+---+----------+--------------------------+----------+
|id |Today     |Now                       |New Date  |
+---+----------+--------------------------+----------+
|0  |2023-10-12|2023-10-12 06:56:58.294688|2023-10-17|
|1  |2023-10-12|2023-10-12 06:56:58.294688|2023-10-17|
|2  |2023-10-12|2023-10-12 06:56:58.294688|2023-10-17|
|3  |2023-10-12|2023-10-12 06:56:58.294688|2023-10-17|
|4  |2023-10-12|2023-10-12 06:56:58.294688|2023-10-17|
+---+----------+--------------------------+----------+



In [18]:
df = df.withColumn('Prev Date', date_sub('Today',5))
df.show()


+---+----------+--------------------+----------+----------+
| id|     Today|                 Now|  New Date| Prev Date|
+---+----------+--------------------+----------+----------+
|  0|2023-10-12|2023-10-12 06:58:...|2023-10-17|2023-10-07|
|  1|2023-10-12|2023-10-12 06:58:...|2023-10-17|2023-10-07|
|  2|2023-10-12|2023-10-12 06:58:...|2023-10-17|2023-10-07|
|  3|2023-10-12|2023-10-12 06:58:...|2023-10-17|2023-10-07|
|  4|2023-10-12|2023-10-12 06:58:...|2023-10-17|2023-10-07|
+---+----------+--------------------+----------+----------+



In [20]:
df.select(abs(datediff('Prev Date', 'New Date'))).show()

+----------------------------------+
|abs(datediff(Prev Date, New Date))|
+----------------------------------+
|                                10|
|                                10|
|                                10|
|                                10|
|                                10|
+----------------------------------+



In [21]:
df= df.withColumn('String Date', lit('07-11-2011'))
df.show()

+---+----------+--------------------+----------+----------+-----------+
| id|     Today|                 Now|  New Date| Prev Date|String Date|
+---+----------+--------------------+----------+----------+-----------+
|  0|2023-10-12|2023-10-12 07:02:...|2023-10-17|2023-10-07| 07-11-2011|
|  1|2023-10-12|2023-10-12 07:02:...|2023-10-17|2023-10-07| 07-11-2011|
|  2|2023-10-12|2023-10-12 07:02:...|2023-10-17|2023-10-07| 07-11-2011|
|  3|2023-10-12|2023-10-12 07:02:...|2023-10-17|2023-10-07| 07-11-2011|
|  4|2023-10-12|2023-10-12 07:02:...|2023-10-17|2023-10-07| 07-11-2011|
+---+----------+--------------------+----------+----------+-----------+



In [22]:
df.printSchema()

root
 |-- id: long (nullable = false)
 |-- Today: date (nullable = false)
 |-- Now: timestamp (nullable = false)
 |-- New Date: date (nullable = false)
 |-- Prev Date: date (nullable = false)
 |-- String Date: string (nullable = false)



In [23]:
df = df.withColumn('Proper Date', to_date('String Date', 'dd-mm-yyyy'))
df.show()

+---+----------+--------------------+----------+----------+-----------+-----------+
| id|     Today|                 Now|  New Date| Prev Date|String Date|Proper Date|
+---+----------+--------------------+----------+----------+-----------+-----------+
|  0|2023-10-12|2023-10-12 07:03:...|2023-10-17|2023-10-07| 07-11-2011| 2011-01-07|
|  1|2023-10-12|2023-10-12 07:03:...|2023-10-17|2023-10-07| 07-11-2011| 2011-01-07|
|  2|2023-10-12|2023-10-12 07:03:...|2023-10-17|2023-10-07| 07-11-2011| 2011-01-07|
|  3|2023-10-12|2023-10-12 07:03:...|2023-10-17|2023-10-07| 07-11-2011| 2011-01-07|
|  4|2023-10-12|2023-10-12 07:03:...|2023-10-17|2023-10-07| 07-11-2011| 2011-01-07|
+---+----------+--------------------+----------+----------+-----------+-----------+



In [24]:
df = df.withColumn('Proper Date', date_format(to_date('String Date','dd-mm-yyyy'),'dd-mm-yyyy'))
df.show()

+---+----------+--------------------+----------+----------+-----------+-----------+
| id|     Today|                 Now|  New Date| Prev Date|String Date|Proper Date|
+---+----------+--------------------+----------+----------+-----------+-----------+
|  0|2023-10-12|2023-10-12 07:07:...|2023-10-17|2023-10-07| 07-11-2011| 07-00-2011|
|  1|2023-10-12|2023-10-12 07:07:...|2023-10-17|2023-10-07| 07-11-2011| 07-00-2011|
|  2|2023-10-12|2023-10-12 07:07:...|2023-10-17|2023-10-07| 07-11-2011| 07-00-2011|
|  3|2023-10-12|2023-10-12 07:07:...|2023-10-17|2023-10-07| 07-11-2011| 07-00-2011|
|  4|2023-10-12|2023-10-12 07:07:...|2023-10-17|2023-10-07| 07-11-2011| 07-00-2011|
+---+----------+--------------------+----------+----------+-----------+-----------+



In [25]:
df.select(quarter(to_date(lit('2011-11-07')))).show()

+----------------------------+
|quarter(to_date(2011-11-07))|
+----------------------------+
|                           4|
|                           4|
|                           4|
|                           4|
|                           4|
+----------------------------+



In [26]:
df.select(year(to_date(lit('2022-11-07')))).show()

+-------------------------+
|year(to_date(2022-11-07))|
+-------------------------+
|                     2022|
|                     2022|
|                     2022|
|                     2022|
|                     2022|
+-------------------------+



In [27]:
df.select(month(to_date(lit('2022-11-07')))).show()

+--------------------------+
|month(to_date(2022-11-07))|
+--------------------------+
|                        11|
|                        11|
|                        11|
|                        11|
|                        11|
+--------------------------+



In [None]:
data1 = [[1, 'BDA'], [2,'CS']]
myschema = "`ID` INT, `PROGRAM` STRING"
df1 = spark.createDataFrame(data = data1, schema= myschema)
df1.show()