In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Get the active SparkSession, or start a new one registered as "sample_data"
spark = (
    SparkSession.builder
    .appName("sample_data")
    .getOrCreate()
)

# Explicit schema for the sample rows: five columns, every one nullable.
# Each (column name, Spark type) pair below becomes one StructField.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("location", StringType()),
        ("salary", DoubleType()),
    )
])

# Sample rows, one tuple per record in schema order:
# (id, name, age, location, salary).
# Some ids repeat: ids 1 and 15 each appear twice as exact duplicate rows,
# and id 2 appears twice with a different age and salary.
data = [
    (1, "John Smith", 28, "New York", 75000.0),
    (2, "Emma Wilson", 34, "Los Angeles", 85000.0),
    (1, "John Smith", 28, "New York", 75000.0),  # Exact duplicate of the id=1 row above
    (3, "Michael Brown", 45, "Chicago", 92000.0),
    (4, "Sarah Davis", 31, "Boston", 78000.0),
    (5, "Robert Johnson", 39, "Seattle", 88000.0),
    (6, "Lisa Anderson", 29, "San Francisco", 95000.0),
    (7, "James Wilson", 42, "Miami", 82000.0),
    (8, "Jennifer Lee", 36, "Houston", 79000.0),
    (2, "Emma Wilson", 35, "Los Angeles", 87000.0),  # Same id as the earlier id=2 row, but different age and salary
    (9, "William Taylor", 41, "Phoenix", 81000.0),
    (10, "Emily White", 33, "Denver", 76000.0),
    (11, "David Miller", 38, "Atlanta", 84000.0),
    (12, "Jessica Brown", 30, "Dallas", 77000.0),
    (13, "Daniel Martinez", 44, "Austin", 89000.0),
    (14, "Michelle Garcia", 32, "Portland", 78000.0),
    (15, "Christopher Lee", 37, "San Diego", 86000.0),
    (16, "Amanda Wright", 31, "Philadelphia", 75000.0),
    (17, "Kevin Thompson", 40, "Las Vegas", 83000.0),
    (18, "Melissa Davis", 35, "Detroit", 79000.0),
    (19, "Ryan Wilson", 33, "Minneapolis", 77000.0),
    (20, "Laura Martinez", 36, "Sacramento", 82000.0),
    (21, "Steven Johnson", 42, "Nashville", 88000.0),
    (22, "Nicole Brown", 29, "Salt Lake City", 76000.0),
    (23, "Thomas Anderson", 38, "Orlando", 84000.0),
    (24, "Rachel Taylor", 34, "Baltimore", 81000.0),
    (25, "Joseph White", 41, "Charlotte", 87000.0),
    (15, "Christopher Lee", 37, "San Diego", 86000.0),  # Exact duplicate of the id=15 row above
    (26, "Rebecca Smith", 30, "Cleveland", 75000.0),
    (27, "Patrick Davis", 43, "Pittsburgh", 89000.0),
    (28, "Sandra Miller", 35, "Cincinnati", 80000.0),
    (29, "George Wilson", 39, "Kansas City", 83000.0),
    (30, "Angela Thompson", 32, "St. Louis", 78000.0),
    (31, "Kenneth Brown", 44, "Milwaukee", 86000.0),
    (32, "Catherine Lee", 31, "Columbus", 77000.0),
    (33, "Edward Martinez", 37, "Indianapolis", 82000.0),
    (34, "Diana Wright", 33, "San Antonio", 79000.0),
    (35, "Ronald Davis", 40, "Providence", 85000.0),
    (36, "Christine Wilson", 36, "Jacksonville", 81000.0),
    (37, "Timothy White", 34, "Richmond", 78000.0),
    (38, "Deborah Johnson", 42, "Memphis", 87000.0),
    (39, "Jeffrey Brown", 31, "Louisville", 76000.0),
    (40, "Julie Anderson", 38, "Buffalo", 83000.0),
    (41, "Scott Taylor", 35, "Hartford", 80000.0),
    (42, "Karen Martinez", 43, "Oklahoma City", 88000.0),
    (43, "Dennis Wilson", 30, "Birmingham", 75000.0),
    (44, "Sharon Davis", 37, "Rochester", 82000.0),
    (45, "Gregory Thompson", 39, "Raleigh", 84000.0),
    (46, "Amy Johnson", 32, "Tulsa", 77000.0),
    (47, "Peter Brown", 41, "Grand Rapids", 85000.0),
    (48, "Carol Anderson", 34, "Omaha", 79000.0)
]

# Build the DataFrame from the in-memory rows, applying the explicit schema
df = spark.createDataFrame(data=data, schema=schema)

# Print the first 20 rows (the default row count for show())
df.show(n=20)

+---+---------------+---+-------------+-------+
| id|           name|age|     location| salary|
+---+---------------+---+-------------+-------+
|  1|     John Smith| 28|     New York|75000.0|
|  2|    Emma Wilson| 34|  Los Angeles|85000.0|
|  1|     John Smith| 28|     New York|75000.0|
|  3|  Michael Brown| 45|      Chicago|92000.0|
|  4|    Sarah Davis| 31|       Boston|78000.0|
|  5| Robert Johnson| 39|      Seattle|88000.0|
|  6|  Lisa Anderson| 29|San Francisco|95000.0|
|  7|   James Wilson| 42|        Miami|82000.0|
|  8|   Jennifer Lee| 36|      Houston|79000.0|
|  2|    Emma Wilson| 35|  Los Angeles|87000.0|
|  9| William Taylor| 41|      Phoenix|81000.0|
| 10|    Emily White| 33|       Denver|76000.0|
| 11|   David Miller| 38|      Atlanta|84000.0|
| 12|  Jessica Brown| 30|       Dallas|77000.0|
| 13|Daniel Martinez| 44|       Austin|89000.0|
| 14|Michelle Garcia| 32|     Portland|78000.0|
| 15|Christopher Lee| 37|    San Diego|86000.0|
| 16|  Amanda Wright| 31| Philadelphia|7

In [0]:
## Getting data in ascending order of id (ascending is the default direction)
df.sort(df.id).show()

## OR, equivalently, with orderBy (an alias of sort):
# df.orderBy(df.id).show()

+---+---------------+---+-------------+-------+
| id|           name|age|     location| salary|
+---+---------------+---+-------------+-------+
|  1|     John Smith| 28|     New York|75000.0|
|  1|     John Smith| 28|     New York|75000.0|
|  2|    Emma Wilson| 35|  Los Angeles|87000.0|
|  2|    Emma Wilson| 34|  Los Angeles|85000.0|
|  3|  Michael Brown| 45|      Chicago|92000.0|
|  4|    Sarah Davis| 31|       Boston|78000.0|
|  5| Robert Johnson| 39|      Seattle|88000.0|
|  6|  Lisa Anderson| 29|San Francisco|95000.0|
|  7|   James Wilson| 42|        Miami|82000.0|
|  8|   Jennifer Lee| 36|      Houston|79000.0|
|  9| William Taylor| 41|      Phoenix|81000.0|
| 10|    Emily White| 33|       Denver|76000.0|
| 11|   David Miller| 38|      Atlanta|84000.0|
| 12|  Jessica Brown| 30|       Dallas|77000.0|
| 13|Daniel Martinez| 44|       Austin|89000.0|
| 14|Michelle Garcia| 32|     Portland|78000.0|
| 15|Christopher Lee| 37|    San Diego|86000.0|
| 15|Christopher Lee| 37|    San Diego|8

In [0]:
## Getting data in descending order of id
df.sort(df["id"].desc()).show()

## OR, equivalently, with orderBy:
# df.orderBy(df["id"].desc()).show()

+---+----------------+---+-------------+-------+
| id|            name|age|     location| salary|
+---+----------------+---+-------------+-------+
| 48|  Carol Anderson| 34|        Omaha|79000.0|
| 47|     Peter Brown| 41| Grand Rapids|85000.0|
| 46|     Amy Johnson| 32|        Tulsa|77000.0|
| 45|Gregory Thompson| 39|      Raleigh|84000.0|
| 44|    Sharon Davis| 37|    Rochester|82000.0|
| 43|   Dennis Wilson| 30|   Birmingham|75000.0|
| 42|  Karen Martinez| 43|Oklahoma City|88000.0|
| 41|    Scott Taylor| 35|     Hartford|80000.0|
| 40|  Julie Anderson| 38|      Buffalo|83000.0|
| 39|   Jeffrey Brown| 31|   Louisville|76000.0|
| 38| Deborah Johnson| 42|      Memphis|87000.0|
| 37|   Timothy White| 34|     Richmond|78000.0|
| 36|Christine Wilson| 36| Jacksonville|81000.0|
| 35|    Ronald Davis| 40|   Providence|85000.0|
| 34|    Diana Wright| 33|  San Antonio|79000.0|
| 33| Edward Martinez| 37| Indianapolis|82000.0|
| 32|   Catherine Lee| 31|     Columbus|77000.0|
| 31|   Kenneth Brow

Sorting data on multiple columns

In [0]:
# Primary key: id, descending. Secondary key: salary, which uses the default
# (ascending) direction and only decides the order among rows sharing an id.
df.sort(
    df["id"].desc(),
    df["salary"],
).show()

+---+----------------+---+-------------+-------+
| id|            name|age|     location| salary|
+---+----------------+---+-------------+-------+
| 48|  Carol Anderson| 34|        Omaha|79000.0|
| 47|     Peter Brown| 41| Grand Rapids|85000.0|
| 46|     Amy Johnson| 32|        Tulsa|77000.0|
| 45|Gregory Thompson| 39|      Raleigh|84000.0|
| 44|    Sharon Davis| 37|    Rochester|82000.0|
| 43|   Dennis Wilson| 30|   Birmingham|75000.0|
| 42|  Karen Martinez| 43|Oklahoma City|88000.0|
| 41|    Scott Taylor| 35|     Hartford|80000.0|
| 40|  Julie Anderson| 38|      Buffalo|83000.0|
| 39|   Jeffrey Brown| 31|   Louisville|76000.0|
| 38| Deborah Johnson| 42|      Memphis|87000.0|
| 37|   Timothy White| 34|     Richmond|78000.0|
| 36|Christine Wilson| 36| Jacksonville|81000.0|
| 35|    Ronald Davis| 40|   Providence|85000.0|
| 34|    Diana Wright| 33|  San Antonio|79000.0|
| 33| Edward Martinez| 37| Indianapolis|82000.0|
| 32|   Catherine Lee| 31|     Columbus|77000.0|
| 31|   Kenneth Brow