In [23]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session via the builder's getOrCreate() call,
# labelled with the application name below.
spark = (
    SparkSession.builder
    .appName("Colab PySpark Setup")
    .getOrCreate()
)

# Bare trailing expression so the notebook displays the session object.
spark

In [24]:
# Mount Google Drive at /content/drive so files under it (such as the sales
# CSV path used by the load cell) can be read like local files.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
 #1. DataFrame Creation and Inspection
import pandas as pd

# Location of the sales CSV on the mounted Drive; adjust to your own path
file_path = '/content/drive/MyDrive/ColabData/sales_dataset.csv'

# Read the CSV into a pandas DataFrame
df = pd.read_csv(file_path)

# Preview both ends of the data: first 5 rows, then last 5 rows
for heading, preview in (
    ("First 5 records:", df.head()),
    ("\nLast 5 records:", df.tail()),
):
    print(heading)
    print(preview)

# Column names with the dtype pandas inferred for each
print("\nData Types:")
print(df.dtypes)


First 5 records:
   OrderID    CustomerName ProductCategory  Amount   OrderDate DeliveryStatus  \
0     2824   Donald Walker           Books  783.04  2024-12-26       Returned   
1     7912    Brandon Hall       Groceries  905.00  2024-09-12      Cancelled   
2     4611    Donald Booth         Fashion  657.96  2025-01-12       Returned   
3     3547  Phillip Garcia         Fashion  606.89  2024-03-24       Returned   
4     8527    Valerie Gray            Toys   77.87  2024-08-04      Delivered   

   Discount              City  PaymentMode CustomerSince  
0      0.15      Lake Joyside  Credit Card    2020-10-15  
1      0.03     New Jamesside       Wallet    2022-03-15  
2      0.01      Lake Roberto       Wallet    2021-08-07  
3      0.15  West Melanieview       Wallet    2020-08-08  
4      0.17         Mariastad         Cash    2022-11-15  

Last 5 records:
     OrderID     CustomerName ProductCategory  Amount   OrderDate  \
495     2930     Jaime Harris         Fashion  680.00  2

In [6]:
#2. Selection, Renaming, and Filtering
# Identifier columns shared by both previews below
id_cols = ['OrderID', 'CustomerName']

# Column subset: order id, customer name, and the raw Amount column
selected_df = df[id_cols + ['Amount']]
print(selected_df.head())

# From here on the amount column is called OrderAmount
df = df.rename(columns={'Amount': 'OrderAmount'})
print(df[id_cols + ['OrderAmount']].head())

# Boolean-mask filter: keep orders whose OrderAmount exceeds 500
is_high_value = df['OrderAmount'] > 500
high_value_orders = df[is_high_value]
print(high_value_orders)

# Same kind of row filter, expressed through .query() for a single city
city_orders = df.query("City == 'Lake Joyside'")
print(city_orders)


   OrderID    CustomerName  Amount
0     2824   Donald Walker  783.04
1     7912    Brandon Hall  905.00
2     4611    Donald Booth  657.96
3     3547  Phillip Garcia  606.89
4     8527    Valerie Gray   77.87
   OrderID    CustomerName  OrderAmount
0     2824   Donald Walker       783.04
1     7912    Brandon Hall       905.00
2     4611    Donald Booth       657.96
3     3547  Phillip Garcia       606.89
4     8527    Valerie Gray        77.87
     OrderID        CustomerName ProductCategory  OrderAmount   OrderDate  \
0       2824       Donald Walker           Books       783.04  2024-12-26   
1       7912        Brandon Hall       Groceries       905.00  2024-09-12   
2       4611        Donald Booth         Fashion       657.96  2025-01-12   
3       3547      Phillip Garcia         Fashion       606.89  2024-03-24   
9       6155  Jonathan Wilkerson         Fashion       882.68  2024-10-14   
..       ...                 ...             ...          ...         ...   
493     674

In [7]:
#3. Data Manipulation
# Drop the CustomerSince column. errors='ignore' keeps this cell re-runnable:
# df is overwritten here, so on a second run the column is already gone and a
# plain drop would raise KeyError.
df = df.drop(columns=['CustomerSince'], errors='ignore')

# Add FinalAmount = OrderAmount - (OrderAmount * Discount)
# (Discount is treated as a fraction of the order amount)
df['FinalAmount'] = df['OrderAmount'] - (df['OrderAmount'] * df['Discount'])

# Sort by FinalAmount descending and preview the largest orders
sorted_df = df.sort_values(by='FinalAmount', ascending=False)
print(sorted_df[['OrderID', 'CustomerName', 'FinalAmount']].head())

# Replace every "Cancelled" status with "Order Cancelled"
# (exact whole-value match, so re-running does not change the result)
df['DeliveryStatus'] = df['DeliveryStatus'].replace('Cancelled', 'Order Cancelled')
print(df['DeliveryStatus'].value_counts())


     OrderID    CustomerName  FinalAmount
43      5573  Jordan Frazier     961.4290
433     8474     Heidi Brown     949.5318
485     8889     Karen Garza     938.4020
33      2127    Jaclyn Moore     923.9868
234     9806   Samantha Gill     923.6481
DeliveryStatus
Order Cancelled    149
Delivered          119
Returned           117
Pending            115
Name: count, dtype: int64


In [8]:
#4. Aggregations and GroupBy
# Number of orders per delivery status
status_counts = df['DeliveryStatus'].value_counts()
print(status_counts)

# Mean order amount within each product category
category_mean = df.groupby('ProductCategory')['OrderAmount'].mean()
print(category_mean)

# Total sales per city, largest total first
city_totals = df.groupby('City')['OrderAmount'].sum()
print(city_totals.sort_values(ascending=False))


DeliveryStatus
Order Cancelled    149
Delivered          119
Returned           117
Pending            115
Name: count, dtype: int64
ProductCategory
Books          568.600377
Electronics    551.745000
Fashion        500.630824
Groceries      459.517864
Toys           534.283750
Name: OrderAmount, dtype: float64
City
Smithside         1842.13
Meganside         1607.89
West Jessica      1249.33
Armstrongfort     1226.18
East Robert       1194.50
                   ...   
Millerview          20.64
Grayside            14.09
West Dawnmouth      12.80
Reedmouth           12.37
Williamsmouth       10.78
Name: OrderAmount, Length: 489, dtype: float64


In [9]:
#5. Null Handling & Update
# Demonstration only: blank out City for index labels 0 through 5
# (.loc slices by label and includes both endpoints)
df.loc[0:5, 'City'] = None

# Substitute 'Unknown' wherever City is missing
city_present = df['City'].notna()
df['City'] = df['City'].where(city_present, 'Unknown')

# Alternative (not applied here): remove rows containing any null
# df = df.dropna()

# Flag orders above 800 as coming from high-value customers
is_high_value = df['OrderAmount'] > 800
df['HighValueCustomer'] = is_high_value.map({True: 'Yes', False: 'No'})
print(df[['OrderID', 'OrderAmount', 'HighValueCustomer']].head())


   OrderID  OrderAmount HighValueCustomer
0     2824       783.04                No
1     7912       905.00               Yes
2     4611       657.96                No
3     3547       606.89                No
4     8527        77.87                No


In [14]:
#6. Date & Time Functions
from datetime import datetime

# The "Data Manipulation" cell drops CustomerSince from df, so on a
# top-to-bottom run the column is missing here and indexing it raises KeyError.
# Restore it from the source CSV when absent. The assignment aligns on the row
# index, which is valid as long as df still carries the index it was loaded
# with (no visible cell removes or re-indexes rows).
if 'CustomerSince' not in df.columns:
    df['CustomerSince'] = pd.read_csv(file_path, usecols=['CustomerSince'])['CustomerSince']

# Convert OrderDate and CustomerSince to datetime
df['OrderDate'] = pd.to_datetime(df['OrderDate'])
df['CustomerSince'] = pd.to_datetime(df['CustomerSince'])

# Extract year and month from OrderDate
df['OrderYear'] = df['OrderDate'].dt.year
df['OrderMonth'] = df['OrderDate'].dt.month

# Calculate customer loyalty in whole years: elapsed days since CustomerSince,
# integer-divided by 365 (leap days ignored). The result depends on the date
# the cell is run.
df['LoyaltyYears'] = (pd.to_datetime("today") - df['CustomerSince']).dt.days // 365
print(df[['CustomerName', 'OrderYear', 'OrderMonth', 'LoyaltyYears']].head())


     CustomerName  OrderYear  OrderMonth  LoyaltyYears
0   Donald Walker       2024          12             4
1    Brandon Hall       2024           9             3
2    Donald Booth       2025           1             3
3  Phillip Garcia       2024           3             4
4    Valerie Gray       2024           8             2


In [20]:
#7. Joins and Unions
# Region labels handed out to cities in round-robin order
regions = ['North', 'South', 'East', 'West']

# Distinct non-null cities, in order of first appearance
unique_cities = df['City'].dropna().unique()
num_cities = len(unique_cities)

# One region per city: cycle through the labels until every city has one
region_list = [regions[i % len(regions)] for i in range(num_cities)]

# Lookup table mapping each city to its assigned region
city_region_df = pd.DataFrame({'City': unique_cities, 'Region': region_list})

# Inner join keeps only rows whose City appears in the lookup table;
# left join keeps every row of df and fills Region where a match exists
inner_join = df.merge(city_region_df, on='City', how='inner')
left_join = df.merge(city_region_df, on='City', how='left')

# Preview the inner-join result first, then the left-join result
for joined in (inner_join, left_join):
    print(joined[['City', 'Region']].head())


               City Region
0      Lake Joyside  North
1     New Jamesside  South
2      Lake Roberto   East
3  West Melanieview   West
4         Mariastad  North
               City Region
0      Lake Joyside  North
1     New Jamesside  South
2      Lake Roberto   East
3  West Melanieview   West
4         Mariastad  North


In [21]:
#8. Complex JSON Simulation
import json

# Serialize only the order fields. The two columns this cell creates are
# excluded so that re-running it does not nest the previous JSON output
# inside the new payload.
order_fields = df.drop(columns=['OrderJSON', 'ParsedJSON'], errors='ignore')

# Convert each order to a JSON string.
# default=str is needed because the date cell converts OrderDate/CustomerSince
# to pandas Timestamps, which json.dumps cannot encode natively; they are
# written as their string form instead of raising TypeError.
df['OrderJSON'] = order_fields.apply(
    lambda row: json.dumps(row.to_dict(), default=str), axis=1
)
print(df['OrderJSON'].head())

# Load JSON strings back into dictionaries
df['ParsedJSON'] = df['OrderJSON'].apply(json.loads)
print(df['ParsedJSON'].head())


0    {"OrderID": 2824, "CustomerName": "Donald Walk...
1    {"OrderID": 7912, "CustomerName": "Brandon Hal...
2    {"OrderID": 4611, "CustomerName": "Donald Boot...
3    {"OrderID": 3547, "CustomerName": "Phillip Gar...
4    {"OrderID": 8527, "CustomerName": "Valerie Gra...
Name: OrderJSON, dtype: object
0    {'OrderID': 2824, 'CustomerName': 'Donald Walk...
1    {'OrderID': 7912, 'CustomerName': 'Brandon Hal...
2    {'OrderID': 4611, 'CustomerName': 'Donald Boot...
3    {'OrderID': 3547, 'CustomerName': 'Phillip Gar...
4    {'OrderID': 8527, 'CustomerName': 'Valerie Gra...
Name: ParsedJSON, dtype: object


In [27]:
# Classify an order amount into a coarse size bucket
def tag_order(amount):
    """Return the size label for ``amount``.

    "Big" when the amount is strictly above 800, "Medium" when it is strictly
    above 300, and "Small" for everything else (including values for which
    neither comparison is true).
    """
    # Thresholds are checked from largest to smallest; the first one the
    # amount strictly exceeds decides the label.
    for floor, label in ((800, "Big"), (300, "Medium")):
        if amount > floor:
            return label
    return "Small"

# Label every order with its size bucket.
# The amount column is named 'OrderAmount' at this point: the
# "Selection, Renaming, and Filtering" cell renames 'Amount', so referring to
# 'Amount' here raises KeyError on a top-to-bottom run.
df['OrderSize'] = df['OrderAmount'].apply(tag_order)
print(df[['OrderID', 'OrderAmount', 'OrderSize']].head())


   OrderID  Amount OrderSize
0     2824  783.04    Medium
1     7912  905.00       Big
2     4611  657.96    Medium
3     3547  606.89    Medium
4     8527   77.87     Small
