In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DecimalType
import random
import datetime
import decimal

# Start (or reuse) the Spark session for the invoice example
spark = (
    SparkSession.builder
    .appName("Invoice DataFrame")
    .getOrCreate()
)

# Invoice columns as (name, Spark type) pairs, in output order
invoice_columns = [
    ("InvoiceNo", StringType()),
    ("StockCode", StringType()),
    ("Quantity", IntegerType()),
    ("InvoiceDate", DateType()),
    ("UnitPrice", DecimalType(10, 2)),
    ("CustomerId", IntegerType()),
    ("Country", StringType()),
]

# Build the schema; every column is declared non-nullable (nullable=False)
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in invoice_columns
])

# Helper function to pick a random calendar date inside a range
def random_date(start, end):
    """Return ``start`` shifted forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from 0 up to and including
    the number of days between ``start`` and ``end``, so either endpoint
    can be returned.
    """
    span_days = (end - start).days
    offset_days = random.randint(0, span_days)
    return start + datetime.timedelta(days=offset_days)

# Generate data
random.seed(42)  # fixed seed so every run produces the same rows
start_date = datetime.date(2023, 1, 1)
end_date = datetime.date(2023, 12, 31)

countries = ["USA", "UK", "Germany", "France", "Canada"]
stock_codes = ["A001", "A002", "A003", "A004", "A005"]
customers = [1001, 1002, 1003, 1004, 1005]

NUM_INVOICES = 10
# Safety cap on the total number of generated rows. With 10 invoices of
# 1-5 lines each, at most 50 rows are produced, so this cap is never hit
# unless NUM_INVOICES is raised.
MAX_ROWS = 500


def generate_invoice_rows(num_invoices, max_rows):
    """Build random invoice line rows matching the column order of ``schema``.

    Each invoice ``INV0001`` .. ``INV<num_invoices>`` gets between 1 and 5
    line rows. Generation stops as soon as ``max_rows`` rows have been
    collected.

    Args:
        num_invoices: Number of distinct invoice numbers to generate.
        max_rows: Upper bound on the number of rows returned (checked
            after each row is appended).

    Returns:
        A list of tuples ``(invoice_no, stock_code, quantity, invoice_date,
        unit_price, customer_id, country)``.
    """
    rows = []
    for i in range(1, num_invoices + 1):
        invoice_no = f'INV{i:04d}'
        num_purchases = random.randint(1, 5)
        for _ in range(num_purchases):
            # NOTE(review): date, customer and country are drawn per line, so
            # rows sharing one InvoiceNo can differ in these fields - confirm
            # this is intended for the exercise data.
            stock_code = random.choice(stock_codes)
            quantity = random.randint(1, 20)
            invoice_date = random_date(start_date, end_date)
            # Go through str() so the Decimal carries the rounded 2-digit value
            # rather than the binary float's full expansion.
            unit_price = decimal.Decimal(str(round(random.uniform(1.0, 100.0), 2)))
            customer_id = random.choice(customers)
            country = random.choice(countries)
            rows.append((invoice_no, stock_code, quantity, invoice_date, unit_price, customer_id, country))
            if len(rows) >= max_rows:
                return rows
    return rows


data = generate_invoice_rows(NUM_INVOICES, MAX_ROWS)

# Destination directory for the CSV output.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_path = r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\week8\walmart"

try:
    # Create the DataFrame
    df = spark.createDataFrame(data, schema)

    # Show the DataFrame
    df.show(truncate=False)

    # Write the DataFrame to a CSV file
    df.write.format("csv") \
        .mode("overwrite") \
        .option("header", "true") \
        .save(output_path)
finally:
    # Stop the Spark session even when creating, showing or writing fails
    spark.stop()


+---------+---------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Quantity|InvoiceDate|UnitPrice|CustomerId|Country|
+---------+---------+--------+-----------+---------+----------+-------+
|INV0001  |A001     |9       |2023-05-06 |23.10    |1001      |Canada |
|INV0002  |A005     |14      |2023-01-17 |3.95     |1002      |UK     |
|INV0003  |A005     |1       |2023-10-15 |20.68    |1005      |France |
|INV0003  |A002     |15      |2023-10-29 |28.54    |1001      |UK     |
|INV0003  |A004     |11      |2023-05-23 |16.39    |1003      |USA    |
|INV0003  |A001     |13      |2023-02-19 |36.54    |1003      |Canada |
|INV0003  |A003     |2       |2023-08-24 |54.09    |1004      |USA    |
|INV0004  |A003     |20      |2023-07-05 |58.16    |1001      |USA    |
|INV0004  |A002     |10      |2023-02-10 |85.68    |1001      |France |
|INV0004  |A003     |15      |2023-11-22 |83.58    |1002      |Germany|
|INV0004  |A003     |7       |2023-12-10 |27.43    |1001      |C

In [2]:
from pyspark.sql import SparkSession
import random

# Initialize Spark session
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

# Fixed seed so the generated rows (and the written CSV) are the same on
# every run, matching the seeded invoice cell above.
random.seed(42)

# Number of rows
num_rows = 200

# Sample data generation
countries = ["Spain", "Germany", "France", "Italy", "Portugal", "Netherlands"]
data = []

for _ in range(num_rows):
    country = random.choice(countries)
    week_number = random.randint(1, 52)
    number_of_invoices = random.randint(1, 20)
    total_quantity = random.randint(1, 2000)
    # Invoice value is the quantity times a random per-unit factor in [0.5, 5]
    invoice_values = round(total_quantity * random.uniform(0.5, 5), 2)

    data.append((country, week_number, number_of_invoices, total_quantity, invoice_values))

columns = ["country", "week_number", "number_of_invoices", "total_quantity", "invoice_values"]

# Destination directory for the CSV output.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_path = r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\week8\window"

try:
    # Create DataFrame (column types are inferred from the Python values)
    df = spark.createDataFrame(data, columns)

    # Write the DataFrame to a CSV file
    df.write.format("csv") \
        .mode("overwrite") \
        .option("header", "true") \
        .save(output_path)

    # Show DataFrame
    df.show()
finally:
    # Stop the Spark session even when creating, writing or showing fails
    spark.stop()


+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|   Portugal|         22|                19|          1525|       4975.32|
|    Germany|          2|                14|          1680|       6040.99|
|Netherlands|         49|                 5|            28|        103.78|
|      Spain|          4|                16|           730|       1909.05|
|Netherlands|         29|                 4|          1723|       3435.96|
|    Germany|         25|                 5|          1169|       5154.49|
|     France|         47|                 9|          1644|       3987.62|
|      Spain|         49|                11|           657|       2881.72|
|     France|         18|                 3|          1882|       9171.42|
|    Germany|         42|                18|          1719|       1383.78|
|     France|         45|