INSTALLING AND IMPORTING THE REQUIRED LIBRARIES

In [1]:
pip install fastavro faker

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import random
import fastavro
from fastavro.schema import load_schema
from faker import Faker
from datetime import datetime, timedelta
from decimal import Decimal

CODE FOR DATA GENERATION

In [3]:
# Shared Faker instance; the generator functions in this cell call it for UUIDs and coordinates.
fake = Faker() # we initialize faker for data generation

# Unified Avro schema for ride-hailing events. Every event carries an
# `event_type` plus two nullable payloads (`ride_request`, `ride_status`).
# The Location record is defined exactly once (inside RideRequest) and is
# referenced by name everywhere else.

# Geographic point: latitude / longitude stored as floats.
_location_record = {
    "type": "record",
    "name": "Location",
    "fields": [
        {"name": "latitude", "type": "float"},
        {"name": "longitude", "type": "float"},
    ],
}

# Passenger ride request payload.
_ride_request_record = {
    "type": "record",
    "name": "RideRequest",
    "fields": [
        # Unique ID of the passenger making the request.
        {"name": "passenger_id", "type": "string"},
        # Pick-up point; this is the single place where Location is defined.
        {"name": "pickup_location", "type": _location_record},
        # Drop-off point; refers to the Location record by name.
        {"name": "dropoff_location", "type": "Location"},
        # When the request was made.
        {"name": "timestamp", "type": "string"},
        # A request is either Requested or Canceled.
        {"name": "status", "type": {"type": "enum", "name": "Status", "symbols": ["Requested", "Canceled"]}},
        # Predicted ride time in minutes.
        {"name": "estimated_duration", "type": "int"},
        # Estimated price of the service in dollars.
        {"name": "estimated_price", "type": "float"},
    ],
}

# Ride status update payload.
_ride_status_record = {
    "type": "record",
    "name": "RideStatus",
    "fields": [
        # Unique identifiers of the ride, the driver and the passenger.
        {"name": "ride_id", "type": "string"},
        {"name": "driver_id", "type": "string"},
        {"name": "passenger_id", "type": "string"},
        # Both locations refer to the Location record by name.
        {"name": "pickup_location", "type": "Location"},
        {"name": "dropoff_location", "type": "Location"},
        # When the status update occurred.
        {"name": "timestamp", "type": "string"},
        # A ride status is Accepted, Ongoing or Completed.
        {"name": "status", "type": {"type": "enum", "name": "RideStatusEnum", "symbols": ["Accepted", "Ongoing", "Completed"]}},
        # Actual time taken in minutes.
        {"name": "actual_duration", "type": "int"},
        # Final price charged for the service.
        {"name": "final_price", "type": "float"},
    ],
}

full_schema = {
    "type": "record",
    "name": "RideHailingEvent",
    "namespace": "com.ridehailing",
    "fields": [
        {
            "name": "event_type",
            "type": {"type": "enum", "name": "EventType", "symbols": ["RideRequest", "RideStatus"]},
        },
        # Each payload is a union with "null".
        {"name": "ride_request", "type": ["null", _ride_request_record]},
        {"name": "ride_status", "type": ["null", _ride_status_record]},
    ],
}

class DecimalEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``decimal.Decimal`` values as floats.

    Any other type the standard encoder cannot handle is passed on to
    ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        # Anything that is not a Decimal keeps the stock behaviour.
        if not isinstance(obj, Decimal):
            return super().default(obj)
        return float(obj)

def generateRideRequest():
    """Return one randomly generated passenger ride-request event.

    The result is a dict with ``event_type`` set to ``"RideRequest"``, the
    request payload under ``ride_request`` and ``ride_status`` set to ``None``.
    The timestamp is ``datetime.now()`` rendered as an ISO-8601 string.
    """
    passenger = fake.uuid4()
    pickup = {"latitude": float(fake.latitude()), "longitude": float(fake.longitude())}
    dropoff = {"latitude": float(fake.latitude()), "longitude": float(fake.longitude())}
    requested_at = datetime.now().isoformat()
    request_state = random.choice(["Requested", "Canceled"])
    minutes = random.randint(5, 60)  # estimated duration, 5 to 60 inclusive
    price = float(round(random.uniform(5.0, 50.0), 2))  # rounded to 2 decimals

    payload = {
        "passenger_id": passenger,
        "pickup_location": pickup,
        "dropoff_location": dropoff,
        "timestamp": requested_at,
        "status": request_state,
        "estimated_duration": minutes,
        "estimated_price": price,
    }
    return {"event_type": "RideRequest", "ride_request": payload, "ride_status": None}


def generateRideStatus():
    """Return one randomly generated ride status-update event.

    The result is a dict with ``event_type`` set to ``"RideStatus"``,
    ``ride_request`` set to ``None`` and the status payload under
    ``ride_status``. The timestamp is ``datetime.now()`` rendered as an
    ISO-8601 string.
    """
    ride = fake.uuid4()
    driver = fake.uuid4()
    passenger = fake.uuid4()
    pickup = {"latitude": float(fake.latitude()), "longitude": float(fake.longitude())}
    dropoff = {"latitude": float(fake.latitude()), "longitude": float(fake.longitude())}
    updated_at = datetime.now().isoformat()
    ride_state = random.choice(["Accepted", "Ongoing", "Completed"])
    minutes = random.randint(5, 60)  # actual duration, 5 to 60 inclusive
    price = float(round(random.uniform(5.0, 50.0), 2))  # rounded to 2 decimals

    payload = {
        "ride_id": ride,
        "driver_id": driver,
        "passenger_id": passenger,
        "pickup_location": pickup,
        "dropoff_location": dropoff,
        "timestamp": updated_at,
        "status": ride_state,
        "actual_duration": minutes,
        "final_price": price,
    }
    return {"event_type": "RideStatus", "ride_request": None, "ride_status": payload}

# Generate five ride requests and five ride status updates, then combine
# them into one list with the requests first.
rideRequests = list(generateRideRequest() for _ in range(5))
rideSatuses = list(generateRideStatus() for _ in range(5))
all_events = [*rideRequests, *rideSatuses]

# Save the combined events as indented JSON; DecimalEncoder converts any
# Decimal value to float during serialisation.
with open("ride_events.json", "w") as f:
    f.write(json.dumps(all_events, indent=4, cls=DecimalEncoder))

def save_avro(data, schema, filename):
    """Write ``data`` to ``filename`` in Avro format.

    Args:
        data: the records to write, passed to ``fastavro.writer`` as-is.
        schema: the Avro schema handed to ``fastavro.writer``.
        filename: path of the output file; it is opened in binary write mode.
    """
    with open(filename, "wb") as avro_sink:
        fastavro.writer(avro_sink, schema, data)

# Write the combined events to an Avro file using the unified schema.
save_avro(data=all_events, schema=full_schema, filename="ride_events.avro")

# Confirmation message once both output files have been written.
print(" We have successfully generated ride request and ride status data in JSON and AVRO formats.")

 We have successfully generated ride request and ride status data in JSON and AVRO formats.


In [45]:
# Set the Git author name and e-mail; --global writes them to the user-wide Git configuration.
!git config --global user.name "VCAM101"
!git config --global user.email "varino.ieu2021@student.ie.edu"

In [None]:
# Confirm which directory the shell commands in this notebook run in.
!pwd  # To show current directory

/Users/valeriaarinomontero/Desktop/2nd Semester (4th Year)/STREAM ANALYTICS/GROUP PROJECTS/MILESTONE 1


In [None]:
# WARNING: destructive — deletes the existing .git directory (all local history) before re-initialising.
!rm -rf .git  # Remove all previous Git history 
!git init  # Initializes a new Git repository

Initialized empty Git repository in /Users/valeriaarinomontero/Desktop/2nd Semester (4th Year)/STREAM ANALYTICS/GROUP PROJECTS/MILESTONE 1/.git/


In [48]:
# Stage everything in the working directory and record it as the first commit.
# NOTE(review): `git add .` also stages this notebook and the generated data files (see the commit summary in the output).
!git add .  # Stages all new and modified files
!git commit -m "Initial commit - Uploading project to STREAM-ANALYTICS-GROUP-PROJECT"

[main (root-commit) a65d415] Initial commit - Uploading project to STREAM-ANALYTICS-GROUP-PROJECT
 5 files changed, 596 insertions(+)
 create mode 100644 MILESTONE 1 STREAM ANALYTICS.pptx
 create mode 100644 MILESTONE1.ipynb
 create mode 100644 Stream Analytics Group Presentation Milestone 1.pdf
 create mode 100644 ride_events.avro
 create mode 100644 ride_events.json


In [52]:
# History rewrite: remove MILESTONE1.ipynb from the index of every commit on all refs.
# --ignore-unmatch lets the filter pass on commits that do not contain the file;
# --prune-empty drops commits that become empty after the removal.
!git filter-branch --force --index-filter \
'git rm --cached --ignore-unmatch MILESTONE1.ipynb' \
--prune-empty --tag-name-filter cat -- --all

	 rewrites.  Hit Ctrl-C before proceeding to abort, then use an
	 alternative filtering tool such as 'git filter-repo'
	 (https://github.com/newren/git-filter-repo/) instead.  See the
Proceeding with filter-branch...

Rewrite de90fd235c015380482b52f89373da81511a277b (1/1) (0 seconds passed, remaining 0 predicted)    


In [53]:
# Attempt to stage the notebook again and commit.
# NOTE(review): the recorded output shows this did nothing — the pathspec did not match and there was nothing to commit.
!git add MILESTONE1.ipynb
!git commit -m "Removed secret from history"

fatal: pathspec 'MILESTONE1.ipynb' did not match any files
On branch main
nothing to commit, working tree clean


In [54]:
# Force-push main to the remote named origin.
# NOTE(review): in the recorded run this failed because no remote called origin was configured yet.
!git push origin main --force

fatal: 'origin' does not appear to be a git repository
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


In [58]:
# Register the GitHub repository as the remote named origin.
!git remote add origin https://github.com/VCAM101/STREAM-ANALYTICS-GROUP-PROJECT.git

In [59]:
# List the configured remotes with their fetch and push URLs.
!git remote -v

origin	https://github.com/VCAM101/STREAM-ANALYTICS-GROUP-PROJECT.git (fetch)
origin	https://github.com/VCAM101/STREAM-ANALYTICS-GROUP-PROJECT.git (push)


In [60]:
# SECURITY: never embed a personal access token in the remote URL. Git stores
# the URL in plain text in .git/config, and the token is also saved inside this
# notebook. The token that was previously hard-coded on this line must be
# treated as compromised and revoked on GitHub.
# This command resets origin to the credential-free URL, which also removes any
# token still stored in .git/config. Authenticate through a Git credential
# helper (or an SSH remote) instead, so that no secret appears in the notebook.
!git remote set-url origin https://github.com/VCAM101/STREAM-ANALYTICS-GROUP-PROJECT.git

In [61]:
# Force-push main to origin; --force overwrites whatever the remote branch currently holds.
!git push origin main --force

Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 8 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 4.34 MiB | 963.00 KiB/s, done.
Total 6 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/VCAM101/STREAM-ANALYTICS-GROUP-PROJECT.git
 * [new branch]      main -> main
