In [0]:
import boto3
import pandas as pd
from io import StringIO
import time
import requests
import os

In [0]:

# Source: a public Google Drive file, fetched through the direct-download endpoint.
file_id = "1abe9EkM_uf2F2hjEkbhMBG9Mf2dFE4Wo"
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

# Destination under the /dbfs mount so Spark can later read it from /FileStore/tables/.
dbfs_path = "/dbfs/FileStore/tables/"
file_path = os.path.join(dbfs_path, "transactions.csv")

# Ensure the directory exists
os.makedirs(dbfs_path, exist_ok=True)

# Download the file and write it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being saved as the CSV.
# NOTE(review): Drive may answer 200 with an HTML interstitial for some files;
# confirm the saved file really is CSV if the read below looks wrong.
response = requests.get(download_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

print(f"File downloaded to {file_path}")

In [0]:
# Read the downloaded CSV, taking column names from its first row, and preview it.
df_transactions = spark.read.option("header", True).csv('/FileStore/tables/transactions.csv')
df_transactions.show(n=5)

In [0]:
from pyspark.sql.functions import col, regexp_replace

# Columns whose single-quote characters are stripped and which are then cast.
_numeric_casts = [
    ("step", "int"),
    ("amount", "double"),
    ("fraud", "int"),
    ("age", "int"),
]
# Columns that only have their single-quote characters stripped.
_text_columns = [
    "customer",
    "gender",
    "zipcodeOri",
    "merchant",
    "zipMerchant",
    "category",
]

# Apply the replacements in the same order as listed above; withColumn on an
# existing name replaces that column in place.
for _name, _dtype in _numeric_casts:
    df_transactions = df_transactions.withColumn(
        _name, regexp_replace(col(_name), "'", "").cast(_dtype)
    )
for _name in _text_columns:
    df_transactions = df_transactions.withColumn(
        _name, regexp_replace(col(_name), "'", "")
    )

df_transactions.show(5)

In [0]:
# Print the column names and data types of df_transactions to verify the casts above.
df_transactions.printSchema()

In [0]:
import os

# AWS credentials must be supplied by the runtime environment (for example
# cluster environment variables backed by a secret store) rather than written
# into the notebook, where they would be saved and shared with the file.
# Fail fast with a clear message if either variable is absent or empty.
_missing_aws_vars = [
    _var
    for _var in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY')
    if not os.environ.get(_var)
]
if _missing_aws_vars:
    raise EnvironmentError(
        "Missing AWS credentials in environment: " + ", ".join(_missing_aws_vars)
    )

In [0]:
import boto3

# Build a boto3 session for ap-south-1 from the credentials in the environment.
_session_kwargs = {
    "region_name": 'ap-south-1',
    "aws_access_key_id": os.environ.get('AWS_ACCESS_KEY_ID'),
    "aws_secret_access_key": os.environ.get('AWS_SECRET_ACCESS_KEY'),
}
session = boto3.Session(**_session_kwargs)
print('successfully aws session created')

In [0]:
import logging
import boto3
from botocore.exceptions import ClientError


def create_bucket(bucket_name, region="ap-south-1"):
    """Create an S3 bucket in a specified region.

    When ``region`` is omitted the bucket is created in ``ap-south-1``.
    Passing ``region=None`` uses a client with no explicit region and sends
    no location constraint. ``us-east-1`` is also sent without a
    ``LocationConstraint``, because S3 rejects that value as a constraint.

    :param bucket_name: Bucket to create
    :param region: String region to create bucket in, e.g., 'us-west-2',
        or None to use the client's default region
    :return: True if bucket created, else False (the ClientError is logged)
    """
    client_kwargs = {} if region is None else {'region_name': region}

    create_kwargs = {'Bucket': bucket_name}
    if region not in (None, 'us-east-1'):
        # Every region other than us-east-1 must be named explicitly.
        create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region}

    # Create bucket
    try:
        s3_client = boto3.client('s3', **client_kwargs)
        s3_client.create_bucket(**create_kwargs)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [0]:
import boto3
s3 = boto3.client('s3')
response = s3.list_buckets()

# Names of the buckets returned by list_buckets for the current credentials.
existing_buckets = [bucket["Name"] for bucket in response['Buckets']]

if "pattern-detection-pyspark" not in existing_buckets:
    # create_bucket logs the error and returns False on failure; stop here so
    # the problem surfaces now instead of as a missing-bucket error further down.
    if not create_bucket("pattern-detection-pyspark"):
        raise RuntimeError(
            "Could not create S3 bucket 'pattern-detection-pyspark'; see the logged error"
        )


In [0]:
import boto3

# Target bucket and the key that stands in for the "folder".
bucket_name = 'pattern-detection-pyspark'
folder_name = 'streaming/input/'

# S3 client used for the upload.
s3 = boto3.client('s3')

# Writing an object with no body under the trailing-slash key creates the prefix.
response = s3.put_object(Key=folder_name, Bucket=bucket_name)

# Show what S3 returned.
print(response)

In [0]:
# Destination bucket and prefix used by the chunk writer.
bucket_name = "pattern-detection-pyspark"
s3_folder = "streaming/input/"

# Credentials are read from the process environment.
aws_access_key = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

# Push the s3a settings into the Hadoop configuration of the Spark context.
_hadoop_conf = spark._jsc.hadoopConfiguration()
for _conf_key, _conf_value in (
    ("fs.s3a.access.key", aws_access_key),
    ("fs.s3a.secret.key", aws_secret_key),
    ("fs.s3a.endpoint", "s3.amazonaws.com"),
):
    _hadoop_conf.set(_conf_key, _conf_value)

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Number the rows 1..N using a window ordered by a generated increasing id.
window_spec = Window.orderBy(monotonically_increasing_id())
_row_id_column = row_number().over(window_spec)
df_transactions_id = df_transactions.withColumn("row_id", _row_id_column)

In [0]:
# Keep at most this many rows, then display the resulting row count.
_row_limit = 1000
df_transactions_id = df_transactions_id.limit(_row_limit)
df_transactions_id.count()

In [0]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window
import time


# Imported here so this cell does not depend on an earlier cell's import.
from pyspark.sql.functions import monotonically_increasing_id

window_spec = Window.orderBy(monotonically_increasing_id())
# Persist the numbered frame: it is used once for the count and once per chunk.
# Without this, every action re-runs the read and the global window, and the
# non-deterministic ordering key is free to number rows differently each time.
df_with_id = df_transactions_id.withColumn(
    "row_id", row_number().over(window_spec)
).persist()


chunk_size = 200
total_rows = df_with_id.count()
# Ceiling division so a final partial chunk is still written.
total_chunks = (total_rows + chunk_size - 1) // chunk_size

try:
    for chunk_id in range(total_chunks):
        # row_id starts at 1; each chunk covers the half-open range [lower, upper).
        lower = chunk_id * chunk_size + 1
        upper = lower + chunk_size

        df_chunk = (
            df_with_id.filter((df_with_id.row_id >= lower) & (df_with_id.row_id < upper)).drop("row_id")
        )

        # Spark writes a directory at this path; coalesce(1) keeps it to a single part file.
        output_path = f"s3a://{bucket_name}/{s3_folder}chunk_{chunk_id}.csv"

        df_chunk.coalesce(1) \
            .write \
            .mode("overwrite") \
            .option("header", True) \
            .csv(output_path)

        print(f" Uploaded chunk_{chunk_id}.csv to {output_path}")
        # Pause between uploads so the chunks arrive one at a time.
        time.sleep(1)
finally:
    # Release the cached data once all chunks are written (or on failure).
    df_with_id.unpersist()
