# IMPORT LIBRARIES

In [1]:
from minio import Minio
from minio.error import S3Error
import pandas as pd
import matplotlib.pyplot as plt

# LOAD BUCKET

In [2]:
# Initialize MinIO client.
# Endpoint and credentials are read from the environment so real secrets do not
# have to be stored in the notebook; the previous literal values are kept as
# fallbacks, so an unconfigured local setup behaves exactly as before.
import os

client = Minio(
    os.environ.get("MINIO_ENDPOINT", "localhost:9000"),
    access_key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret_key=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
    secure=False,  # Change to True if using HTTPS
)

# Bucket that holds the monthly CSV exports read by the cells below
bucket_name = "ecommerce"

# List every bucket visible to these credentials
try:
    buckets = client.list_buckets()
    for bucket in buckets:
        print(bucket.name)
except S3Error as e:
    print("Error:", e)

# List the entries of the working bucket; names ending in '/' are reported
# as directories, everything else as files
try:
    objects = client.list_objects(bucket_name)
    for obj in objects:
        entry_kind = "Directory:" if obj.object_name.endswith('/') else "File:"
        print(entry_kind, obj.object_name)
except S3Error as e:
    print("Error in listing objects:", e)

blah
climate-data
ecommerce
pythonminio
File: 2019-Dec.csv
File: 2019-Nov.csv
File: 2020-Apr.csv
File: 2020-Feb.csv
File: 2020-Mar.csv


In [4]:
# findspark.init() is called before the first `pyspark` import in the notebook.
# NOTE(review): presumably this locates the local Spark installation and makes
# `pyspark` importable — confirm Spark is installed / SPARK_HOME is set here.
import findspark
findspark.init()

In [6]:
import os

from pyspark.sql import SparkSession
# NOTE: importing `max` from pyspark shadows Python's built-in max() for the
# rest of the notebook.
from pyspark.sql.functions import col, max
# Explicit names instead of `from delta import *`, so it is clear where
# configure_spark_with_delta_pip comes from.
# NOTE(review): these are believed to be the only two names the wildcard exported — confirm.
from delta import DeltaTable, configure_spark_with_delta_pip

# Stop the existing SparkContext if it exists (`sc` is only defined when the
# environment or an earlier run created one)
try:
    sc.stop()
except NameError:
    pass

# S3A connection settings for MinIO. Values come from the environment with the
# previous literals as fallbacks, so secrets need not live in the notebook.
# NOTE(review): the MinIO client cell talks to localhost:9000 while Spark is
# pointed at http://minio:9000 — confirm that host resolves where Spark runs.
s3a_endpoint = os.environ.get("MINIO_S3A_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Spark session with the Delta Lake SQL extension/catalog and path-style S3A
# access
builder = (
    SparkSession.builder.appName("Delta With MinIO")
    .config("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip is applied to the builder before the session
# is created (or an already running one is reused)
spark = configure_spark_with_delta_pip(builder).getOrCreate()

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [None]:
# Create the `ecommerce` database through Spark SQL; IF NOT EXISTS makes the
# statement safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS ecommerce")

# READ DATA

In [None]:
# Create a test DataFrame so that Delta can be tried out
from pyspark.sql import Row

# The integers 1 through 10
numbers = [n for n in range(1, 11)]

# One Row(number=...) per value, turned into a single-column Spark DataFrame
number_rows = list(map(lambda n: Row(number=n), numbers))
df = spark.createDataFrame(number_rows)

# Print the rows of the DataFrame
df.show()

In [None]:
# Load three monthly CSV exports from the MinIO bucket into Spark DataFrames
def _read_monthly_csv(object_name):
    """Read one CSV object of `bucket_name` over s3a, using the header row and schema inference."""
    return spark.read.csv(f"s3a://{bucket_name}/{object_name}", header=True, inferSchema=True)


df_feb = _read_monthly_csv("2020-Feb.csv")
df_mar = _read_monthly_csv("2020-Mar.csv")
df_apr = _read_monthly_csv("2020-Apr.csv")

In [None]:
# December 2019 data
# The response handle starts as None so the cleanup below only touches a
# response opened by THIS run of the cell; the previous `in locals()` check
# also matched a stale handle left in the kernel by an earlier run.
response_dec = None
try:
    response_dec = client.get_object(bucket_name, "2019-Dec.csv")
    dec_data = pd.read_csv(response_dec)   # this data will be used in step 8
    print(dec_data.head())  # Display first few rows
except S3Error as e:
    # On failure dec_data is NOT (re)defined by this run
    print("Error in reading object:", e)
finally:
    # Always close the stream and hand the connection back
    if response_dec is not None:
        response_dec.close()
        response_dec.release_conn()

In [None]:
# March 2020 data
# The response handle starts as None so the cleanup below only touches a
# response opened by THIS run of the cell; the previous `in locals()` check
# also matched a stale handle left in the kernel by an earlier run.
response_mar = None
try:
    response_mar = client.get_object(bucket_name, "2020-Mar.csv")
    data_mar = pd.read_csv(response_mar)   # this data will be used in step 8
    print(data_mar.head())  # Display first few rows
except S3Error as e:
    # On failure data_mar is NOT (re)defined by this run
    print("Error in reading object:", e)
finally:
    # Always close the stream and hand the connection back
    if response_mar is not None:
        response_mar.close()
        response_mar.release_conn()

In [None]:
# April 2020 data
# The response handle starts as None so the cleanup below only touches a
# response opened by THIS run of the cell; the previous `in locals()` check
# also matched a stale handle left in the kernel by an earlier run.
response_apr = None
try:
    response_apr = client.get_object(bucket_name, "2020-Apr.csv")
    data_apr = pd.read_csv(response_apr)   # this data will be used in step 8
    print(data_apr.head())  # Display first few rows
except S3Error as e:
    # On failure data_apr is NOT (re)defined by this run
    print("Error in reading object:", e)
finally:
    # Always close the stream and hand the connection back
    if response_apr is not None:
        response_apr.close()
        response_apr.release_conn()

In [None]:
# February 2020 data
# The response handle starts as None so the cleanup below only touches a
# response opened by THIS run of the cell; the previous `in locals()` check
# also matched a stale handle left in the kernel by an earlier run.
response_feb = None
try:
    response_feb = client.get_object(bucket_name, "2020-Feb.csv")
    data_feb = pd.read_csv(response_feb)   # this data will be used in step 8
    print(data_feb.head())  # Display first few rows
except S3Error as e:
    # On failure data_feb is NOT (re)defined by this run
    print("Error in reading object:", e)
finally:
    # Always close the stream and hand the connection back
    if response_feb is not None:
        response_feb.close()
        response_feb.release_conn()

In [None]:
# Report the number of rows held by each monthly pandas frame
for month_label, month_frame in (
    ("February", data_feb),
    ("March", data_mar),
    ("April", data_apr),
):
    print(f"Total number of rows in {month_label} 2020 data:", len(month_frame))

# EXPLORE THE DATA

In [None]:
df = pd.DataFrame(data_mar)

# Filter numeric columns (integers or floats)
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

# One boxplot per numeric column of the March data.
# The figure is created INSIDE the loop: with a single plt.figure() before the
# loop and plt.show() inside it, only the first plot received the 10x6 size.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Missing values are dropped first so they cannot distort the box statistics
    ax.boxplot(df[column].dropna())
    ax.set_title('Boxplot of Numeric Columns')
    ax.set_ylabel('Values')
    ax.set_xlabel('Columns')
    # Set column name as x-axis label (replaces the boxplot `labels=` keyword,
    # which newer matplotlib releases deprecate)
    ax.set_xticks([1])
    ax.set_xticklabels([column])
    plt.show()
    plt.close(fig)  # release the figure once it has been displayed



In [None]:
df = pd.DataFrame(data_apr)

# Filter numeric columns (integers or floats)
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

# One boxplot per numeric column of the April data.
# The figure is created INSIDE the loop: with a single plt.figure() before the
# loop and plt.show() inside it, only the first plot received the 10x6 size.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Missing values are dropped first so they cannot distort the box statistics
    ax.boxplot(df[column].dropna())
    ax.set_title('Boxplot of Numeric Columns')
    ax.set_ylabel('Values')
    ax.set_xlabel('Columns')
    # Set column name as x-axis label (replaces the boxplot `labels=` keyword,
    # which newer matplotlib releases deprecate)
    ax.set_xticks([1])
    ax.set_xticklabels([column])
    plt.show()
    plt.close(fig)  # release the figure once it has been displayed

In [None]:
df = pd.DataFrame(data_feb)

# Filter numeric columns (integers or floats)
numeric_columns = df.select_dtypes(include=['int', 'float']).columns

# One boxplot per numeric column of the February data.
# The figure is created INSIDE the loop: with a single plt.figure() before the
# loop and plt.show() inside it, only the first plot received the 10x6 size.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Missing values are dropped first so they cannot distort the box statistics
    ax.boxplot(df[column].dropna())
    ax.set_title('Boxplot of Numeric Columns')
    ax.set_ylabel('Values')
    ax.set_xlabel('Columns')
    # Set column name as x-axis label (replaces the boxplot `labels=` keyword,
    # which newer matplotlib releases deprecate)
    ax.set_xticks([1])
    ax.set_xticklabels([column])
    plt.show()
    plt.close(fig)  # release the figure once it has been displayed


# there is an outlier in the product_id column

In [None]:
# Column dtypes and non-null counts for each month.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly — wrapping it in print() added a stray "None" line after each report.
for monthly_frame in (data_feb, data_mar, data_apr):
    monthly_frame.info()

# Number of missing values per column for each month
for monthly_frame in (data_feb, data_mar, data_apr):
    print(monthly_frame.isnull().sum())

In [None]:
# Null / non-null breakdown of `category_code` for each month.
# The total is taken with len(): value_counts() leaves out NaN by default, so
# its sum is the NON-null count, which made the reported row count, non-null
# count and percentage all wrong. Building the message from plain variables
# also avoids nesting double quotes inside a double-quoted f-string, which
# only parses on Python 3.12+.
for month_label, month_frame in (("Feb", data_feb), ("Mar", data_mar), ("Apr", data_apr)):
    total_rows = len(month_frame)
    null_rows = int(month_frame["category_code"].isnull().sum())
    non_null_rows = total_rows - null_rows
    # Guard against an empty frame so the percentage never divides by zero
    non_null_pct = (non_null_rows / total_rows) * 100 if total_rows else 0.0
    print(
        f"{month_label} Data has {total_rows} rows, in which {null_rows} of the data is null. "
        f"Which means the non null data is {non_null_rows} rows or {non_null_pct}% of the data"
    )

In [None]:
# Convert column dtypes on all three monthly frames. The frames are modified
# in place, so data_feb / data_mar / data_apr carry the new dtypes afterwards.
for monthly_frame in (data_feb, data_mar, data_apr):
    # Parse the event timestamps into datetimes
    monthly_frame["event_time"] = pd.to_datetime(monthly_frame["event_time"])
    # Store the two label columns as categoricals
    monthly_frame["event_type"] = monthly_frame["event_type"].astype('category')
    monthly_frame["category_code"] = monthly_frame["category_code"].astype('category')

# Show the resulting dtypes. info() prints its report itself and returns None,
# so it is called directly instead of being wrapped in print() (which added a
# stray "None" line).
for monthly_frame in (data_feb, data_mar, data_apr):
    monthly_frame.info()

In [None]:
# The three monthly frames, in the order they are reported below
monthly_frames = (data_feb, data_mar, data_apr)

# Summary statistics per month
for monthly_frame in monthly_frames:
    print(monthly_frame.describe())

# Ratio of event types
for monthly_frame in monthly_frames:
    print(monthly_frame["event_type"].value_counts(normalize=True))

# Ratio of category codes
for monthly_frame in monthly_frames:
    print(monthly_frame["category_code"].value_counts(normalize=True))

In [None]:
# Number of February rows whose event_type is "purchase"
is_purchase = data_feb["event_type"] == "purchase"
is_purchase.sum()


In [None]:
# Descriptive statistics of the February data, one group per event_type
event_type_summary = data_feb.groupby("event_type").describe()
print(event_type_summary)

In [None]:
# Build a NEW frame that carries the extra `time_diff` column. Assigning the
# column into `pd.DataFrame(data_mar)` could also add it to data_mar itself
# when the two objects share their internals (NOTE(review): believed to
# happen on older pandas releases — confirm), and made the cell non-idempotent.
# NOTE(review): diff() compares each row with the one before it, so this
# assumes the rows are already ordered by event_time — verify.
df = data_mar.assign(
    # Calculate time intervals between consecutive rows; the first row has no
    # predecessor and gets a zero gap
    time_diff=data_mar['event_time'].diff().fillna(pd.Timedelta(seconds=0))
)

# Set threshold for outlier detection (e.g., 2 standard deviations from mean)
threshold = df['time_diff'].mean() + 2 * df['time_diff'].std()

# Find potential outliers (timestamps with intervals beyond threshold)
outliers = df[df['time_diff'] > threshold]

# Display potential outliers
print("Potential Outliers:")
print(outliers)

In [None]:
import pandas as pd

# Toy dataset (replace this with your dataset): six timestamps, half an hour
# apart, from 08:00 to 10:30 on 2023-01-01
data = {
    'timestamp': [
        f'2023-01-01 {hour:02d}:{minute:02d}:00'
        for hour in (8, 9, 10)
        for minute in (0, 30)
    ]
}
df = pd.DataFrame(data)
# Parse the timestamp strings into datetimes
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Gap between each timestamp and the previous one; the first row has no
# predecessor and gets a zero gap
gaps = df['timestamp'].diff().fillna(pd.Timedelta(seconds=0))
df['time_diff'] = gaps

# Outlier threshold: mean gap plus two standard deviations
gap_mean = df['time_diff'].mean()
gap_std = df['time_diff'].std()
threshold = gap_mean + 2 * gap_std

# Rows whose gap exceeds the threshold
exceeds_threshold = df['time_diff'] > threshold
outliers = df[exceeds_threshold]

# Show the rows that were flagged
print("Potential Outliers:")
print(outliers)


# UPLOAD CLEANED DATA TO ANOTHER BUCKET TO BE USED FOR THE NEXT STEP

In [None]:
from pyspark.sql import SparkSession

# Assuming your cleaned data is stored in a column-oriented dictionary
# (column name -> list of values, all lists of equal length)
cleaned_data = {
    "col1": [1, 2, 3],
    "col2": ["A", "B", "C"]
}

# getOrCreate() hands back the already running SparkSession when there is one
spark = SparkSession.builder.appName("Example").getOrCreate()

# Convert dictionary to DataFrame: zip(*values) pairs up the i-th entry of
# every column, giving one tuple per row, with the dict keys as column names.
# The previous version produced one row per dict key with the whole value list
# as a cell; those array cells have mixed element types and array columns
# cannot be written by the CSV data source.
column_names = list(cleaned_data.keys())
cleaned_rows = list(zip(*cleaned_data.values()))
cleaned_data_df = spark.createDataFrame(cleaned_rows, column_names)

# Destination bucket. A separate name is used so the `bucket_name` that the
# loading cells rely on is not overwritten when this cell runs.
cleaned_bucket_name = "your_bucket_name"  # Replace with your bucket name

# Save DataFrame to MinIO bucket as CSV.
# An s3a URI has the form s3a://<bucket>/<key>; the MinIO endpoint is taken
# from the session's fs.s3a.endpoint setting and must not be part of the path.
cleaned_data_df.write.format("csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save(f"s3a://{cleaned_bucket_name}/cleaned_data.csv")
