In [1]:
# Big Data Analytics [CN7031] CRWK 2024-25
# Group ID: CN7031_Group136_2024

# Student Information:
# 1. Student 1: Navya Athoti u2793047@uel.ac.uk
# 2. Student 2: Phalguna Avalagunta u2811669@uel.ac.uk
# 3. Student 3: Nikhil Sai Damera u2810262@uel.ac.uk
# 4. Student 4: Sai Kishore Dodda u2773584@uel.ac.uk

# Import required libraries
import os

import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    regexp_extract, col, window, count, avg, sum, 
    unix_timestamp, hour, date_format, countDistinct
)
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import max as spark_max # Avoid conflict with Python's max function
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from datetime import datetime

# Initialize Spark session with optimized configuration
def initialize_spark():
    conf = SparkConf().setAppName('CN7031_Group136_2024') \
        .set("spark.driver.memory", "4g") \
        .set("spark.executor.memory", "4g") \
        .set("spark.sql.shuffle.partitions", "100")
    
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    return sc, spark

sc, spark = initialize_spark()

# Load and validate the dataset
def load_data(spark, path="web.log"):
    try:
        data = spark.read.text(path)
        print(f"Successfully loaded {data.count()} log entries")
        return data
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

data = load_data(spark, path="web.log")

# Task 1: Data Processing using PySpark DF [40 marks]

# Student 1 (Navya Athoti u2793047)
print("\nStudent 1 Analysis - Web Traffic Pattern Analysis")
print("=" * 50)

# DF Creation with REGEX (10 marks)
regex_student1 = r"(\d+\.\d+\.\d+\.\d+) - - \[(.*?)\] \"([A-Z]+)"
df_student1 = data.select(
    regexp_extract('value', regex_student1, 1).alias('IP_Address'),
    regexp_extract('value', regex_student1, 2).alias('Timestamp'),
    regexp_extract('value', regex_student1, 3).alias('HTTP_Method')
).cache()  # Cache for performance

# Validate extracted data
print("\nData Quality Check:")
print(f"Total Records: {df_student1.count()}")
print(f"Null Values: {df_student1.filter(col('IP_Address') == '').count()}")

# Advanced Analysis 1: Rolling Window Analysis (10 marks)
windowed_traffic = df_student1 \
    .withColumn('timestamp', unix_timestamp('Timestamp', 'dd/MMM/yyyy:HH:mm:ss').cast('timestamp')) \
    .withWatermark('timestamp', '1 hour') \
    .groupBy(
        window('timestamp', '1 hour'),
        'IP_Address'
    ).agg(
        count('*').alias('request_count')
    ).orderBy('window.start')

print("\nHourly Traffic Pattern Sample:")
windowed_traffic.show(5)

# Advanced Analysis 2: HTTP Method Distribution (10 marks)
method_distribution = df_student1 \
    .groupBy('HTTP_Method') \
    .agg(
        count('*').alias('total_requests'),
        countDistinct('IP_Address').alias('unique_ips')
    ).orderBy(col('total_requests').desc())

print("\nHTTP Method Distribution:")
method_distribution.show()

# Visualization (10 marks)
# For Student 1's visualization
def create_traffic_visualization(df):
    # Convert to pandas and prepare data
    df_pandas = df.toPandas()
    
    # Convert window struct to datetime
    df_pandas['time'] = df_pandas['window'].apply(lambda x: x.start)
    
    plt.figure(figsize=(12, 6))
    
    # Create time series plot with proper column names
    sns.lineplot(data=df_pandas, 
                x='time', 
                y='request_count',
                marker='o')
    
    plt.title('Hourly Web Traffic Pattern')
    plt.xlabel('Time')
    plt.ylabel('Request Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    # Save visualization
    plt.savefig('student1_analysis.png')
    plt.close()

# Modify the windowed traffic query
windowed_traffic = df_student1 \
    .withColumn('timestamp', unix_timestamp('Timestamp', 'dd/MMM/yyyy:HH:mm:ss').cast('timestamp')) \
    .groupBy(
        window('timestamp', '1 hour')
    ).agg(
        count('*').alias('request_count')
    ).orderBy('window')

# Create visualization
create_traffic_visualization(windowed_traffic)

# Student 2 (Phalguna Avalagunta u2811669)
print("\nStudent 2 Analysis - Response Analysis")
print("=" * 50)

# DF Creation with REGEX (10 marks)
regex_student2 = r"\".*\" (\d+) (\d+) \[(.*?)\]"
df_student2 = data.select(
    regexp_extract('value', regex_student2, 1).alias('Status_Code'),
    regexp_extract('value', regex_student2, 2).alias('Response_Size'),
    regexp_extract('value', regex_student2, 3).alias('Timestamp')
).cache()

# Student 3 (Nikhil Sai Damera u2810262)
print("\nStudent 3 Analysis - URL Pattern Analysis")
print("=" * 50)
# DF Creation with REGEX (10 marks)
regex_student3 = r"\"[A-Z]+ (\/.*?) HTTP.* (\d+\.\d+\.\d+\.\d+) (\d+)"
df_student3 = data.select(
    regexp_extract('value', regex_student3, 1).alias('URL_Path'),
    regexp_extract('value', regex_student3, 2).alias('IP_Address'),
    regexp_extract('value', regex_student3, 3).alias('Response_Size')
).cache()
# Verify DataFrame creation
print("\nVerifying Student 3 DataFrame structure:")
df_student3.printSchema()
print("\nSample data:")
df_student3.show(5)

# Student 4 (Sai Kishore Dodda u2773584)
print("\nStudent 4 Analysis - Log Message Analysis")
print("=" * 50)
# DF Creation with REGEX (10 marks)
regex_student4 = r"\".*\" (\d+) .*? \[(.*?)\] (.*)"
df_student4 = data.select(
    regexp_extract('value', regex_student4, 1).alias('HTTP_Status_Code'),
    regexp_extract('value', regex_student4, 2).alias('Timestamp'),
    regexp_extract('value', regex_student4, 3).alias('Log_Message')
).cache()
# Verify DataFrame creation
print("\nVerifying Student 4 DataFrame structure:")
df_student4.printSchema()
print("\nSample data:")
df_student4.show(5)

# Advanced Analysis 1: Session Analysis (10 marks)
session_analysis = df_student2 \
    .withColumn('timestamp', unix_timestamp('Timestamp', 'dd/MMM/yyyy:HH:mm:ss').cast('long')) \
    .withColumn(
        'session_requests',
        count('*').over(
            Window.orderBy('timestamp')
            .rangeBetween(-1800, 0)  # 30-minute window in seconds
        )
    ) \
    .withColumn(
        'avg_response_size',
        avg('Response_Size').over(
            Window.orderBy('timestamp')
            .rangeBetween(-1800, 0)
        )
    )

print("\nSession Analysis Sample:")
session_analysis.select('timestamp', 'session_requests', 'avg_response_size').show(5)

# Advanced Analysis 2: Response Size Analysis (10 marks)
response_analysis = df_student2 \
    .groupBy('Status_Code') \
    .agg(
        count('*').alias('request_count'),
        avg('Response_Size').alias('avg_response_size'),
        spark_max('Response_Size').alias('max_response_size')  # Use spark_max instead of max
    ).orderBy('Status_Code')

print("\nResponse Size Analysis:")
response_analysis.show()

# Visualization (10 marks)
def create_response_visualization(df):
    # Convert to pandas
    df_pandas = df.toPandas()
    
    # Convert Status_Code to string for better plotting
    df_pandas['Status_Code'] = df_pandas['Status_Code'].astype(str)
    
    plt.figure(figsize=(12, 6))
    
    # Create bar plot
    sns.barplot(
        data=df_pandas,
        x='Status_Code',
        y='avg_response_size',
        palette='viridis'
    )
    
    plt.title('Average Response Size by Status Code')
    plt.xlabel('HTTP Status Code')
    plt.ylabel('Average Response Size (bytes)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    # Save visualization
    plt.savefig('student2_analysis.png')
    plt.close()

create_response_visualization(response_analysis)

# Task 2: Data Processing using PySpark RDD [40 marks]

# Student 1 (Navya Athoti u2793047)
print("\nStudent 1 RDD Analysis - Traffic Pattern Mining")
print("=" * 50)

# Basic RDD Analysis: Parse and Extract (10 marks)
def parse_log_entry(line):
    import re
    try:
        pattern = r'(\d+\.\d+\.\d+\.\d+).*\[(.*?)\].*\"([A-Z]+)'
        match = re.search(pattern, line)
        if match:
            return {
                'ip': match.group(1),
                'timestamp': match.group(2),
                'method': match.group(3)
            }
    except Exception as e:
        print(f"Parsing error: {str(e)}")
    return None

base_rdd = data.rdd.map(lambda x: x['value']) \
                   .map(parse_log_entry) \
                   .filter(lambda x: x is not None)

# Advanced Analysis 1: Time-based Traffic Analysis (15 marks)
hourly_traffic = base_rdd \
    .map(lambda x: (x['timestamp'][:13], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .sortByKey()

print("\nHourly Traffic Sample:")
for hour, count in hourly_traffic.take(5):
    print(f"{hour}: {count} requests")

# Advanced Analysis 2: IP-based Pattern Analysis (15 marks)
ip_patterns = base_rdd \
    .map(lambda x: (x['ip'], x['method'])) \
    .groupByKey() \
    .mapValues(lambda methods: {
        'total_requests': len(list(methods)),
        'method_distribution': dict(pd.Series(list(methods)).value_counts())
    })

print("\nIP Pattern Analysis Sample:")
for ip, stats in ip_patterns.take(3):
    print(f"\nIP: {ip}")
    print(f"Total Requests: {stats['total_requests']}")
    print("Method Distribution:", stats['method_distribution'])


# Task 3: Optimization and LSEPI Considerations [10 marks]

# Student 1 (Navya Athoti u2793047)
print("\nStudent 1 Optimization Analysis")
print("=" * 50)

# Method 1: Partition Strategies (5 marks)
def evaluate_partition_strategy():
    print("\nPartitioning Strategy Evaluation")
    
    # Baseline - Default partitioning
    start_time = time.time()
    df_student1.groupBy('IP_Address').count().count()
    baseline_time = time.time() - start_time
    print(f"Baseline execution time: {baseline_time:.2f} seconds")
    
    # Custom partitioning
    start_time = time.time()
    df_student1.repartition(8, 'IP_Address').groupBy('IP_Address').count().count()
    optimized_time = time.time() - start_time
    print(f"Optimized execution time: {optimized_time:.2f} seconds")
    print(f"Performance improvement: {((baseline_time - optimized_time) / baseline_time) * 100:.2f}%")

evaluate_partition_strategy()

# Method 2: Caching Strategy (5 marks)
def evaluate_caching_strategy():
    print("\nCaching Strategy Evaluation")
    
    # Without caching
    df_uncached = df_student1.unpersist()
    start_time = time.time()
    df_uncached.groupBy('HTTP_Method').count().count()
    df_uncached.groupBy('IP_Address').count().count()
    uncached_time = time.time() - start_time
    print(f"Uncached execution time: {uncached_time:.2f} seconds")
    
    # With caching
    df_cached = df_student1.cache()
    df_cached.count()  # Materialize cache
    start_time = time.time()
    df_cached.groupBy('HTTP_Method').count().count()
    df_cached.groupBy('IP_Address').count().count()
    cached_time = time.time() - start_time
    print(f"Cached execution time: {cached_time:.2f} seconds")
    print(f"Caching improvement: {((uncached_time - cached_time) / uncached_time) * 100:.2f}%")

evaluate_caching_strategy()

# Continue with Students 2-4 Task 3 implementations

# Student 2 (Phalguna Avalagunta u2811669)
print("\nStudent 2 Optimization Analysis")
print("=" * 50)

# Method 1: Caching Strategy
def evaluate_caching_strategy_student2():
    print("\nCaching Strategy Evaluation")
    
    # Without caching
    df_uncached = df_student2.unpersist()
    start_time = time.time()
    df_uncached.groupBy('Status_Code').count().count()
    df_uncached.groupBy('Response_Size').count().count()
    uncached_time = time.time() - start_time
    print(f"Uncached execution time: {uncached_time:.2f} seconds")
    
    # With caching
    df_cached = df_student2.cache()
    df_cached.count()  # Materialize cache
    start_time = time.time()
    df_cached.groupBy('Status_Code').count().count()
    df_cached.groupBy('Response_Size').count().count()
    cached_time = time.time() - start_time
    print(f"Cached execution time: {cached_time:.2f} seconds")
    print(f"Caching improvement: {((uncached_time - cached_time) / uncached_time) * 100:.2f}%")

evaluate_caching_strategy_student2()

# Method 2: Bucketing & Indexing
def evaluate_bucketing_strategy_student2():
    print("\nBucketing Strategy Evaluation")
    
    # Create temporary view
    df_student2.createOrReplaceTempView("logs")
    
    # Create bucketed table
    spark.sql("""
    CREATE TABLE IF NOT EXISTS bucketed_logs
    USING parquet
    CLUSTERED BY (Status_Code) INTO 4 BUCKETS
    AS SELECT * FROM logs
    """)
    
    # Measure query performance
    start_time = time.time()
    spark.sql("SELECT Status_Code, COUNT(*) FROM bucketed_logs GROUP BY Status_Code").show()
    bucketed_time = time.time() - start_time
    print(f"Query time with bucketing: {bucketed_time:.2f} seconds")

evaluate_bucketing_strategy_student2()

# Student 3 (Nikhil Sai Damera u2810262)
print("\nStudent 3 Optimization Analysis")
print("=" * 50)

# Method 1: Partition Strategies
def evaluate_partition_strategy_student3():
    print("\nPartitioning Strategy Evaluation")
    
    # Baseline
    start_time = time.time()
    df_student3.groupBy('URL_Path').count().count()
    baseline_time = time.time() - start_time
    print(f"Baseline execution time: {baseline_time:.2f} seconds")
    
    # Custom partitioning
    start_time = time.time()
    df_student3.repartition(10, 'URL_Path').groupBy('URL_Path').count().count()
    optimized_time = time.time() - start_time
    print(f"Optimized execution time: {optimized_time:.2f} seconds")
    print(f"Performance improvement: {((baseline_time - optimized_time) / baseline_time) * 100:.2f}%")

evaluate_partition_strategy_student3()

# Method 2: Bucketing & Indexing
def evaluate_bucketing_strategy_student3():
    print("\nBucketing Strategy Evaluation")
    df_student3.createOrReplaceTempView("url_logs")
    
    spark.sql("""
    CREATE TABLE IF NOT EXISTS bucketed_url_logs
    USING parquet
    CLUSTERED BY (URL_Path) INTO 4 BUCKETS
    AS SELECT * FROM url_logs
    """)
    
    start_time = time.time()
    spark.sql("SELECT URL_Path, COUNT(*) FROM bucketed_url_logs GROUP BY URL_Path").show()
    bucketed_time = time.time() - start_time
    print(f"Query time with bucketing: {bucketed_time:.2f} seconds")

evaluate_bucketing_strategy_student3()

# Student 4 (Sai Kishore Dodda u2773584)
print("\nStudent 4 Optimization Analysis")
print("=" * 50)

# Method 1: Caching Strategy
def evaluate_caching_strategy_student4():
    print("\nCaching Strategy Evaluation")
    
    # Without caching
    df_uncached = df_student4.unpersist()
    start_time = time.time()
    df_uncached.groupBy('HTTP_Status_Code').count().count()
    uncached_time = time.time() - start_time
    print(f"Uncached execution time: {uncached_time:.2f} seconds")
    
    # With caching
    df_cached = df_student4.cache()
    df_cached.count()  # Materialize cache
    start_time = time.time()
    df_cached.groupBy('HTTP_Status_Code').count().count()
    cached_time = time.time() - start_time
    print(f"Cached execution time: {cached_time:.2f} seconds")
    print(f"Caching improvement: {((uncached_time - cached_time) / uncached_time) * 100:.2f}%")

evaluate_caching_strategy_student4()

# Method 2: Partition Strategies
def evaluate_partition_strategy_student4():
    print("\nPartitioning Strategy Evaluation")
    
    # Baseline
    start_time = time.time()
    df_student4.groupBy('HTTP_Status_Code').count().count()
    baseline_time = time.time() - start_time
    print(f"Baseline execution time: {baseline_time:.2f} seconds")
    
    # Custom partitioning
    start_time = time.time()
    df_student4.repartition(8, 'HTTP_Status_Code').groupBy('HTTP_Status_Code').count().count()
    optimized_time = time.time() - start_time
    print(f"Optimized execution time: {optimized_time:.2f} seconds")
    print(f"Performance improvement: {((baseline_time - optimized_time) / baseline_time) * 100:.2f}%")

evaluate_partition_strategy_student4()

# Clean up resources
def cleanup():
    try:
        # Drop temporary tables
        spark.sql("DROP TABLE IF EXISTS bucketed_logs")
        spark.sql("DROP TABLE IF EXISTS bucketed_url_logs")
        
        # Stop Spark session
        sc.stop()
        print("\nSpark session successfully closed")
    except Exception as e:
        print(f"Error during cleanup: {str(e)}")

cleanup()

Successfully loaded 3000000 log entries

Student 1 Analysis - Web Traffic Pattern Analysis

Data Quality Check:
Total Records: 3000000
Null Values: 0

Hourly Traffic Pattern Sample:
+--------------------+--------------+-------------+
|              window|    IP_Address|request_count|
+--------------------+--------------+-------------+
|{2022-01-01 00:00...| 169.29.157.48|            1|
|{2022-01-01 00:00...|213.11.169.161|            1|
|{2022-01-01 00:00...|   95.4.86.192|            1|
|{2022-01-01 00:00...|  207.75.52.45|            1|
|{2022-01-01 00:00...| 38.254.36.191|            1|
+--------------------+--------------+-------------+
only showing top 5 rows


HTTP Method Distribution:
+-----------+--------------+----------+
|HTTP_Method|total_requests|unique_ips|
+-----------+--------------+----------+
|        GET|       1001043|   1000932|
|       POST|       1000505|   1000390|
|        PUT|        998452|    998328|
+-----------+--------------+----------+



  with pd.option_context('mode.use_inf_as_na', True):
  with pd.option_context('mode.use_inf_as_na', True):



Student 2 Analysis - Response Analysis

Student 3 Analysis - URL Pattern Analysis

Verifying Student 3 DataFrame structure:
root
 |-- URL_Path: string (nullable = true)
 |-- IP_Address: string (nullable = true)
 |-- Response_Size: string (nullable = true)


Sample data:
+--------+----------+-------------+
|URL_Path|IP_Address|Response_Size|
+--------+----------+-------------+
|        |          |             |
|        |          |             |
|        |          |             |
|        |          |             |
|        |          |             |
+--------+----------+-------------+
only showing top 5 rows


Student 4 Analysis - Log Message Analysis

Verifying Student 4 DataFrame structure:
root
 |-- HTTP_Status_Code: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Log_Message: string (nullable = true)


Sample data:
+----------------+---------+-----------+
|HTTP_Status_Code|Timestamp|Log_Message|
+----------------+---------+-----------+
|                | 

Py4JJavaError: An error occurred while calling o30.sql.
: java.lang.UnsatisfiedLinkError: 'boolean org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(java.lang.String, int)'
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.validateTableLocation(SessionCatalog.scala:419)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand.run(createDataSourceTables.scala:176)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.Dataset.<init>(Dataset.scala:220)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:638)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)


: 

: 