In [37]:
# Initialization (re-run these if you are in a new session)
import os
import sys

# Locate the local Spark installation and the Python libraries bundled with it.
spark_home = "/home/talentum/spark"
pylib_dir = spark_home + "/python/lib"
os.environ.update({"SPARK_HOME": spark_home, "PYLIB": pylib_dir})

# Interpreter overrides (disabled): while these two lines stay commented out,
# PySpark uses whatever Python is already active (e.g. miniconda, recommended
# for this approach). If you enable them, make sure xgboost and scikit-learn
# are installed for /usr/bin/python3.6.
# os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
# os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Make the bundled pyspark and py4j archives importable; pyspark.zip ends up
# first on sys.path, immediately followed by the py4j archive.
sys.path[:0] = [pylib_dir + "/pyspark.zip", pylib_dir + "/py4j-0.10.7-src.zip"]

# Extra JVM packages requested when the Spark gateway is launched.
# xgboost4j-spark is only required for SparkXGBClassifier, which this approach
# does not use; it is kept for now because it does no harm. If you drop it,
# remove its entry from this list.
submit_packages = [
    "com.databricks:spark-xml_2.11:0.6.0",
    "org.apache.spark:spark-avro_2.11:2.4.3",
    "ml.dmlc:xgboost4j-spark_2.11:1.7.0",
]
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages " + ",".join(submit_packages) + " pyspark-shell"


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, trim, datediff, to_date, unix_timestamp, count, avg
from pyspark.ml import Pipeline, PipelineModel





# Create (or reuse) the SparkSession for this notebook.
# FOR TESTING PURPOSES ONLY - configuration is kept minimal so the session starts.
session_builder = SparkSession.builder.appName("LungCancerModelTrainingWithScikitLearn")
session_builder = session_builder.config("spark.driver.memory", "7g")
spark = session_builder.getOrCreate()

# --- Rebuild the raw DataFrame that the preprocessing below turns into final_ml_df ---
print("\n--- Re-loading/Re-running Preprocessing to obtain `final_ml_df` ---")
file_path = "file:///home/talentum/shared/Project/lung_cancer_project/Lung_Cancer.csv"
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)

# Initial data type conversions: both date columns become real dates ...
for date_col in ("diagnosis_date", "end_treatment_date"):
    df = df.withColumn(date_col, col(date_col).cast("date"))

# ... and the boolean-like flag columns become integers.
boolean_like_cols = ["hypertension", "asthma", "cirrhosis", "other_cancer", "survived"]
for col_name in boolean_like_cols:
    flag_as_int = col(col_name).cast("int")
    df = df.withColumn(col_name, flag_as_int)

# Re-verify Missing Values and Impute
# A cell counts as missing when it is NULL or, for string columns only, blank
# after trimming.
missing_exprs = []
for column in df.columns:
    is_missing = col(column).isNull()
    if df.schema[column].dataType.typeName() == "string":
        is_missing = is_missing | (trim(col(column)) == "")
    # count() ignores NULLs, so this counts exactly the rows flagged as missing.
    missing_exprs.append(count(when(is_missing, 1)).alias(column))

# Count every column in ONE aggregation job instead of one full scan per column.
missing_counts = df.agg(*missing_exprs).first() if missing_exprs else ()

# Maps column name -> number of missing cells, only for columns that have any.
missing_values_check = {
    column: missing_count
    for column, missing_count in zip(df.columns, missing_counts)
    if missing_count > 0
}

if missing_values_check:
    print("WARNING: Missing values detected during re-load. Imputing:")
    for column, missing_count in missing_values_check.items():
        print(f"  {column}: {missing_count} missing")

    # Numerical columns: replace NULLs with the column mean.
    numerical_cols_for_imputation = ["age", "bmi", "cholesterol_level"]
    for col_name in numerical_cols_for_imputation:
        # Keys of missing_values_check come from df.columns, so this also
        # guarantees the column exists.
        if col_name not in missing_values_check:
            continue
        mean_value = df.agg(avg(col_name)).collect()[0][0]
        # The mean is None when the column has no non-NULL value to average.
        if mean_value is not None:
            df = df.fillna({col_name: mean_value})

    # Categorical columns: replace NULL/blank cells with the most frequent value.
    categorical_cols_for_imputation = ["gender", "country", "cancer_stage", "family_history", "smoking_status", "treatment_type"]
    for col_name in categorical_cols_for_imputation:
        if col_name not in missing_values_check:
            continue
        is_blank = col(col_name).isNull() | (trim(col(col_name)) == "")
        # The mode is taken over non-missing cells only; otherwise NULL or ""
        # could itself be the "most frequent value" and nothing useful would
        # be imputed. Ties are broken by value so reruns give the same result.
        mode_value_row = (
            df.filter(~is_blank)
            .groupBy(col_name)
            .count()
            .orderBy(col("count").desc(), col(col_name).asc())
            .first()
        )
        # No row means the column holds no usable value at all; leave it as is.
        if mode_value_row is not None:
            mode_value = mode_value_row[0]
            df = df.withColumn(col_name, when(is_blank, lit(mode_value)).otherwise(col(col_name)))
else:
    print("No missing values detected. Proceeding.")

# Feature Engineering

# Treatment length in days (end date minus diagnosis date). NULL stays NULL and
# negative spans are clamped to 0.
raw_duration = datediff(col("end_treatment_date"), col("diagnosis_date"))
df = df.withColumn("treatment_duration_days", raw_duration)
duration = col("treatment_duration_days")
df = df.withColumn(
    "treatment_duration_days",
    when(duration.isNull(), lit(None)).when(duration < 0, 0).otherwise(duration),
)

# Any duration that is still NULL is replaced by the mean duration.
missing_duration_count = df.filter(duration.isNull()).count()
if missing_duration_count > 0:
    mean_duration = df.agg(avg("treatment_duration_days")).first()[0]
    if mean_duration is not None:
        df = df.fillna({"treatment_duration_days": mean_duration})

# Age bucket; anything outside the listed ranges (including NULL) is "Unknown".
age_col = col("age")
age_group = (
    when(age_col.between(18, 30), "18-30")
    .when((age_col > 30) & (age_col <= 45), "31-45")
    .when((age_col > 45) & (age_col <= 60), "46-60")
    .when(age_col > 60, "61+")
    .otherwise("Unknown")
)
df = df.withColumn("age_group", age_group)

# BMI bucket; NULL falls through to "Unknown".
bmi_col = col("bmi")
bmi_category = (
    when(bmi_col < 18.5, "Underweight")
    .when((bmi_col >= 18.5) & (bmi_col < 25), "Normal weight")
    .when((bmi_col >= 25) & (bmi_col < 30), "Overweight")
    .when(bmi_col >= 30, "Obese")
    .otherwise("Unknown")
)
df = df.withColumn("bmi_category", bmi_category)

# 1 when at least one of the four comorbidity flags equals 1, otherwise 0.
any_comorbidity = col("hypertension") == 1
for condition_col in ("asthma", "cirrhosis", "other_cancer"):
    any_comorbidity = any_comorbidity | (col(condition_col) == 1)
df = df.withColumn("has_comorbidity", when(any_comorbidity, 1).otherwise(0))

# Load the saved preprocessing pipeline and apply it to build `final_ml_df`
# (two columns: `features` and `label`). On any failure the error is reported,
# the Spark session is stopped and the process exits with status 1.
pipeline_save_path = "/home/talentum/shared/Project/lung_cancer_project/model_assets/preprocessing_pipeline"
try:
    pipeline_model = PipelineModel.load(pipeline_save_path)
    processed_df = pipeline_model.transform(df)

    # Keep only the scaled feature vector and the target, under generic names.
    feature_column = col("features_scaled").alias("features")
    label_column = col("survived").alias("label")
    final_ml_df = processed_df.select(feature_column, label_column)

    print("\nSuccessfully loaded and applied preprocessing pipeline to get `final_ml_df`.")
    print("Schema of `final_ml_df`:")
    final_ml_df.printSchema()
except Exception as e:
    print(f"\nERROR: Could not load or apply preprocessing pipeline from {pipeline_save_path}. {e}")
    spark.stop()
    sys.exit(1)


# Disabled snippet: the bare triple-quoted string below is evaluated and then
# discarded, so none of the code inside it (single-file CSV export of
# `final_ml_df`, followed by spark.stop()) is executed.
'''

# ... (your existing code for creating final_ml_df) ...

print("\n--- Saving `final_ml_df` to a single CSV file ---")

# Define the output path for your CSV file
# It's good practice to provide an absolute path to avoid ambiguity
output_csv_path = "file:///home/talentum/shared/Project/lung_cancer_project/processed_data/final_ml_df_single_file.csv"

try:
    # Coalesce to 1 partition to get a single CSV file
    # Write the DataFrame as CSV with header and overwrite mode
    final_ml_df.coalesce(1).write.csv(output_csv_path, header=True, mode="overwrite")
    print(f"Successfully saved `final_ml_df` to {output_csv_path} as a single CSV.")
except Exception as e:
    print(f"ERROR: Could not save `final_ml_df` to CSV: {e}")

# ... (your existing code for converting to Pandas DataFrame, if still needed) ...

# Stop the Spark session cleanly at the very end
spark.stop()
print("Spark session stopped.")
'''

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:37139)

In [35]:
# --- NEW IMPORTS for scikit-learn and Python XGBoost ---
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # Support Vector Classifier (equivalent to LinearSVC for binary, but more general)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier # Python XGBoost
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, average_precision_score
import joblib # For saving scikit-learn models
# --- Step 11: Data Splitting (Scikit-learn) ---
print("\n--- Step 11: Data Splitting (Scikit-learn) ---")
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in _get_connection(self)
    928         try:
--> 929             connection = self.deque.pop()
    930         except IndexError:

IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

ConnectionRefusedError                    Traceback (most recent call last)
~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in start(self)
   1066         try:
-> 1067             self.socket.connect((self.address, self.port))
   1068             self.stream = self.socket.makefile("rb")

ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Py4JNetworkError                          Traceback (most recent call last)
<ipython-input-25-bb54d2ffea1f> in <module>
     31 spark = SparkSession.builder \
     32     .appName("LungCancerModelTrainingWithScikitLearn") \
---> 33     .config("spark.driver.memory", "8g") \
     34     .getOrCreate()
     35 

~/spark/python/lib/pyspark.zip/pyspark/sql/session.py in getOrCreate(self)
    181                     session = SparkSession(sc)
    182                 for key, value in self._options.items():
--> 183                     session._jsparkSession.sessionState().conf().setConfString(key, value)
    184                 for key, value in self._options.items():
    185                     session.sparkContext._conf.set(key, value)

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1253             proto.END_COMMAND_PART
   1254 
-> 1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
   1257             answer, self.gateway_client, self.target_id, self.name)

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in send_command(self, command, retry, binary)
    981          if `binary` is `True`.
    982         """
--> 983         connection = self._get_connection()
    984         try:
    985             response = connection.send_command(command)

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in _get_connection(self)
    929             connection = self.deque.pop()
    930         except IndexError:
--> 931             connection = self._create_connection()
    932         return connection
    933 

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in _create_connection(self)
    935         connection = GatewayConnection(
    936             self.gateway_parameters, self.gateway_property)
--> 937         connection.start()
    938         return connection
    939 

~/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in start(self)
   1077                 "server ({0}:{1})".format(self.address, self.port)
   1078             logger.exception(msg)
-> 1079             raise Py4JNetworkError(msg, e)
   1080 
   1081     def _authenticate_connection(self):

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:37139)


# Stratified hold-out split: 20% of the rows go to the test partition and the
# fixed seed keeps the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report how many rows landed in each partition.
for split_label, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_label} Data Count: {split_features.shape[0]}")


# --- Step 12: Model Training with Grid Search and Cross-Validation (Scikit-learn) ---
print("\n--- Step 12: Model Training with Grid Search and Cross-Validation (Scikit-learn) ---")

# Registries keyed by model display name: the tuned estimator, and its
# ROC AUC measured on the held-out test split.
best_models = {}
best_metrics = {}

# --- 12.1: Logistic Regression with Grid Search ---
print("\n  Configuring Logistic Regression with Grid Search...")
# liblinear supports both the L1 and L2 penalties searched below.
lr = LogisticRegression(solver='liblinear', random_state=42)

lr_paramGrid = dict(
    C=[0.1, 1.0, 10.0],  # inverse of regularization strength
    penalty=['l1', 'l2'],
)

# 3-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.
lr_grid_search = GridSearchCV(
    estimator=lr,
    param_grid=lr_paramGrid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("  Fitting Logistic Regression GridSearchCV...")
lr_grid_search.fit(X_train, y_train)

# Register the best estimator found by the search, then score it on the test
# split using the predicted probability of the positive class (column 1).
best_models["Logistic Regression"] = best_lr_model = lr_grid_search.best_estimator_
lr_test_predictions_proba = best_lr_model.predict_proba(X_test)[:, 1]
best_metrics["Logistic Regression"] = best_lr_metric = roc_auc_score(
    y_test, lr_test_predictions_proba
)

print(f"  Best Logistic Regression Model AUC on test set: {best_lr_metric:.4f}")
print(f"  Best Logistic Regression Params: {lr_grid_search.best_params_}")


# --- 12.2: Support Vector Classifier (SVC) with Grid Search ---
print("\n  Configuring SVC (SVM) with Grid Search...")
# SVC is more general than LinearSVC and can use different kernels; C is the
# regularization parameter. probability=True is required because the test
# evaluation below calls predict_proba().
svc = SVC(random_state=42, probability=True)

svc_paramGrid = {
    'C': [0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf'],  # linear and Radial Basis Function (RBF) kernels
}

# 3-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.
svc_grid_search = GridSearchCV(svc, svc_paramGrid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

# The search object is a scikit-learn GridSearchCV (not a Spark CrossValidator),
# so the progress message names it accordingly, matching the other sections.
print("  Fitting SVC GridSearchCV...")
svc_grid_search.fit(X_train, y_train)

# Register the best estimator found by the search and score it on the test
# split using the predicted probability of the positive class (column 1).
best_svc_model = svc_grid_search.best_estimator_
best_models["SVC (SVM)"] = best_svc_model
svc_test_predictions_proba = best_svc_model.predict_proba(X_test)[:, 1]
best_svc_metric = roc_auc_score(y_test, svc_test_predictions_proba)
best_metrics["SVC (SVM)"] = best_svc_metric
print(f"  Best SVC (SVM) Model AUC on test set: {best_svc_metric:.4f}")
print(f"  Best SVC (SVM) Params: {svc_grid_search.best_params_}")


# --- 12.3: XGBoost with Grid Search (Python package) ---
print("\n  Configuring XGBoost with Grid Search (Python package)...")
# eval_metric is fixed on the estimator itself rather than searched in the grid.
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

xgb_paramGrid = dict(
    max_depth=[3, 5],
    learning_rate=[0.1, 0.05],
    subsample=[0.8, 1.0],
    n_estimators=[50, 100],  # number of boosting rounds (trees)
)

# 3-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_paramGrid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("  Fitting XGBoost GridSearchCV...")
xgb_grid_search.fit(X_train, y_train)

# Register the best estimator found by the search, then score it on the test
# split using the predicted probability of the positive class (column 1).
best_models["XGBoost"] = best_xgb_model = xgb_grid_search.best_estimator_
xgb_test_predictions_proba = best_xgb_model.predict_proba(X_test)[:, 1]
best_metrics["XGBoost"] = best_xgb_metric = roc_auc_score(
    y_test, xgb_test_predictions_proba
)

print(f"  Best XGBoost Model AUC on test set: {best_xgb_metric:.4f}")
print(f"  Best XGBoost Params: {xgb_grid_search.best_params_}")


# --- 12.4: Decision Tree Classifier with Grid Search ---
print("\n  Configuring Decision Tree Classifier with Grid Search...")
dt = DecisionTreeClassifier(random_state=42)

dt_paramGrid = dict(
    max_depth=[5, 10, 15],
    min_samples_leaf=[1, 5, 10],  # minimum number of samples required at a leaf node
)

# 3-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.
dt_grid_search = GridSearchCV(
    estimator=dt,
    param_grid=dt_paramGrid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("  Fitting Decision Tree Classifier GridSearchCV...")
dt_grid_search.fit(X_train, y_train)

# Register the best estimator found by the search, then score it on the test
# split using the predicted probability of the positive class (column 1).
best_models["Decision Tree"] = best_dt_model = dt_grid_search.best_estimator_
dt_test_predictions_proba = best_dt_model.predict_proba(X_test)[:, 1]
best_metrics["Decision Tree"] = best_dt_metric = roc_auc_score(
    y_test, dt_test_predictions_proba
)

print(f"  Best Decision Tree Model AUC on test set: {best_dt_metric:.4f}")
print(f"  Best Decision Tree Params: {dt_grid_search.best_params_}")


# --- 12.5: Random Forest Classifier with Grid Search ---
print("\n  Configuring Random Forest Classifier with Grid Search...")
rf = RandomForestClassifier(random_state=42)

rf_paramGrid = dict(
    n_estimators=[50, 100],  # number of trees in the forest
    max_depth=[5, 10],       # max depth of each tree
    min_samples_leaf=[1, 5],
)

# 3-fold cross-validation scored by ROC AUC; n_jobs=-1 uses all available cores.
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_paramGrid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("  Fitting Random Forest Classifier GridSearchCV...")
rf_grid_search.fit(X_train, y_train)

# Register the best estimator found by the search, then score it on the test
# split using the predicted probability of the positive class (column 1).
best_models["Random Forest"] = best_rf_model = rf_grid_search.best_estimator_
rf_test_predictions_proba = best_rf_model.predict_proba(X_test)[:, 1]
best_metrics["Random Forest"] = best_rf_metric = roc_auc_score(
    y_test, rf_test_predictions_proba
)

print(f"  Best Random Forest Model AUC on test set: {best_rf_metric:.4f}")
print(f"  Best Random Forest Params: {rf_grid_search.best_params_}")


# --- Step 13: Overall Best Model Selection and Final Evaluation ---
print("\n--- Step 13: Overall Best Model Selection and Final Evaluation ---")

# First list every candidate's test AUC ...
print("\nSummary of Best Model AUCs on Test Set (after cross-validation and hyperparameter tuning):")
for name, auc_score in best_metrics.items():
    print(f"- {name}: AUC = {auc_score:.4f}")

# ... then pick the highest one. Only a strictly greater score replaces the
# current leader, so ties keep the earliest registered model.
# NOTE(review): the winner is chosen by test-set AUC, so the "final" metrics
# below are measured on the same data that drove the selection.
overall_best_model_name = None
overall_best_auc = -1.0  # starting value below any AUC
for name, auc_score in best_metrics.items():
    if auc_score > overall_best_auc:
        overall_best_model_name, overall_best_auc = name, auc_score

print(f"\nOverall Best Model is: {overall_best_model_name} with AUC: {overall_best_auc:.4f}")

# Fetch the fitted estimator that belongs to the winning name.
overall_best_model = best_models[overall_best_model_name]

print(f"\n--- Detailed Evaluation of the Overall Best Model ({overall_best_model_name}) ---")

# Positive-class probabilities feed the ranking metrics (ROC AUC, PR AUC);
# hard class labels feed the threshold-based metrics.
final_predictions_proba = overall_best_model.predict_proba(X_test)[:, 1]
final_predictions_labels = overall_best_model.predict(X_test)

final_auc = roc_auc_score(y_test, final_predictions_proba)
final_aupr = average_precision_score(y_test, final_predictions_proba)  # area under the precision-recall curve
final_accuracy = accuracy_score(y_test, final_predictions_labels)
final_f1_score = f1_score(y_test, final_predictions_labels, average='weighted')
final_precision = precision_score(y_test, final_predictions_labels, average='weighted')
final_recall = recall_score(y_test, final_predictions_labels, average='weighted')

for metric_label, metric_value in (
    ("Final Area Under ROC (AUC)", final_auc),
    ("Final Area Under PR", final_aupr),
    ("Final Accuracy", final_accuracy),
    ("Final F1-Score", final_f1_score),
    ("Final Weighted Precision", final_precision),
    ("Final Weighted Recall", final_recall),
):
    print(f"  {metric_label}: {metric_value:.4f}")


# --- Step 14: Saving the Overall Best Model ---
print("\n--- Step 14: Saving the Overall Best Model ---")
model_save_base_path = "/home/talentum/shared/Project/lung_cancer_project/model_assets"
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(model_save_base_path, exist_ok=True)

# The file name is derived from the model's display name: spaces become
# underscores and the result is lower-cased.
final_best_model_save_path = os.path.join(
    model_save_base_path,
    f"best_model_{overall_best_model_name.replace(' ', '_').lower()}.joblib",
)

try:
    joblib.dump(overall_best_model, final_best_model_save_path)
    print(f"  Overall best model ({overall_best_model_name}) saved to: {final_best_model_save_path}")
except Exception as e:
    # Best-effort save: report the failure and continue so the SparkSession
    # below is still shut down.
    print(f"  Error saving overall best model: {e}")

# Nothing to unpersist here: no Spark DataFrame is cached in this
# scikit-learn based training approach.

# Stop the SparkSession
spark.stop()
print("\nSparkSession stopped.")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, anoth

SyntaxError: invalid syntax (<ipython-input-35-3d4b07c7fdda>, line 12)

In [38]:
# Initialization (re-run these if you are in a new session)
import os
import sys

# Point PySpark at the local Spark installation and its bundled Python libs.
os.environ["SPARK_HOME"] = "/home/talentum/spark"
os.environ["PYLIB"] = "{}/python/lib".format(os.environ["SPARK_HOME"])

# Python interpreters PySpark should use (worker side and driver side).
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3.6"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"

# Prepend the bundled archives to the import path. Each insert goes to
# position 0, so the archive listed last ends up first on sys.path.
for bundled_archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, os.environ["PYLIB"] + bundled_archive)

# Extra packages (spark-xml, spark-avro) requested through --packages.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages com.databricks:spark-xml_2.11:0.6.0,'
    'org.apache.spark:spark-avro_2.11:2.4.3 pyspark-shell'
)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, trim, datediff, to_date, unix_timestamp, count, avg # Added datediff, to_date, unix_timestamp, count, avg for feature engineering
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler # Import ML transformers
from pyspark.ml import Pipeline # Import Pipeline for chaining transformations

# Create a SparkSession
spark = SparkSession.builder.appName("LungCancerMLPreprocessing").getOrCreate()

# Read the raw CSV from the local filesystem: the first row supplies the column
# names and Spark infers each column's type from the data.
file_path = "file:///home/talentum/shared/Project/lung_cancer_project/Lung_Cancer.csv"
df = spark.read.options(header=True, inferSchema=True).csv(file_path)

# --- Initial Preprocessing Steps ---
# Normalise column types up front so that every later step sees real dates and
# integer 0/1 flags, whatever type schema inference picked.
boolean_like_cols = ["hypertension", "asthma", "cirrhosis", "other_cancer", "survived"]

# (column, target type) pairs: the two date columns first, then the flag columns.
cast_plan = [(date_column, "date") for date_column in ("diagnosis_date", "end_treatment_date")]
cast_plan += [(flag_column, "int") for flag_column in boolean_like_cols]

for target_column, target_type in cast_plan:
    df = df.withColumn(target_column, col(target_column).cast(target_type))

# Re-verify missing values (a safeguard in case the input data changes).
# A value counts as missing when it is NULL or, for string columns, blank after trimming.
print("\n--- Re-Verifying Missing Values Before Advanced Preprocessing ---")
missing_values_check = {}
if df.columns:
    # Count the missing values of every column in ONE aggregation instead of
    # launching a separate Spark job per column.
    missing_count_exprs = []
    for column in df.columns:
        is_missing = col(column).isNull()
        if df.schema[column].dataType.typeName() == "string":
            is_missing = is_missing | (trim(col(column)) == "")
        # when() without otherwise() yields NULL for non-missing rows, and
        # count() skips NULLs, so this counts exactly the missing rows.
        missing_count_exprs.append(count(when(is_missing, lit(1))))
    missing_counts = df.agg(*missing_count_exprs).collect()[0]
    # Read the result row by position so generated column names do not matter.
    missing_values_check = {
        column: missing_count
        for column, missing_count in zip(df.columns, missing_counts)
        if missing_count > 0
    }

if missing_values_check:
    print("WARNING: Missing values detected after initial load. Imputing before ML preprocessing:")
    # Impute numerical columns with their mean (avg() ignores NULLs).
    numerical_cols_for_imputation = ["age", "bmi", "cholesterol_level"]
    for col_name in numerical_cols_for_imputation:
        if col_name not in missing_values_check:
            continue
        mean_value = df.agg(avg(col_name)).collect()[0][0]
        if mean_value is None:
            # Every value is NULL, so there is nothing to average.
            print(f"  Could not impute numerical column '{col_name}': no non-null values to average.")
            continue
        df = df.fillna({col_name: mean_value})
        print(f"  Imputed numerical column '{col_name}' with mean: {mean_value:.2f}")

    # Impute categorical columns with their mode
    categorical_cols_for_imputation = ["gender", "country", "cancer_stage", "family_history", "smoking_status", "treatment_type"]
    for col_name in categorical_cols_for_imputation:
        if col_name not in missing_values_check:
            continue
        is_blank = col(col_name).isNull() | (trim(col(col_name)) == "")
        # The mode is taken over the PRESENT values only. Otherwise NULL/blank
        # itself can win when it is the most frequent "value", and the column
        # would be imputed with the very marker being replaced.
        # Ties are broken by value so reruns pick the same mode.
        mode_value_row = (
            df.filter(~is_blank)
            .groupBy(col_name)
            .count()
            .orderBy(col("count").desc(), col(col_name).asc())
            .first()
        )
        if mode_value_row is None:
            # No non-missing value exists in this column.
            print(f"  No distinct values found for '{col_name}' to calculate mode.")
            continue
        mode_value = mode_value_row[0]
        df = df.withColumn(col_name, when(is_blank, lit(mode_value)).otherwise(col(col_name)))
        print(f"  Imputed categorical column '{col_name}' with mode: '{mode_value}'")

    # Report columns that have missing values but no imputation rule above,
    # so they are not silently carried into the ML pipeline.
    handled_cols = set(numerical_cols_for_imputation) | set(categorical_cols_for_imputation)
    unhandled_cols = sorted(name for name in missing_values_check if name not in handled_cols)
    if unhandled_cols:
        print(f"  WARNING: No imputation rule for column(s) with missing values: {unhandled_cols}")
else:
    print("No missing values detected. Proceeding to ML preprocessing.")


# --- Step 5: Feature Engineering ---
# (Steps 1-4 were data inspection and initial cleaning.)
print("\n--- Step 5: Feature Engineering ---")

# 5.1: Treatment duration = whole days from diagnosis to the end of treatment.
raw_duration = datediff(col("end_treatment_date"), col("diagnosis_date"))

# Clamp negative durations to 0. A NULL duration (either date missing) makes the
# comparison NULL, falls through to otherwise() and therefore stays NULL for the
# imputation step below.
df = df.withColumn("treatment_duration_days",
                   when(raw_duration < 0, 0).otherwise(raw_duration))

# Fill any remaining NULL durations with the column mean, when one can be computed.
missing_duration_count = df.where(col("treatment_duration_days").isNull()).count()
if missing_duration_count == 0:
    print("  'treatment_duration_days' created and is clean.")
else:
    mean_duration = df.select(avg("treatment_duration_days")).first()[0]
    if mean_duration is None:
        print(f"  Could not impute 'treatment_duration_days' as mean was None (e.g., all values are null).")
    else:
        df = df.fillna({"treatment_duration_days": mean_duration})
        print(f"  Imputed 'treatment_duration_days' with mean: {mean_duration:.2f} due to nulls after calculation.")

# 5.2: Bucket age into labelled ranges. Ages below 18 and NULL ages match none
# of the ranges and end up as "Unknown".
age_col = col("age")
age_group_expr = (
    when(age_col.between(18, 30), "18-30")
    .when((age_col > 30) & (age_col <= 45), "31-45")
    .when((age_col > 45) & (age_col <= 60), "46-60")
    .when(age_col > 60, "61+")
    .otherwise("Unknown")
)
df = df.withColumn("age_group", age_group_expr)

# 5.3: Bucket BMI into weight categories. A NULL BMI matches no branch and ends
# up as "Unknown".
bmi_col = col("bmi")
bmi_category_expr = (
    when(bmi_col < 18.5, "Underweight")
    .when((bmi_col >= 18.5) & (bmi_col < 25), "Normal weight")
    .when((bmi_col >= 25) & (bmi_col < 30), "Overweight")
    .when(bmi_col >= 30, "Obese")
    .otherwise("Unknown")
)
df = df.withColumn("bmi_category", bmi_category_expr)

# 5.4: has_comorbidity is 1 when at least one of the condition flags equals 1, else 0.
comorbidity_flags = ["hypertension", "asthma", "cirrhosis", "other_cancer"]
any_comorbidity = col(comorbidity_flags[0]) == 1
for flag_name in comorbidity_flags[1:]:
    any_comorbidity = any_comorbidity | (col(flag_name) == 1)
df = df.withColumn("has_comorbidity", when(any_comorbidity, 1).otherwise(0))
print("  New features 'treatment_duration_days', 'age_group', 'bmi_category', 'has_comorbidity' created.")

# Show the resulting schema and a small sample for a visual sanity check.
print("\nSchema after Feature Engineering:")
df.printSchema()
print("\nFirst 5 rows after Feature Engineering:")
df.show(5)


# --- Step 6: Categorical Feature Encoding (One-Hot Encoding using Pipeline) ---
print("\n--- Step 6: Categorical Feature Encoding (One-Hot Encoding) ---")

# String-valued columns that must become numeric before they can be assembled
# into a feature vector.
categorical_cols = ["gender", "country", "cancer_stage", "family_history",
                    "smoking_status", "treatment_type", "age_group", "bmi_category"]

# For each categorical column build two stages:
#   <col>          --StringIndexer-->  <col>_indexed  (string -> index)
#   <col>_indexed  --OneHotEncoder-->  <col>_encoded  (index -> vector)
string_indexer_stages = []
one_hot_encoder_stages = []
encoded_cols = []  # names of the one-hot output columns, in categorical_cols order
for column in categorical_cols:
    indexed_name = column + "_indexed"
    encoded_name = column + "_encoded"
    string_indexer_stages.append(
        StringIndexer(inputCol=column, outputCol=indexed_name, handleInvalid="keep"))
    one_hot_encoder_stages.append(
        OneHotEncoder(inputCol=indexed_name, outputCol=encoded_name, dropLast=True))
    encoded_cols.append(encoded_name)

# All indexers run first, then all encoders.
stages = [*string_indexer_stages, *one_hot_encoder_stages]
print(f"  Prepared StringIndexer and OneHotEncoder for {len(categorical_cols)} categorical columns.")


# --- Step 7: Numerical Feature Assembly and Scaling (using Pipeline) ---
print("\n--- Step 7: Numerical Feature Assembly and Scaling ---")

# Continuous inputs: the original numeric columns plus the engineered duration.
numerical_cols = ["age", "bmi", "cholesterol_level", "treatment_duration_days"]

# 0/1 integer flags that are used as-is, without encoding.
binary_cols = ["hypertension", "asthma", "cirrhosis", "other_cancer", "has_comorbidity"]

# Assembler input order: numerical, then binary, then the one-hot encoded columns.
feature_cols = [*numerical_cols, *binary_cols, *encoded_cols]

# Pack every feature column into the single vector column "features_unscaled".
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features_unscaled",
    handleInvalid="keep",
)
stages.append(assembler)
print(f"  Combined {len(feature_cols)} individual features into a single 'features_unscaled' vector.")

# Standardise the assembled vector: withMean=True centres and withStd=True scales
# to unit standard deviation. The scaler's input is the whole vector, so the
# binary and one-hot entries are standardised too.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features_scaled",
    withMean=True,
    withStd=True,
)
stages.append(scaler)
print("  Prepared StandardScaler to scale 'features_unscaled' into 'features_scaled'.")


# --- Step 8: Build and Run the Preprocessing Pipeline ---
print("\n--- Step 8: Building and Running Preprocessing Pipeline ---")

# Chain every prepared stage (indexers, encoders, assembler, scaler) into one Pipeline.
pipeline = Pipeline(stages=stages)

# fit() learns the stage parameters from the full DataFrame; transform() then
# applies them to that same DataFrame.
# NOTE(review): fitting happens before any train/test split, so the learned
# statistics also reflect rows that may later be used for testing -- confirm
# this is intended.
pipeline_model = pipeline.fit(df)
processed_df = pipeline_model.transform(df)
print("  Preprocessing pipeline fitted and data transformed successfully.")

# --- Step 9: Final Data Preparation for ML Models ---
print("\n--- Step 9: Final Data Preparation for ML Models ---")

# Keep only the two columns the models need, under the names "features" and "label".
features_column = col("features_scaled").alias("features")
label_column = col("survived").alias("label")
final_ml_df = processed_df.select(features_column, label_column)

print("\nSchema of the final DataFrame ready for ML:")
final_ml_df.printSchema()

print("\nFirst 5 rows of the final DataFrame ready for ML (features are dense/sparse vectors):")
final_ml_df.show(5, truncate=False)

# --- Step 10: Save the Preprocessing Pipeline ---
# Persist the fitted pipeline so the exact same transformations can be applied
# to new, unseen input before making predictions.
# NOTE(review): unlike the CSV path, this path carries no "file://" scheme, so
# presumably it resolves against Spark's default filesystem -- confirm that is
# where the pipeline is expected to land.
pipeline_save_path = "/home/talentum/shared/Project/lung_cancer_project/model_assets/preprocessing_pipeline"
print(f"\n--- Step 10: Saving Preprocessing Pipeline to: {pipeline_save_path} ---")
try:
    # overwrite() replaces a previously saved pipeline at the same path.
    pipeline_writer = pipeline_model.write().overwrite()
    pipeline_writer.save(pipeline_save_path)
    print("  Preprocessing pipeline saved successfully.")
except Exception as save_error:
    # Best effort: report the failure and let the rest of the cell continue.
    print(f"  Error saving pipeline: {save_error}")
    print("  Please ensure the directory exists and Spark has write permissions.")

# --- Next: Model Training and Evaluation ---
# Printed guidance on what to do with `final_ml_df` after this cell.
next_step_messages = (
    "\n--- Next Steps: Model Training and Evaluation ---",
    "  The 'final_ml_df' is now ready. You would proceed with:",
    "  1. Splitting 'final_ml_df' into training and test sets (e.g., 80/20 split).",
    "  2. Training Logistic Regression, LinearSVC (for SVM), and GBTClassifier (for XGBoost) models.",
    "  3. Evaluating these models on the test set using appropriate metrics (e.g., accuracy, precision, recall, F1-score, AUC).",
    "  4. Saving the best performing trained model(s) for later deployment.",
)
for message in next_step_messages:
    print(message)


# Shut the SparkSession down now that preprocessing is finished.
spark.stop()
print("\nSparkSession stopped.")

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37139)
Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/talentum/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:37139)