#### Load the Dataset into Spark


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TreeCoverClassification").getOrCreate()
df = spark.read.csv("train.csv", header=True, inferSchema=True)


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/train.csv.

#### Separate Features and Target



In [None]:
from pyspark.ml.feature import VectorAssembler

feature_cols = [col for col in df.columns if col != "Cover_Type"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df = assembler.transform(df)

# Rename target column for clarity
df = df.withColumnRenamed("Cover_Type", "label")


#### Split the data into training and testing sets.




In [None]:
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)


#### Standardize the Features




In [None]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(train_data)
train_data = scaler_model.transform(train_data)
test_data = scaler_model.transform(test_data)


#### Replace DecisionTreeClassifier and RandomForestClassifier from scikit-learn with Spark MLlib versions




In [None]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier ,GBTClassifier

# Decision Tree
dt = DecisionTreeClassifier(featuresCol="scaledFeatures", labelCol="label")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

# Random Forest
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="label", numTrees=100)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)



#### Results




In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(dt_predictions)

# F1 Score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = f1_evaluator.evaluate(dt_predictions)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1_score}")




Accuracy: 0.6668914362778152
F1 Score: 0.6485856839016301


In [None]:
# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy1 = accuracy_evaluator.evaluate(rf_predictions)

# F1 Score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score1 = f1_evaluator.evaluate(rf_predictions)

print(f"Accuracy: {accuracy1}")
print(f"F1 Score: {f1_score1}")




Accuracy: 0.7039784221173297
F1 Score: 0.6900678760250907


#### Trying to improve Random Forest

In [None]:
# Random Forest
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="label", numTrees=100,minInstancesPerNode=5,maxDepth= 7)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)


In [None]:
# Accuracy
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy2 = accuracy_evaluator.evaluate(rf_predictions)

# F1 Score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score2 = f1_evaluator.evaluate(rf_predictions)

print(f"Accuracy: {accuracy2}")
print(f"F1 Score: {f1_score2}")


Accuracy: 0.7400539447066756
F1 Score: 0.7309724431620451


In [None]:
# Random Forest
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="label", numTrees=100,minInstancesPerNode=5,maxDepth= 7,subsamplingRate = 0.5)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)


# Train the model
rf_model = rf.fit(train_data)

# Make predictions
rf_predictions = rf_model.transform(test_data)

# Evaluate the model (accuracy, F1 score, etc.)
accuracy = accuracy_evaluator.evaluate(rf_predictions)
f1_score = f1_evaluator.evaluate(rf_predictions)

print(f"Random Forest Accuracy: {accuracy}")
print(f"Random Forest F1 Score: {f1_score}")


Random Forest Accuracy: 0.7400539447066756
Random Forest F1 Score: 0.7309666287772693


In [None]:
# Random Forest
rf = RandomForestClassifier(featuresCol="scaledFeatures", labelCol="label", numTrees=100,minInstancesPerNode=5,maxDepth= 7,subsamplingRate = 0.5)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)


# Train the model
rf_model = rf.fit(train_data)

# Make predictions
rf_predictions = rf_model.transform(test_data)

# Evaluate the model (accuracy, F1 score, etc.)
accuracy = accuracy_evaluator.evaluate(rf_predictions)
f1_score = f1_evaluator.evaluate(rf_predictions)

print(f"Random Forest Accuracy: {accuracy}")
print(f"Random Forest F1 Score: {f1_score}")


Random Forest Accuracy: 0.7400539447066756
Random Forest F1 Score: 0.7309666287772693


After adjusting the model's parameters, I found that the optimal configuration was achieved with the following settings: numTrees=100, minInstancesPerNode=5, maxDepth=7, and subsamplingRate=0.5.

With these parameters, the Random Forest model achieved an accuracy of 72.66% and an F1 score of 71.64%


### Save the Trained Model


In [None]:
import os
os.environ['HADOOP_HOME'] = 'C:\\hadoop'
os.environ['PATH'] += os.pathsep + os.path.join(os.environ['HADOOP_HOME'], 'bin')

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType

class ForestCoverPredictor:
    def __init__(self, model_path):
        # Initialize Spark Session
        self.spark = SparkSession.builder \
            .appName("ForestCoverTypePrediction") \
            .getOrCreate()

        # Load the saved model
        self.model = RandomForestClassificationModel.load(model_path)

        # Define feature columns
        self.feature_columns = [
            'Elevation', 'Aspect', 'Slope',
            'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
            'Horizontal_Distance_To_Roadways',
            'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
            'Horizontal_Distance_To_Fire_Points'
        ]

        # Add Wilderness Area columns
        self.feature_columns.extend([
            'Wilderness_Area1', 'Wilderness_Area2',
            'Wilderness_Area3', 'Wilderness_Area4'
        ])

        # Add Soil Type columns
        self.feature_columns.extend([f'Soil_Type{i}' for i in range(1, 41)])

        # Create feature assembler
        self.assembler = VectorAssembler(
            inputCols=self.feature_columns,
            outputCol="features"
        )

        # Create scaler
        self.scaler = StandardScaler(
            inputCol="features",
            outputCol="scaledFeatures",
            withStd=True,
            withMean=True
        )

    def create_prediction_schema(self):
        """
        Create a schema for input data
        """
        schema = StructType([
            StructField("Id", IntegerType(), True)
        ] + [
            StructField(col, DoubleType(), True)
            for col in self.feature_columns
        ])
        return schema

    def preprocess_data(self, input_data):
        """
        Preprocess input data for prediction
        """
        # Assemble features
        assembled = self.assembler.transform(input_data)

        # Scale features
        scaled = self.scaler.transform(assembled)

        return scaled

    def predict(self, input_data):
        """
        Make predictions on input data

        Args:
        input_data (spark.DataFrame): DataFrame with feature columns

        Returns:
        spark.DataFrame: DataFrame with predictions
        """
        # Preprocess data
        preprocessed = self.preprocess_data(input_data)

        # Make predictions
        predictions = self.model.transform(preprocessed)

        return predictions

    def predict_from_list(self, input_list):
        """
        Make prediction from a list of feature values

        Args:
        input_list (list): List of feature values matching the feature columns

        Returns:
        int: Predicted forest cover type
        """
        # Create DataFrame from input list
        df = self.spark.createDataFrame([input_list], schema=self.create_prediction_schema())

        # Make prediction
        predictions = self.predict(df)

        # Return predicted cover type
        return predictions.select("prediction").collect()[0]["prediction"]

    def close(self):
        """
        Close the Spark session
        """
        self.spark.stop()

# Example Usage
def main():
    # Path where the model was saved

    # Initialize the predictor
    predictor = rf_model

    # Example input (replace with actual values)
    sample_input = [
        1,  # Id
        2500.0,  # Elevation
        180.0,  # Aspect
        15.0,  # Slope
        300.0,  # Horizontal_Distance_To_Hydrology
        50.0,   # Vertical_Distance_To_Hydrology
        1500.0, # Horizontal_Distance_To_Roadways
        200.0,  # Hillshade_9am
        250.0,  # Hillshade_Noon
        180.0,  # Hillshade_3pm
        2000.0, # Horizontal_Distance_To_Fire_Points
        1, 0, 0, 0,  # Wilderness Areas
        *([0] * 40)  # Soil Types (40 zeros as placeholder)
    ]

    # Make a prediction
    prediction = predictor.predict_from_list(sample_input)
    print(f"Predicted Forest Cover Type: {prediction}")

    # Close the Spark session
    predictor.close()

if __name__ == "__main__":
    main()

AttributeError: 'RandomForestClassificationModel' object has no attribute 'predict_from_list'

In [None]:
#!pip install gradio
#!pip install --upgrade typing_extensions
#!pip show typing_extensions
#!pip install --upgrade gradio pydantic fastapi
!pip install --upgrade typing-extensions


In [None]:
import gradio as gr


In [None]:

from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession
import gradio as gr

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RandomForestDeployment") \
    .getOrCreate()

# Load the trained Random Forest model
model = rf_model

# Define the prediction function
def predict_forest_cover(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
                         Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
                         Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
                         Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
                         Wilderness_Area3, Wilderness_Area4, *Soil_Types):
    # Create a DataFrame for the input
    data = [(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
             Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
             Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
             Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
             Wilderness_Area3, Wilderness_Area4, *Soil_Types)]
    columns = [
        "Id", "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2",
        "Wilderness_Area3", "Wilderness_Area4"
    ] + [f"Soil_Type{i}" for i in range(1, 41)]

    input_df = spark.createDataFrame(data, columns)

    # Make predictions
    predictions = model.transform(input_df)
    prediction = predictions.select("prediction").collect()[0]["prediction"]
    return int(prediction)

# Define Gradio interface
inputs = [
    gr.inputs.Number(label="Id"),
    gr.inputs.Number(label="Elevation"),
    gr.inputs.Number(label="Aspect"),
    gr.inputs.Number(label="Slope"),
    gr.inputs.Number(label="Horizontal Distance to Hydrology"),
    gr.inputs.Number(label="Vertical Distance to Hydrology"),
    gr.inputs.Number(label="Horizontal Distance to Roadways"),
    gr.inputs.Number(label="Hillshade 9am"),
    gr.inputs.Number(label="Hillshade Noon"),
    gr.inputs.Number(label="Hillshade 3pm"),
    gr.inputs.Number(label="Horizontal Distance to Fire Points"),
    gr.inputs.Checkbox(label="Wilderness Area 1"),
    gr.inputs.Checkbox(label="Wilderness Area 2"),
    gr.inputs.Checkbox(label="Wilderness Area 3"),
    gr.inputs.Checkbox(label="Wilderness Area 4")
] + [gr.inputs.Checkbox(label=f"Soil Type {i}") for i in range(1, 41)]

output = gr.outputs.Label(label="Cover Type")

app = gr.Interface(fn=predict_forest_cover, inputs=inputs, outputs=output, live=True)

# Launch the app
app.launch()


AttributeError: module 'gradio' has no attribute 'inputs'

In [None]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession
import gradio as gr

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RandomForestDeployment") \
    .getOrCreate()

# Load the trained Random Forest model
model_path = "path_to_your_saved_model"
model = rf_model


# Define the prediction function
def predict_forest_cover(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
                         Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
                         Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
                         Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
                         Wilderness_Area3, Wilderness_Area4, *Soil_Types):
    # Convert Checkbox inputs to integers (1 if checked, else 0)
    wilderness_areas = [int(Wilderness_Area1), int(Wilderness_Area2), int(Wilderness_Area3), int(Wilderness_Area4)]
    soil_types = [int(soil) for soil in Soil_Types]

    # Create a DataFrame for the input
    data = [(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
             Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
             Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
             Horizontal_Distance_To_Fire_Points, *wilderness_areas, *soil_types)]
    columns = [
        "Id", "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2",
        "Wilderness_Area3", "Wilderness_Area4"
    ] + [f"Soil_Type{i}" for i in range(1, 41)]

    input_df = spark.createDataFrame(data, columns)

    # Make predictions
    predictions = model.transform(input_df)
    prediction = predictions.select("prediction").collect()[0]["prediction"]
    return int(prediction)

# Define Gradio interface
inputs = [
    gr.Number(label="Id"),
    gr.Number(label="Elevation"),
    gr.Number(label="Aspect"),
    gr.Number(label="Slope"),
    gr.Number(label="Horizontal Distance to Hydrology"),
    gr.Number(label="Vertical Distance to Hydrology"),
    gr.Number(label="Horizontal Distance to Roadways"),
    gr.Number(label="Hillshade 9am"),
    gr.Number(label="Hillshade Noon"),
    gr.Number(label="Hillshade 3pm"),
    gr.Number(label="Horizontal Distance to Fire Points"),
    gr.Checkbox(label="Wilderness Area 1"),
    gr.Checkbox(label="Wilderness Area 2"),
    gr.Checkbox(label="Wilderness Area 3"),
    gr.Checkbox(label="Wilderness Area 4")
] + [gr.Checkbox(label=f"Soil Type {i}") for i in range(1, 41)]

output = gr.Label(label="Cover Type")

app = gr.Interface(fn=predict_forest_cover, inputs=inputs, outputs=output, live=True)

# Launch the app
app.launch()


In [None]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession
import gradio as gr

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RandomForestDeployment") \
    .getOrCreate()

# Load the trained Random Forest model
model_path = "path_to_your_saved_model"
model = RandomForestClassificationModel.load(model_path)

# Define the prediction function
def predict_cover_type(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
                       Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
                       Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
                       Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
                       Wilderness_Area3, Wilderness_Area4, Soil_Types):
    # Prepare input data as a dictionary
    input_data = {
        "Id": Id,
        "Elevation": Elevation,
        "Aspect": Aspect,
        "Slope": Slope,
        "Horizontal_Distance_To_Hydrology": Horizontal_Distance_To_Hydrology,
        "Vertical_Distance_To_Hydrology": Vertical_Distance_To_Hydrology,
        "Horizontal_Distance_To_Roadways": Horizontal_Distance_To_Roadways,
        "Hillshade_9am": Hillshade_9am,
        "Hillshade_Noon": Hillshade_Noon,
        "Hillshade_3pm": Hillshade_3pm,
        "Horizontal_Distance_To_Fire_Points": Horizontal_Distance_To_Fire_Points,
        "Wilderness_Area1": int(Wilderness_Area1),
        "Wilderness_Area2": int(Wilderness_Area2),
        "Wilderness_Area3": int(Wilderness_Area3),
        "Wilderness_Area4": int(Wilderness_Area4),
    }

    # Add Soil Type columns dynamically
    for i in range(1, 41):
        input_data[f"Soil_Type{i}"] = 1 if i in Soil_Types else 0

    # Convert to Spark DataFrame
    input_df = spark.createDataFrame([input_data])

    # Perform prediction
    predictions = model.transform(input_df)
    prediction = predictions.select("prediction").collect()[0]["prediction"]

    return f"Predicted Cover Type: {int(prediction)}"

# Define Gradio interface
inputs = [
    gr.Number(label="Id"),
    gr.Number(label="Elevation"),
    gr.Number(label="Aspect"),
    gr.Number(label="Slope"),
    gr.Number(label="Horizontal Distance to Hydrology"),
    gr.Number(label="Vertical Distance to Hydrology"),
    gr.Number(label="Horizontal Distance to Roadways"),
    gr.Number(label="Hillshade 9am"),
    gr.Number(label="Hillshade Noon"),
    gr.Number(label="Hillshade 3pm"),
    gr.Number(label="Horizontal Distance to Fire Points"),
    gr.Checkbox(label="Wilderness Area 1"),
    gr.Checkbox(label="Wilderness Area 2"),
    gr.Checkbox(label="Wilderness Area 3"),
    gr.Checkbox(label="Wilderness Area 4"),
    gr.CheckboxGroup([i for i in range(1, 41)], label="Select Soil Types"),
]

output = gr.Textbox(label="Prediction Result")

# Create Gradio interface
app = gr.Interface(fn=predict_cover_type, inputs=inputs, outputs=output)

# Launch the app
app.launch()


In [None]:
import pkg_resources

requirements = [
    "aiofiles>=22.0,<24.0",
    "anyio>=3.0,<5.0",
    "fastapi>=0.115.2,<1.0",
    "ffmpy",
    "gradio_client==1.5.2",
    "httpx>=0.24.1",
    "huggingface_hub>=0.25.1",
    "Jinja2<4.0",
    "markupsafe~=2.0",
    "numpy>=1.0,<3.0",
    "orjson~=3.0",
    "packaging",
    "pandas>=1.0,<3.0",
    "pillow>=8.0,<12.0",
    "pydantic>=2.0",
    "python-multipart>=0.0.18",
    "pydub",
    "pyyaml>=5.0,<7.0",
    "ruff>=0.2.2",
    "safehttpx>=0.1.6,<0.2.0",
    "semantic_version~=2.0",
    "starlette>=0.40.0,<1.0",
    "tomlkit>=0.12.0,<0.14.0",
    "typer>=0.12,<1.0",
    "typing_extensions~=4.0",
    "uvicorn>=0.14.0",
]

for req in requirements:
    try:
        pkg_resources.require(req)
        print(f"{req} is satisfied.")
    except pkg_resources.DistributionNotFound as e:
        print(f"{req} is NOT installed: {e}")
    except pkg_resources.VersionConflict as e:
        print(f"{req} has a version conflict: {e}")


In [None]:
#!pip uninstall typing_extensions
#!pip install typing_extensions~=4.0


In [None]:
#!pip show typing_extensions


In [None]:
#!pip uninstall typing_extensions


In [None]:
#!pip install typing_extensions==4.0


In [None]:
#!pip install --upgrade typing-extensions


In [None]:
#!pip install huggingface_hub


In [None]:
!python --version

Python 3.11.7


In [None]:
!pip show gradio

Name: gradio
Version: 5.9.0
Summary: Python library for easily interacting with trained machine learning models
Home-page: https://github.com/gradio-app/gradio
Author: 
Author-email: Abubakar Abid <gradio-team@huggingface.co>, Ali Abid <gradio-team@huggingface.co>, Ali Abdalla <gradio-team@huggingface.co>, Dawood Khan <gradio-team@huggingface.co>, Ahsen Khaliq <gradio-team@huggingface.co>, Pete Allen <gradio-team@huggingface.co>, Ömer Faruk Özdemir <gradio-team@huggingface.co>, Freddy A Boulton <gradio-team@huggingface.co>, Hannah Blair <gradio-team@huggingface.co>
License: Apache-2.0
Location: C:\Users\user\anaconda3\Lib\site-packages
Requires: aiofiles, anyio, fastapi, ffmpy, gradio-client, httpx, huggingface-hub, jinja2, markupsafe, numpy, orjson, packaging, pandas, pillow, pydantic, pydub, python-multipart, pyyaml, ruff, safehttpx, semantic-version, starlette, tomlkit, typer, typing-extensions, uvicorn
Required-by: 


Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
      

Using existing dataset file at: .gradio\flagged\dataset1.csv


In [None]:
import gradio as gr
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RandomForestDeployment") \
    .getOrCreate()

# Load the trained Random Forest model
model_path = "path_to_your_saved_model"
model = rf_model

# Define the prediction function
def predict_forest_cover(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
                         Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
                         Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
                         Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
                         Wilderness_Area3, Wilderness_Area4, *Soil_Types):
    # Create a DataFrame for the input
    data = [(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
             Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
             Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
             Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
             Wilderness_Area3, Wilderness_Area4, *Soil_Types)]
    columns = [
        "Id", "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2",
        "Wilderness_Area3", "Wilderness_Area4"
    ] + [f"Soil_Type{i}" for i in range(1, 41)]



    # Assuming the features in input_df are already in vector format, we can scale them

    # Define the StandardScaler
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

    # Apply the scaler to the input dataframe
    scaled_df = scaler.transform(input_df)

    # Now pass the scaled features to the model for prediction
    predictions = model.transform(scaled_df)
    prediction = predictions.select("prediction").collect()[0]["prediction"]

    return int(prediction)

# Define Gradio interface
inputs = [
    gr.Number(label="Id"),
    gr.Number(label="Elevation"),
    gr.Number(label="Aspect"),
    gr.Number(label="Slope"),
    gr.Number(label="Horizontal Distance to Hydrology"),
    gr.Number(label="Vertical Distance to Hydrology"),
    gr.Number(label="Horizontal Distance to Roadways"),
    gr.Number(label="Hillshade 9am"),
    gr.Number(label="Hillshade Noon"),
    gr.Number(label="Hillshade 3pm"),
    gr.Number(label="Horizontal Distance to Fire Points"),
    gr.Checkbox(label="Wilderness Area 1"),
    gr.Checkbox(label="Wilderness Area 2"),
    gr.Checkbox(label="Wilderness Area 3"),
    gr.Checkbox(label="Wilderness Area 4")
] + [gr.Checkbox(label=f"Soil Type {i}") for i in range(1, 41)]

output = gr.Label(label="Cover Type")

app = gr.Interface(fn=predict_forest_cover, inputs=inputs, outputs=output, live=True)

# Launch the app
app.launch()
test_data = [
    (0, 2550, 48, 11, 110, 22, 210, 155, 210, 215, 520, 0, 1, 0, 0, *[0]*40),
]


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
      

In [None]:
import gradio as gr
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F

# Initialize Spark session
spark = SparkSession.builder \
    .appName("RandomForestDeployment") \
    .getOrCreate()

# Load the trained Random Forest model
#model_path =
model = rf_model

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import DataFrame

def predict_forest_cover(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
                         Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
                         Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
                         Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
                         Wilderness_Area3, Wilderness_Area4, *Soil_Types):
    # Create a DataFrame for the input
    data = [(Id, Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology,
             Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways,
             Hillshade_9am, Hillshade_Noon, Hillshade_3pm,
             Horizontal_Distance_To_Fire_Points, Wilderness_Area1, Wilderness_Area2,
             Wilderness_Area3, Wilderness_Area4, *Soil_Types)]
    columns = [
        "Id", "Elevation", "Aspect", "Slope", "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology", "Horizontal_Distance_To_Roadways",
        "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points", "Wilderness_Area1", "Wilderness_Area2",
        "Wilderness_Area3", "Wilderness_Area4"
    ] + [f"Soil_Type{i}" for i in range(1, 41)]

    input_df = spark.createDataFrame(data, schema=columns)

    # Assemble features into a single vector column
    feature_cols = columns[1:]  # Exclude "Id"
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    assembled_df = assembler.transform(input_df)

    # Scale the features
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)
    scaler_model = scaler.fit(assembled_df)
    scaled_df = scaler_model.transform(assembled_df)

    # Predict using the trained model
    predictions = model.transform(scaled_df)
    prediction = predictions.select("prediction").collect()[0]["prediction"]

    return int(prediction)


# Define Gradio interface
inputs = [
    gr.Number(label="Elevation"),
    gr.Number(label="Aspect"),
    gr.Number(label="Slope"),
    gr.Number(label="Horizontal Distance to Hydrology"),
    gr.Number(label="Vertical Distance to Hydrology"),
    gr.Number(label="Horizontal Distance to Roadways"),
    gr.Number(label="Hillshade 9am"),
    gr.Number(label="Hillshade Noon"),
    gr.Number(label="Hillshade 3pm"),
    gr.Number(label="Horizontal Distance to Fire Points"),
    gr.Checkbox(label="Wilderness Area 1"),
    gr.Checkbox(label="Wilderness Area 2"),
    gr.Checkbox(label="Wilderness Area 3"),
    gr.Checkbox(label="Wilderness Area 4")
] + [gr.Checkbox(label=f"Soil Type {i}") for i in range(1, 41)]


output = gr.Label(label="Cover Type")

app = gr.Interface(fn=predict_forest_cover, inputs=inputs, outputs=output)

# Launch the app
app.launch()


* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\gradio\blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
      

Created dataset file at: .gradio\flagged\dataset2.csv
