In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import os

# Folder whose CSV files are loaded further down
data_folder = r"ASL-Sensor-Dataglove-Dataset\ASL-Sensor-Dataglove-Dataset\025"

# Build (or reuse) the Spark session, with Hadoop's default filesystem set to
# the local file:/// scheme
session_builder = SparkSession.builder.appName("ASL_Model")
session_builder = session_builder.config("spark.hadoop.fs.defaultFS", "file:///")
spark = session_builder.getOrCreate()

# Print the Spark version as a quick check that the session is up
print(spark.version)

# Function to load CSV files and create a label column based on the file name
def load_csv_files(folder_path):
    """Read every CSV file in *folder_path* into a labelled Spark DataFrame.

    Each file is read with a header row and schema inference, then gets an
    extra string column ``label`` holding the part of the file name before
    its first dot.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one DataFrame per CSV file, ordered by file name. The
        list is empty when the folder contains no CSV files.
    """
    dataframes = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the combined dataset (and therefore any seeded split made
    # from it) stable across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so "A.CSV" is not skipped.
        if not file_name.lower().endswith(".csv"):
            continue
        label = file_name.split(".")[0]  # Use the file name as the label
        df = spark.read.csv(
            os.path.join(folder_path, file_name),
            header=True,
            inferSchema=True,
        )
        dataframes.append(df.withColumn("label", lit(label)))
    return dataframes

# Load data
dataframes = load_csv_files(data_folder)
if not dataframes:
    # Fail with a clear message instead of an IndexError on dataframes[0]
    raise FileNotFoundError(f"No CSV files found in {data_folder!r}")

# Combine the per-file frames. unionByName matches columns by name, whereas
# union matches by position and would silently misalign the data if two files
# listed their columns in a different order.
dataset = dataframes[0]
for df in dataframes[1:]:
    dataset = dataset.unionByName(df)

# Inspect dataset columns
dataset.printSchema()

# Sensor columns used as model inputs
feature_columns = ["flex_1", "flex_2", "flex_3", "flex_4", "flex_5", "GYRx", "GYRy", "GYRz"]

# Keep only the feature columns plus the label
selected_columns = [*feature_columns, "label"]
dataset = dataset.select(*selected_columns)

# Discard rows that contain null values
dataset = dataset.dropna()

# Map the string labels to numeric indices in "indexedLabel"
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
label_model = indexer.fit(dataset)
dataset = label_model.transform(dataset)

# Pack the feature columns into a single "features" vector column
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
dataset = assembler.transform(dataset)

# Split data into train and test sets (seeded so the split is repeatable)
train_data, test_data = dataset.randomSplit([0.8, 0.2], seed=42)

# Train a RandomForest model. The forest is seeded like the split so that
# repeated runs on the same data give comparable accuracy figures.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=100,
    seed=42,
)
model = rf.fit(train_data)

# Evaluate the model on the held-out test set
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

# Print model accuracy
print(f"Model Accuracy: {accuracy}")


3.5.3
root
 |-- timestamp: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- flex_1: double (nullable = true)
 |-- flex_2: double (nullable = true)
 |-- flex_3: double (nullable = true)
 |-- flex_4: double (nullable = true)
 |-- flex_5: double (nullable = true)
 |-- Qw: double (nullable = true)
 |-- Qx: double (nullable = true)
 |-- Qy: double (nullable = true)
 |-- Qz: double (nullable = true)
 |-- GYRx: double (nullable = true)
 |-- GYRy: double (nullable = true)
 |-- GYRz: double (nullable = true)
 |-- ACCx: double (nullable = true)
 |-- ACCy: double (nullable = true)
 |-- ACCz: double (nullable = true)
 |-- ACCx_body: double (nullable = true)
 |-- ACCy_body: double (nullable = true)
 |-- ACCz_body: double (nullable = true)
 |-- ACCx_world: double (nullable = true)
 |-- ACCy_world: double (nullable = true)
 |-- ACCz_world: double (nullable = true)
 |-- label: string (nullable = false)

Model Accuracy: 0.9579909441556264
