In [1]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("LIBSVM Data Loading").getOrCreate()

# Path to your LIBSVM file
file_path = "/home/centos/data/sample_libsvm_data.txt"

# Read the LIBSVM file
df_phuong = spark.read.format("libsvm").load(file_path)

# Show the DataFrame
df_phuong.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



In [2]:
# Count the number of records
num_records = df_phuong.count()
print(f"Number of Records: {num_records}")

# Count the number of columns
num_columns = len(df_phuong.columns)
print(f"Number of Columns: {num_columns}")

# Print the inferred schema
print("Inferred Schema:")
df_phuong.printSchema()

Number of Records: 100
Number of Columns: 2
Inferred Schema:
root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [3]:
from pyspark.ml.feature import StringIndexer

# Create an instance of the StringIndexer
labelIndexer_phuong = StringIndexer(inputCol="label", outputCol="indexedLabel_phuong")

# Fit the StringIndexer to the data and transform it
df_indexed = labelIndexer_phuong.fit(df_phuong).transform(df_phuong)

In [4]:
from pyspark.ml.feature import VectorIndexer

# Create an instance of the VectorIndexer
featureIndexer_phuong = VectorIndexer(inputCol="features", outputCol="indexedFeatures_phuong", maxCategories=4, handleInvalid="skip")

# Fit the VectorIndexer to the data and transform it
df_indexed_features = featureIndexer_phuong.fit(df_phuong).transform(df_phuong)

# Show the transformed DataFrame
df_indexed_features.show()

+-----+--------------------+----------------------+
|label|            features|indexedFeatures_phuong|
+-----+--------------------+----------------------+
|  0.0|(692,[127,128,129...|  (692,[127,128,129...|
|  1.0|(692,[158,159,160...|  (692,[158,159,160...|
|  1.0|(692,[124,125,126...|  (692,[124,125,126...|
|  1.0|(692,[152,153,154...|  (692,[152,153,154...|
|  1.0|(692,[151,152,153...|  (692,[151,152,153...|
|  0.0|(692,[129,130,131...|  (692,[129,130,131...|
|  1.0|(692,[158,159,160...|  (692,[158,159,160...|
|  1.0|(692,[99,100,101,...|  (692,[99,100,101,...|
|  0.0|(692,[154,155,156...|  (692,[154,155,156...|
|  0.0|(692,[127,128,129...|  (692,[127,128,129...|
|  1.0|(692,[154,155,156...|  (692,[154,155,156...|
|  0.0|(692,[153,154,155...|  (692,[153,154,155...|
|  0.0|(692,[151,152,153...|  (692,[151,152,153...|
|  1.0|(692,[129,130,131...|  (692,[129,130,131...|
|  0.0|(692,[154,155,156...|  (692,[154,155,156...|
|  1.0|(692,[150,151,152...|  (692,[150,151,152...|
|  0.0|(692,

In [5]:
from pyspark.ml.feature import VectorIndexer

# a. Name of Input Column
input_column = featureIndexer_phuong.getInputCol()

# b. Name of Output Column
output_column = featureIndexer_phuong.getOutputCol()

# c. Number of Features
num_features = len(df_phuong.select("features").first()[0])

# d. Map of Categories
# Get the VectorIndexerModel from the transformation
vectorIndexer_model = featureIndexer_phuong.fit(df_phuong)
# Get the category maps
category_maps = vectorIndexer_model.categoryMaps

# Print the information
print(f"Name of Input Column: {input_column}")
print(f"Name of Output Column: {output_column}")
print(f"Number of Features: {num_features}")
print("Map of Categories:")
print(category_maps)

Name of Input Column: features
Name of Output Column: indexedFeatures_phuong
Number of Features: 692
Map of Categories:
{645: {0.0: 0}, 69: {0.0: 0}, 365: {0.0: 0}, 138: {0.0: 0}, 479: {0.0: 0}, 333: {0.0: 0}, 249: {0.0: 0}, 0: {0.0: 0}, 666: {0.0: 0, 10.0: 1}, 88: {0.0: 0}, 170: {0.0: 0}, 115: {0.0: 0}, 276: {0.0: 0, 3.0: 1, 153.0: 2, 252.0: 3}, 308: {0.0: 0}, 5: {0.0: 0}, 449: {0.0: 0}, 120: {0.0: 0, 253.0: 1}, 614: {0.0: 0, 140.0: 1}, 677: {0.0: 0}, 202: {0.0: 0, 13.0: 1, 44.0: 2, 87.0: 3}, 10: {0.0: 0}, 56: {0.0: 0}, 533: {0.0: 0}, 142: {0.0: 0}, 340: {0.0: 0}, 670: {0.0: 0}, 174: {0.0: 0, 175.0: 1}, 42: {0.0: 0}, 417: {0.0: 0}, 24: {0.0: 0}, 37: {0.0: 0}, 25: {0.0: 0}, 257: {0.0: 0, 73.0: 1, 120.0: 2}, 389: {0.0: 0}, 52: {0.0: 0}, 14: {0.0: 0}, 504: {0.0: 0}, 110: {0.0: 0}, 587: {0.0: 0}, 619: {0.0: 0}, 196: {0.0: 0}, 559: {0.0: 0}, 638: {0.0: 0, 1.0: 1, 29.0: 2, 137.0: 3}, 20: {0.0: 0}, 421: {0.0: 0}, 46: {0.0: 0}, 93: {0.0: 0}, 284: {0.0: 0}, 228: {0.0: 0}, 448: {0.0: 0}, 57: {0

In [6]:
# Split the data into training (65%) and test (35%) sets
training_phuong, testing_phuong = df_phuong.randomSplit([0.65, 0.35], seed=42)

# Show the size of each DataFrame
print(f"Number of records in training set: {training_phuong.count()}")
print(f"Number of records in testing set: {testing_phuong.count()}")

Number of records in training set: 61
Number of records in testing set: 39


In [7]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create an instance of the DecisionTreeClassifier
DT_phuong = DecisionTreeClassifier(labelCol="indexedLabel_phuong", featuresCol="indexedFeatures_phuong")

In [8]:
from pyspark.ml import Pipeline

# Create a Pipeline with the three stages
pipeline_phuong = Pipeline(stages=[labelIndexer_phuong, featureIndexer_phuong, DT_phuong])

In [9]:
# Fit the pipeline to the training data
model_phuong = pipeline_phuong.fit(training_phuong)

In [10]:
# Use the model to make predictions on the testing data
predictions_phuong = model_phuong.transform(testing_phuong)

In [11]:
# Print the schema of the predictions DataFrame
predictions_phuong.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- indexedLabel_phuong: double (nullable = false)
 |-- indexedFeatures_phuong: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create a VectorIndexer with a handleInvalid parameter
featureIndexer_phuong = VectorIndexer(inputCol="features_phuong", outputCol="indexedFeatures_phuong", maxCategories=4, handleInvalid="skip")

# Create an evaluator for accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel_phuong", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

# Compute the accuracy
accuracy = evaluator.evaluate(predictions_phuong)

# Compute the test error
test_error = 1 - accuracy

# Print the results
print(f"Accuracy: {accuracy}")
print(f"Test Error: {test_error}")


Accuracy: 1.0
Test Error: 0.0


In [13]:
# Select the columns of interest for the first 10 rows
predictions_phuong.select("label", "prediction", "features").show(10)


+-----+----------+--------------------+
|label|prediction|            features|
+-----+----------+--------------------+
|  0.0|       1.0|(692,[124,125,126...|
|  0.0|       1.0|(692,[126,127,128...|
|  0.0|       1.0|(692,[126,127,128...|
|  0.0|       1.0|(692,[126,127,128...|
|  0.0|       1.0|(692,[127,128,129...|
|  1.0|       0.0|(692,[123,124,125...|
|  1.0|       0.0|(692,[123,124,125...|
|  1.0|       0.0|(692,[124,125,126...|
|  1.0|       0.0|(692,[124,125,126...|
|  1.0|       0.0|(692,[126,127,128...|
+-----+----------+--------------------+
only showing top 10 rows

