In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# ML building blocks used by the regression workflow in this cell
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Create the Spark session, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("CO2 Emissions Analysis")
    .getOrCreate()
)

# Read the CSV: the first row holds the column names, column types are inferred
data = spark.read.csv(
    "/content/CO2 Emissions_Canada.csv",
    header=True,
    inferSchema=True,
)

# Regression target and the predictor columns packed into one vector column
target_col = "CO2 Emissions(g/km)"
feature_columns = [
    "Engine Size(L)",
    "Fuel Consumption City (L/100 km)",
    "Fuel Consumption Hwy (L/100 km)",
    "Fuel Consumption Comb (L/100 km)",
    "Cylinders",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
assembled = assembler.transform(data)
data = assembled.select("Make", "Model", "features", target_col)

# 80/20 train/test split; the fixed seed keeps the split repeatable
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Fit a linear regression on the training rows, then score the held-out rows
lr = LinearRegression(featuresCol="features", labelCol=target_col)
lr_model = lr.fit(train_data)
predictions = lr_model.transform(test_data)

# Compute both metrics with one evaluator by overriding metricName per call
evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction")
rmse, r2 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("rmse", "r2")
)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")




RMSE: 20.250295061325048
R²: 0.8827279337661433


In [2]:
# --------------------- TASK 3: K-MEANS CLUSTERING --------------------- #
# Groups vehicles into three clusters on standardized fuel-consumption and CO2
# features, names each cluster by its emission level, and exports the result.

from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Reload the raw data: the regression cell overwrote `data` with a reduced frame
original_data = spark.read.csv("/content/CO2 Emissions_Canada.csv", header=True, inferSchema=True)

clustering_features = ["Fuel Consumption City (L/100 km)",
                       "Fuel Consumption Hwy (L/100 km)",
                       "Fuel Consumption Comb (L/100 km)",
                       "CO2 Emissions(g/km)"]

assembler = VectorAssembler(inputCols=clustering_features, outputCol="features")
# Keep the raw feature columns next to the assembled vector so the export can use
# the untouched source values. Rebuilding them from the vector through a
# FloatType (32-bit) array rounds the values (e.g. 9.9 -> 9.899999618530273).
clustering_data = assembler.transform(original_data).select(
    "Make", "Model", *clustering_features, "features"
)

# Standardize (zero mean, unit variance) so no single feature dominates distances
scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withMean=True, withStd=True)
scaled_data = scaler.fit(clustering_data).transform(clustering_data)

kmeans = KMeans(featuresCol="scaled_features", k=3, seed=42)
model = kmeans.fit(scaled_data)
clustered_data = model.transform(scaled_data)

# K-Means cluster ids are arbitrary, so id 0 is not necessarily the lowest-emission
# group. Rank the clusters by the CO2 coordinate of their centres instead; the
# standardization is monotonic, so ranking in scaled space matches raw units.
co2_position = clustering_features.index("CO2 Emissions(g/km)")
centers = model.clusterCenters()
clusters_by_co2 = sorted(range(len(centers)), key=lambda idx: centers[idx][co2_position])
emission_levels = ["Low Emission", "Medium Emission", "High Emission"]
cluster_labels = dict(zip(clusters_by_co2, emission_levels))

# Convert cluster index to labels
label_udf = udf(lambda x: cluster_labels.get(int(x), "Unknown"), StringType())
clustered_data = clustered_data.withColumn("Cluster Label", label_udf("prediction"))

# Keep only the identifying columns, the raw features and the cluster label
clustered_data = clustered_data.select("Make", "Model", *clustering_features, "Cluster Label")

# Save as CSV
clustered_data.toPandas().to_csv("clustered_data.csv", index=False)

In [3]:
# --------------------- TASK 4: CLASSIFICATION --------------------- #
# Buckets vehicles into Low / Medium / High emission categories, trains a decision
# tree to predict the category, and exports actual vs. predicted categories.

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col, udf, when
from pyspark.sql.types import StringType

# Category thresholds in g/km: <= 150 Low, (150, 250] Medium, everything else High
classification_data = original_data.withColumn(
    "emission_category",
    when(col("CO2 Emissions(g/km)") <= 150, "Low")
    .when((col("CO2 Emissions(g/km)") > 150) & (col("CO2 Emissions(g/km)") <= 250), "Medium")
    .otherwise("High")
)

# Keep the fitted indexer model: its `labels` list records which category string
# was assigned to which numeric index.
indexer = StringIndexer(inputCol="emission_category", outputCol="label")
indexer_model = indexer.fit(classification_data)
classification_data = indexer_model.transform(classification_data)

# StringIndexer orders labels by descending frequency by default, so the index
# order is data-dependent and must be read from the model, not assumed.
category_by_index = {float(index): category for index, category in enumerate(indexer_model.labels)}

features = ["Engine Size(L)", "Cylinders", "Fuel Consumption Comb (L/100 km)"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

classification_data = assembler.transform(classification_data).select("Make", "Model", "features", "label")

# Train-Test Split
train_data, test_data = classification_data.randomSplit([0.8, 0.2], seed=42)

# Train the model
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
dt_model = dt.fit(train_data)

# Predictions
predictions = dt_model.transform(test_data)

# Evaluate Model
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Decision Tree Accuracy: {accuracy:.4f}")
print("Feature Importances: ", dt_model.featureImportances)

# Convert numerical predictions back to categorical labels
def label_to_category(label):
    """Return the emission category for a StringIndexer label index.

    Args:
        label: Numeric index (float) from the "label" or "prediction" column.

    Returns:
        The category string the fitted indexer assigned to that index, or
        "Unknown" when the index is not present in the indexer's label list.
    """
    return category_by_index.get(label, "Unknown")

label_to_category_udf = udf(label_to_category, StringType())

predictions = predictions.withColumn("Predicted Category", label_to_category_udf("prediction"))
predictions = predictions.withColumn("Actual Category", label_to_category_udf("label"))

# Save only necessary columns
predictions = predictions.select("Make", "Model", "Actual Category", "Predicted Category")

# Save as CSV
predictions.toPandas().to_csv("classification_results.csv", index=False)

Decision Tree Accuracy: 0.9567
Feature Importances:  (3,[0,1,2],[0.01696196241810251,0.0004482826209203784,0.9825897549609771])
