# 3D Machine Learning: Point Cloud Semantic Segmentation


In [8]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.metrics import classification_report  # For evaluating classification performance
from sklearn.ensemble import RandomForestClassifier  # For building a Random Forest classifier
from sklearn.preprocessing import MinMaxScaler  # For scaling features to a specified range


# Define the file path to the dataset

In [9]:

data_folder= r"C:\Users\rosha\Desktop\My projects\3DDL"
dataset= r"C:\Users\rosha\Desktop\My projects\3DDL\3DML_urban_point_cloud.xyz"

In [11]:
# Read the dataset into a pandas DataFrame
pcd = pd.read_csv(dataset, delimiter=' ')

# Remove any rows with missing values (NaNs)
pcd.dropna(inplace=True)

In [12]:
# Display the DataFrame to inspect the loaded dataset
pcd


Unnamed: 0,X,Y,Z,R,G,B,omnivariance_2,normal_cr_2,ScanDirectionFlag,NumberOfReturns,planarity_2,nb_neighbors_1,Intensity,Classification,omnivariance_1,verticality_1
0,-0.35,7.780000,193.240005,83,88,85,0.068255,0.000660,1.0,2.0,0.395102,4.0,1338.0,1.0,0.002344,0.013035
1,-0.34,8.350000,193.190002,173,164,176,0.064437,0.000567,1.0,1.0,0.444954,4.0,1921.0,1.0,0.014229,0.015639
2,-0.41,8.920000,193.039993,158,158,164,0.071727,0.000864,1.0,1.0,0.420157,5.0,2445.0,1.0,0.017933,0.016346
3,-0.30,9.490000,193.009995,136,136,138,0.078786,0.001038,1.0,1.0,0.349275,4.0,3269.0,1.0,0.006866,0.004476
4,-0.31,10.520000,193.000000,194,185,198,0.029530,0.000065,1.0,2.0,0.491427,4.0,1688.0,1.0,0.002832,0.005706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3612124,-39.66,983.429993,202.850006,214,226,238,0.105032,0.000864,0.0,2.0,0.851765,7.0,627.0,3.0,0.051912,0.112306
3612125,-22.58,587.150024,189.729996,68,65,44,0.161776,0.002411,0.0,1.0,0.836145,9.0,1922.0,3.0,0.052564,0.015288
3612126,-16.52,586.570007,189.740005,42,51,44,0.461649,0.082355,0.0,2.0,0.637373,6.0,1689.0,3.0,0.030623,0.001556
3612127,-15.07,587.299988,189.759995,52,70,56,0.587114,0.106482,0.0,1.0,0.356182,4.0,2467.0,3.0,0.095947,0.701525


In [13]:
# Get the number of columns in the DataFrame
num_columns = pcd.shape[1]
num_columns


16

In [14]:
# Get the unique values in the 'Classification' column
unique_values = pcd['Classification'].unique()

# Display the unique values
unique_values


array([1., 2., 3.])

In [15]:
# Extract the target labels from the 'Classification' column
labels = pcd['Classification']

# Extract the features from selected columns
features = pcd[['X', 'Y', 'Z', 'R', 'G', 'B', 'omnivariance_2', 'normal_cr_2', 
                'NumberOfReturns', 'planarity_2', 'omnivariance_1', 'verticality_1']]


In [16]:
# Get the count of labels
labels_count = labels.count()
labels_count


3379477

In [17]:
# Get the count of features
features_count = features.count()
features_count


X                  3379477
Y                  3379477
Z                  3379477
R                  3379477
G                  3379477
B                  3379477
omnivariance_2     3379477
normal_cr_2        3379477
NumberOfReturns    3379477
planarity_2        3379477
omnivariance_1     3379477
verticality_1      3379477
dtype: int64

In [18]:
import numpy as np

In [19]:
# Calculate class frequencies
class_labels, class_counts = np.unique(labels, return_counts=True)

In [20]:
# Get the minimum and maximum values of the 'X' column
min_value = pcd['X'].min()
max_value = pcd['X'].max()

# Print the minimum and maximum values
print("Minimum value of 'X' column:", min_value)
print("Maximum value of 'X' column:", max_value)



Minimum value of 'X' column: -1000.0
Maximum value of 'X' column: -0.01


In [21]:
# Import the MinMaxScaler class from scikit-learn preprocessing module
from sklearn.preprocessing import MinMaxScaler

# Scale the features using MinMaxScaler
features_scaled = MinMaxScaler().fit_transform(features)


In [22]:
features_scaled

array([[0.99966   , 0.00778008, 0.32504444, ..., 0.39516567, 0.00919584,
        0.013035  ],
       [0.99967   , 0.00835008, 0.3240576 , ..., 0.44504995, 0.05582233,
        0.015639  ],
       [0.9996    , 0.00892009, 0.32109709, ..., 0.42023689, 0.07035363,
        0.016346  ],
       ...,
       [0.98348983, 0.58657588, 0.25597004, ..., 0.63759352, 0.12013825,
        0.001556  ],
       [0.98493985, 0.58730587, 0.25636453, ..., 0.35622047, 0.37641331,
        0.701525  ],
       [0.98695987, 0.90945912, 0.49200698, ..., 0.59899455, 0.2653022 ,
        0.299063  ]])

In [23]:
# Split the scaled features and labels into training and testing sets
# using the train_test_split function from scikit-learn
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.4)


In [24]:
# Create a RandomForestClassifier instance
rf_classifier = RandomForestClassifier()


In [25]:
# Train the RandomForestClassifier model on the training data
rf_classifier.fit(X_train, y_train)


In [26]:
# Make predictions on the testing data using the trained RandomForestClassifier model
rf_predictions = rf_classifier.predict(X_test)


In [30]:
import matplotlib.pyplot as plt

In [32]:
# Print the classification report to evaluate the performance of the RandomForestClassifier model
print(classification_report(y_test, rf_predictions))


              precision    recall  f1-score   support

         1.0       0.99      1.00      0.99    690548
         2.0       0.98      0.99      0.98    427955
         3.0       0.98      0.96      0.97    233288

    accuracy                           0.99   1351791
   macro avg       0.98      0.98      0.98   1351791
weighted avg       0.99      0.99      0.99   1351791



In [33]:
# Import the KNeighborsClassifier class from scikit-learn neighbors module
from sklearn.neighbors import KNeighborsClassifier

# Create a KNeighborsClassifier instance
knn_classifier = KNeighborsClassifier()


In [34]:
# Train the KNeighborsClassifier model on the training data
knn_classifier.fit(X_train, y_train)


In [35]:
# Make predictions on the testing data using the trained KNeighborsClassifier model
knn_predictions = knn_classifier.predict(X_test)


In [36]:
# Print the classification report to evaluate the performance of the KNeighborsClassifier model
print(classification_report(y_test, knn_predictions))


              precision    recall  f1-score   support

         1.0       0.97      0.99      0.98    690548
         2.0       0.98      0.97      0.97    427955
         3.0       0.97      0.93      0.95    233288

    accuracy                           0.97   1351791
   macro avg       0.97      0.96      0.97   1351791
weighted avg       0.97      0.97      0.97   1351791



In [37]:
# Import the MLPClassifier class from scikit-learn neural_network module
from sklearn.neural_network import MLPClassifier

# Create an MLPClassifier instance with specified parameters
mlp_classifier = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(15, 2), random_state=1)


In [38]:
# Train the MLPClassifier model on the training data
mlp_classifier.fit(X_train, y_train)


In [39]:
# Make predictions on the testing data using the trained MLPClassifier model
mlp_predictions = mlp_classifier.predict(X_test)


In [40]:
# Print the classification report to evaluate the performance of the MLPClassifier model
print(classification_report(y_test, mlp_predictions))


              precision    recall  f1-score   support

         1.0       0.99      1.00      1.00    690548
         2.0       0.98      0.98      0.98    427955
         3.0       0.97      0.97      0.97    233288

    accuracy                           0.99   1351791
   macro avg       0.98      0.98      0.98   1351791
weighted avg       0.99      0.99      0.99   1351791



In [41]:
# Define the file path to the validation dataset
val_dataset = r"C:\Users\rosha\Desktop\My projects\3DDL\3DML_validation.xyz"

# Read the validation dataset into a pandas DataFrame
val_pcd = pd.read_csv(val_dataset, delimiter=' ')

# Remove any rows with missing values (NaNs)
val_pcd.dropna(inplace=True)


In [42]:
# Extract the target labels from the 'Classification' column of the validation dataset
val_labels = val_pcd['Classification']

# Extract the features from selected columns of the validation dataset
val_features = val_pcd[['X', 'Y', 'Z', 'R', 'G', 'B', 'omnivariance_2', 'normal_cr_2', 
                        'NumberOfReturns', 'planarity_2', 'omnivariance_1', 'verticality_1']]

# Scale the features of the validation dataset using MinMaxScaler
val_features_scaled = MinMaxScaler().fit_transform(val_features)


In [43]:
# Make predictions on the scaled features of the validation dataset using the trained RandomForestClassifier model
val_predictions = rf_classifier.predict(val_features_scaled)

# Print the classification report to evaluate the performance of the model on the validation dataset
# Specify target names for better interpretation of the classification report
print(classification_report(val_labels, val_predictions, target_names=['ground', 'vegetation', 'buildings']))


              precision    recall  f1-score   support

      ground       0.87      0.29      0.44   1188768
  vegetation       0.90      0.90      0.90   1315231
   buildings       0.38      0.87      0.53    613317

    accuracy                           0.66   3117316
   macro avg       0.72      0.69      0.62   3117316
weighted avg       0.79      0.66      0.65   3117316



# Summary of Results
# The machine learning models (Random Forest, K-Nearest Neighbors, MLP) were trained and evaluated on the provided dataset.
# Overall, the Random Forest classifier demonstrated the best performance, achieving high accuracy and balanced precision and recall scores across different classes.
# K-Nearest Neighbors and MLP classifiers also showed decent performance but were slightly outperformed by the Random Forest classifier.

# Future Work
# There are several avenues for future work and improvements:
# - Further hyperparameter tuning could potentially improve the performance of the models.
# - Experimenting with different feature engineering techniques or incorporating additional features may lead to better predictions.
# - Exploring alternative machine learning algorithms or ensemble methods could provide insights into improving model robustness.

# Acknowledgements
# This project was inspired by the blog post "3D Machine Learning Course: Point Cloud Semantic Segmentation" by Florent Poux, published on Medium.
# The blog provided valuable insights and guidance on the topic of point cloud semantic segmentation, which served as a reference for this project.
# Link to the blog: https://medium.com/towards-data-science/3d-machine-learning-course-point-cloud-semantic-segmentation-9b32618ca5df


# Conclusion
# In conclusion, this project successfully demonstrated the application of machine learning techniques for classification tasks on spatial data.
# The trained models showed promising performance in predicting different classes within the dataset.
# Further optimizations and refinements could lead to even better results, but overall, this project contributes to the understanding and application of machine learning in spatial analysis.



# End of Notebook
