In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from scipy.stats import circmean
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# Testing 1

## Loading Data

Indian Data Set (Very few Curves)

In [8]:
df1 = pd.read_csv("../Data/highway_geometry_data_every_10m_india_train.csv")

In [None]:
df1

Czech Data Set

In [None]:
df = pd.read_csv("../Data/training-data.txt", delim_whitespace=True)
df

In [None]:
len(df['id'].unique())

In [None]:
df1 = df[df['id']==1]
df1

## Pre-Processing


In [None]:
df1['Normalised_X'] = (df1['x'] - df1['x'].min())/(df1['x'].max() - df1['x'].min())
df1['Normalised_Y'] = (df1['y'] - df1['y'].min())/(df1['y'].max() - df1['y'].min())
df1

In [None]:
plt.plot(df1['Normalised_X'], df1['Normalised_Y'], 'o', markersize=1)
plt.show()

In [None]:
# Define image dimensions (example: 512x512)
image_width = 256
image_height = 256

# Map Normalized X and Y to pixel space (continuous mapping)
df1['pixel_X'] = df1['Normalised_X'] * (image_width - 1)
df1['pixel_Y'] = df1['Normalised_Y'] * (image_height - 1)

# Initialize a white image (RGB) with all pixels set to white (255, 255, 255)
image = np.ones((image_height, image_width, 3), dtype=np.uint8) * 255  # RGB = white

# Set pixel colors based on the Curve column (Red for curve, Blue for straight)
for i, row in df1.iterrows():
    x, y = int(row['pixel_X']), int(row['pixel_Y'])
    if row['class'] == 1:
        image[y, x] = [255, 0, 0]  # Red for curve
    else:
        image[y, x] = [0, 0, 255]  # Blue for straight

# Show the image
plt.imshow(image)
plt.title('Road Segments: Curves (Red) vs Straight (Blue)')
plt.axis('off')  # Turn off axis
plt.show()

## Adding Features

In [16]:
# Function to compute the angle between three consecutive points
def compute_angle(p1, p2, p3):
    # Vector from p1 to p2
    v1 = np.array([p2[0] - p1[0], p2[1] - p1[1]])
    # Vector from p2 to p3
    v2 = np.array([p3[0] - p2[0], p3[1] - p2[1]])
    # Compute angle between vectors using the dot product formula
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return np.arccos(dot_product / (norm_v1 * norm_v2)) * 180 / np.pi  # angle in degrees

# Function to compute the distance between two points
def compute_distance(p1, p2):
    return np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)

In [17]:
def compute_curvature(p1, p2, p3):
    # Compute curvature as the reciprocal of radius of the circle passing through 3 points
    a = np.linalg.norm(np.array(p2) - np.array(p3))
    b = np.linalg.norm(np.array(p1) - np.array(p3))
    c = np.linalg.norm(np.array(p1) - np.array(p2))
    
    s = (a + b + c) / 2  # Semi-perimeter
    area = np.sqrt(s * (s - a) * (s - b) * (s - c))  # Heron's formula for area of a triangle
    if a * b * c == 0:  # Prevent division by zero
        return 0
    return 4 * area / (a * b * c)

In [18]:
def compute_slope(p1, p2):
    if p2[0] - p1[0] == 0:  # Avoid division by zero
        return float('inf')  # Infinite slope (vertical line)
    return (p2[1] - p1[1]) / (p2[0] - p1[0])

In [19]:
def compute_slope_change(p1, p2, p3):
    slope1 = compute_slope(p1, p2)
    slope2 = compute_slope(p2, p3)
    return abs(slope2 - slope1)

In [None]:
# Add features to the dataframe
angles = []
distances = []
curvature = []
slope_change = []
for i in range(1, len(df1) - 1):  # Starting from the second point and ending at the second-to-last point
    p1 = (df1['x'][i-1], df1['y'][i-1])
    p2 = (df1['x'][i], df1['y'][i])
    p3 = (df1['x'][i+1], df1['y'][i+1])
    
    angles.append(compute_angle(p1, p2, p3))
    curvature.append(compute_curvature(p1,p2,p3))
    slope_change.append(compute_slope_change(p1,p2,p3))
    distances.append(compute_distance(p2, p1))

pl1 = (df1['x'][len(df1)-2],df1['y'][len(df1)-2])
pl2 = (df1['x'][len(df1)-1],df1['y'][len(df1)-1])
distances.append(compute_distance(pl1,pl2))

len(distances), len(angles), len(curvature), len(slope_change)

Replace inf values with max float

In [21]:
slope_change = np.array(slope_change)
slope_change[np.isinf(slope_change)] = np.finfo(np.float64).max

In [None]:
# The first and last points cannot have angles, but we can still compute distances
angles = [np.nan] + angles + [np.nan]
curvature = [np.nan] + curvature + [np.nan]
slope_change = np.concatenate(([np.nan], slope_change, [np.nan]))


# Add the features to the dataframe
df1['Angle'] = angles
df1['Curvature'] = curvature
df1['Slope_Change'] = slope_change
df1['Distance_to_prev'] = [np.nan] + distances
df1['Distance_to_next'] = distances + [np.nan]

# Display the dataframe
df1

In [None]:
def add_surrounding_points_features(df):
    # Create columns for previous two points and next two points
    df['x_prev_2'] = df['Normalised_X'].shift(2)
    df['y_prev_2'] = df['Normalised_Y'].shift(2)
    df['x_prev_1'] = df['Normalised_X'].shift(1)
    df['y_prev_1'] = df['Normalised_Y'].shift(1)
    df['x_next_1'] = df['Normalised_X'].shift(-1)
    df['y_next_1'] = df['Normalised_Y'].shift(-1)
    df['x_next_2'] = df['Normalised_X'].shift(-2)
    df['y_next_2'] = df['Normalised_Y'].shift(-2)
    
    return df

# Add previous and next points as features
df1 = add_surrounding_points_features(df1)

In [24]:
# Define features and target variable
features = ['Angle', 'Distance_to_prev', 'Distance_to_next','Normalised_X',
            'Normalised_Y','Curvature','Slope_Change','x_prev_2','x_prev_1',
            'y_prev_2','y_prev_1','x_next_1','x_next_2','y_next_1','y_next_2']
X = df1[features][2:-2]
y = df1['class'][2:-2]

## Running Algo

In [None]:
# Scale features (important for some ML algorithms like SVM, Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [None]:
# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))


In [None]:
# Optionally, print feature importances
print("Feature Importances:", model.feature_importances_)

## Testing on a different road

In [None]:
df2 = df[df['id']==2]
df2 = df2.reset_index(drop=True)
df2

In [None]:
df2['Normalised_Y'] = (df2['y'] - df2['y'].min())/(df2['y'].max() - df2['y'].min())
df2['Normalised_X'] = (df2['x'] - df2['x'].min())/(df2['x'].max() - df2['x'].min())
df2

In [None]:
# Add features to the dataframe
angles2 = []
distances2 = []
curvature2 = []
slope_change2 = []
for i in range(1, len(df2) - 1):  # Starting from the second point and ending at the second-to-last point
    p1 = (df2['x'][i-1], df2['y'][i-1])
    p2 = (df2['x'][i], df2['y'][i])
    p3 = (df2['x'][i+1], df2['y'][i+1])
    
    angles2.append(compute_angle(p1, p2, p3))
    curvature2.append(compute_curvature(p1,p2,p3))
    slope_change2.append(compute_slope_change(p1,p2,p3))
    distances2.append(compute_distance(p2, p1))

pl21 = (df2['x'][len(df2)-2],df2['y'][len(df2)-2])
pl22 = (df2['x'][len(df2)-1],df2['y'][len(df2)-1])
distances2.append(compute_distance(pl21,pl22))

len(distances2), len(angles2), len(curvature2), len(slope_change2)

In [33]:
slope_change2 = np.array(slope_change2)
slope_change2[np.isinf(slope_change2)] = np.finfo(np.float64).max

In [None]:
# The first and last points cannot have angles, but we can still compute distances
angles2 = [np.nan] + angles2 + [np.nan]
curvature2 = [np.nan] + curvature2 + [np.nan]
slope_change2 = np.concatenate(([np.nan], slope_change2, [np.nan]))


# Add the features to the dataframe
df2['Angle'] = angles2
df2['Curvature'] = curvature2
df2['Slope_Change'] = slope_change2
df2['Distance_to_prev'] = [np.nan] + distances2
df2['Distance_to_next'] = distances2 + [np.nan]

# Display the dataframe
df2

In [35]:
df2 = add_surrounding_points_features(df2)

In [36]:
X2 = df2[features][2:-2]
y2 = df2['class'][2:-2]

In [37]:
X2_scaled = scaler.fit_transform(X2)

In [None]:
# Make predictions and evaluate
y_pred2 = model.predict(X2_scaled)

# Print evaluation metrics
print(classification_report(y2, y_pred2))

# Testing 2

## Creating Features

In [39]:
# Function to compute the angle between three consecutive points
def compute_angle(p1, p2, p3):
    # Vector from p1 to p2
    v1 = np.array([p2[0] - p1[0], p2[1] - p1[1]])
    # Vector from p2 to p3
    v2 = np.array([p3[0] - p2[0], p3[1] - p2[1]])
    # Compute angle between vectors using the dot product formula
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return np.arccos(dot_product / (norm_v1 * norm_v2)) * 180 / np.pi  # angle in degrees

# Function to compute the distance between two points
def compute_distance(p1, p2):
    return np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)

In [40]:
def compute_curvature(p1, p2, p3):
    # Compute curvature as the reciprocal of radius of the circle passing through 3 points
    a = np.linalg.norm(np.array(p2) - np.array(p3))
    b = np.linalg.norm(np.array(p1) - np.array(p3))
    c = np.linalg.norm(np.array(p1) - np.array(p2))
    
    s = (a + b + c) / 2  # Semi-perimeter
    area = np.sqrt(s * (s - a) * (s - b) * (s - c))  # Heron's formula for area of a triangle
    if a * b * c == 0:  # Prevent division by zero
        return 0
    return 4 * area / (a * b * c)

In [41]:
def compute_slope_change(p1, p2, p3):
    slope1 = compute_slope(p1, p2)
    slope2 = compute_slope(p2, p3)
    return abs(slope2 - slope1)

In [42]:
def compute_slope(p1, p2):
    if p2[0] - p1[0] == 0:  # Avoid division by zero
        return float('inf')  # Infinite slope (vertical line)
    return (p2[1] - p1[1]) / (p2[0] - p1[0])

In [43]:
def add_surrounding_points_features(df):
    # Create columns for previous two points and next two points
    df['x_prev_2'] = df['Normalised_X'].shift(2)
    df['y_prev_2'] = df['Normalised_Y'].shift(2)
    df['x_prev_1'] = df['Normalised_X'].shift(1)
    df['y_prev_1'] = df['Normalised_Y'].shift(1)
    df['x_next_1'] = df['Normalised_X'].shift(-1)
    df['y_next_1'] = df['Normalised_Y'].shift(-1)
    df['x_next_2'] = df['Normalised_X'].shift(-2)
    df['y_next_2'] = df['Normalised_Y'].shift(-2)
    
    return df

In [44]:
features = ['Angle', 'Distance_to_prev', 'Distance_to_next','x',
            'y','Curvature','Slope_Change','x_prev_2','x_prev_1',
            'y_prev_2','y_prev_1','x_next_1','x_next_2','y_next_1','y_next_2']

def create_df(df2):
    #df2['Normalised_Y'] = (df2['y'] - df2['y'].min())/(df2['y'].max() - df2['y'].min())
    #df2['Normalised_X'] = (df2['x'] - df2['x'].min())/(df2['x'].max() - df2['x'].min())

    # Add features to the dataframe
    angles2 = []
    distances2 = []
    curvature2 = []
    slope_change2 = []
    for i in range(1, len(df2) - 1):  # Starting from the second point and ending at the second-to-last point
        p1 = (df2['x'][i-1], df2['y'][i-1])
        p2 = (df2['x'][i], df2['y'][i])
        p3 = (df2['x'][i+1], df2['y'][i+1])

        angles2.append(compute_angle(p1, p2, p3))
        curvature2.append(compute_curvature(p1,p2,p3))
        slope_change2.append(compute_slope_change(p1,p2,p3))
        distances2.append(compute_distance(p2, p1))

    pl21 = (df2['x'][len(df2)-2],df2['y'][len(df2)-2])
    pl22 = (df2['x'][len(df2)-1],df2['y'][len(df2)-1])
    distances2.append(compute_distance(pl21,pl22))

    slope_change2 = np.array(slope_change2)
    slope_change2[np.isinf(slope_change2)] = np.finfo(np.float64).max

    # The first and last points cannot have angles, but we can still compute distances
    angles2 = [np.nan] + angles2 + [np.nan]
    curvature2 = [np.nan] + curvature2 + [np.nan]
    slope_change2 = np.concatenate(([np.nan], slope_change2, [np.nan]))


    # Add the features to the dataframe
    df2['Angle'] = angles2
    df2['Curvature'] = curvature2
    df2['Slope_Change'] = slope_change2
    df2['Distance_to_prev'] = [np.nan] + distances2
    df2['Distance_to_next'] = distances2 + [np.nan]

    df2 = add_surrounding_points_features(df2)

    X2 = df2[features][2:-2]
    y2 = df2['class'][2:-2]

    scaler = StandardScaler()
    X2 = scaler.fit_transform(X2)

    return X2, y2

## Pre-Processing Data

In [None]:
df = pd.read_csv("../Data/training-data.txt", delim_whitespace=True)
df

In [None]:
df['id'].unique()

In [None]:
X = np.empty((0,15))
Y = pd.Series()
for i in df['id'].unique():
    print(i)
    df1 = df[df['id']==i] 
    df1 = df1.reset_index(drop=True)
    x, y = create_df(df1)
    X = np.vstack((X,x))
    Y = pd.concat([Y,y])

In [12]:
# Split data into training and testing sets
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, Y, test_size=0.2, random_state=42)

## Testing on Various Algos

### Random Forests

In [None]:
# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train1, y_train1)

In [None]:
import joblib

# Save the model to a file
joblib.dump(model, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test1)

# Print evaluation metrics
print(classification_report(y_test1, y_pred))

In [None]:
y_pred

In [None]:
test_y

In [27]:
test_y = np.concatenate(([y_pred[0]],[y_pred[0]], y_pred, [y_pred[-1]],[y_pred[-1]]))

In [None]:
# Optionally, print feature importances
print("Feature Importances:", model.feature_importances_)

### Decision Tree

In [78]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Train the model
model = DecisionTreeClassifier()
model.fit(X_train1, y_train1)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test1)

# Print evaluation metrics
print(classification_report(y_test1, y_pred))

## Dropping Na's and then testing again

In [None]:
X_df = pd.DataFrame(X)
X_df['y'] = np.array(Y)
X_df = X_df.dropna()
X_df

In [107]:
X = X_df.iloc[:, :-1].to_numpy()
Y = X_df['y'].to_numpy()

In [108]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### KNN

In [111]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Train the model
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### SVM

In [None]:
from sklearn.svm import SVC

# Train the model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the model
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### Random Forests

In [126]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Train the model
model = GaussianNB()
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Train the model
model = GradientBoostingClassifier(n_estimators=150, random_state=20)
model.fit(X_train, y_train)

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

## Testing on India Data set(trained on Czech)

In [None]:
df1 = pd.read_csv("../Data/highway_geometry_data_every_10m_india_train.csv")
df1 = df1.iloc[:,:3]
df1.rename(columns={'Curve':'class', 'X':'x','Y':'y'}, inplace = True)
df1

In [163]:
Xi, Yi = create_df(df1)

In [164]:
# Split data into training and testing sets
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(Xi, Yi, test_size=0.2, random_state=42)

In [None]:
# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_i, y_train_i)

In [None]:
# Make predictions and evaluateX_test_i
y_pred = model.predict(X_test_i)

# Print evaluation metrics
print(classification_report(y_test_i, y_pred))

# Testing 3

## Correctly Pre-Processing

In [None]:
df = pd.read_csv("../Data/training-data.txt", delim_whitespace=True)
df

Definfing a function that adds the surrounding points into the data

In [4]:
def add_surrounding_points_features(df):
    # Create columns for previous two points and next two points
    df['x_prev_2'] = df['x'].shift(2)
    df['y_prev_2'] = df['y'].shift(2)
    df['x_prev_1'] = df['x'].shift(1)
    df['y_prev_1'] = df['y'].shift(1)
    df['x_next_1'] = df['x'].shift(-1)
    df['y_next_1'] = df['y'].shift(-1)
    df['x_next_2'] = df['x'].shift(-2)
    df['y_next_2'] = df['y'].shift(-2)
    
    #classs = df['class'].to_numpy()
    #df = (df - df.mean())/df.std()
    #df['class'] = classs
    
    return df

Add the surrounding points

In [None]:
DF = pd.DataFrame()
for i in df['id'].unique():
    #print(i)
    df1 = df[df['id']==i].iloc[:,1:]
    df1 = df1.reset_index(drop=True)
    df1 = add_surrounding_points_features(df1)
    DF = pd.concat([df1, DF], ignore_index=True)
    #Y = pd.concat([Y,y])
DF = DF.dropna()
DF

Split the data into training and testing datasets

In [None]:
X = DF.drop(DF.columns[2], axis=1)
Y = DF['class']
X

In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## Creating Features

In [8]:
# Function to compute the angle between three consecutive points
def compute_angle(p1, p2, p3):
    # Vector from p1 to p2
    v1 = np.array([p2[0] - p1[0], p2[1] - p1[1]])
    # Vector from p2 to p3
    v2 = np.array([p3[0] - p2[0], p3[1] - p2[1]])
    # Compute angle between vectors using the dot product formula
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return np.arccos(dot_product / (norm_v1 * norm_v2)) * 180 / np.pi  # angle in degrees

# Function to compute the distance between two points
def compute_distance(p1, p2):
    return np.sqrt((p2[0] - p1[0])**2 + (p2[1] - p1[1])**2)

In [9]:
def compute_curvature(p1, p2, p3):
    points = np.array([p1, p2, p3])
    
    # Check if points are collinear
    if np.abs(np.cross(points[1] - points[0], points[2] - points[0])) < 1e-8:
        return 0  # Return 0 curvature for straight lines
    
    # Fit a circle to the points
    x_m = np.mean(points[:, 0])
    y_m = np.mean(points[:, 1])
    u = points[:, 0] - x_m
    v = points[:, 1] - y_m
    Suv = np.sum(u*v)
    Suu = np.sum(u**2)
    Svv = np.sum(v**2)
    Suuv = np.sum(u**2 * v)
    Suvv = np.sum(u * v**2)
    Suuu = np.sum(u**3)
    Svvv = np.sum(v**3)
    
    A = np.array([[Suu, Suv], [Suv, Svv]])
    B = np.array([Suuv + Suvv, Svvv + Suuu])/2
    
    try:
        uc, vc = np.linalg.solve(A, B)
        xc = x_m + uc
        yc = y_m + vc
        R = np.sqrt((points[:, 0]-xc)**2 + (points[:, 1]-yc)**2).mean()
        return 1/R if R > 1e-10 else 0
    except np.linalg.LinAlgError:
        # If matrix is singular, points are likely collinear
        return 0



In [10]:
def compute_slope(p1, p2):
    #if p2[0] - p1[0] == 0:  # Avoid division by zero
    #    return float('inf')  # Infinite slope (vertical line)
    #return (p2[1] - p1[1]) / (p2[0] - p1[0])
    return math.atan2(p2[1] - p1[1], p2[0] - p1[0])

In [11]:
def compute_slope_change(p1, p2, p3):
    slope1 = compute_slope(p1, p2)
    slope2 = compute_slope(p2, p3)
    return abs(slope2 - slope1)

In [12]:

def compute_local_linearity(p1, p2, p3, p4, p5):
    """
    Compute local linearity using HuberRegressor, falling back to LinearRegression if convergence fails.
    
    :param p1, p2, p3, p4, p5: Points as (x, y) tuples
    :return: R-squared value indicating linearity
    """
    points = np.array([p1, p2, p3, p4, p5])
    X = points[:, 0].reshape(-1, 1)
    Y = points[:, 1]
    
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            model = HuberRegressor(max_iter=100, epsilon=1.5).fit(X, Y)
        r_squared = model.score(X, Y)
    except Exception:
        # If HuberRegressor fails, fall back to LinearRegression
        model = LinearRegression().fit(X, Y)
        r_squared = model.score(X, Y)
    
    return r_squared


def compute_angle_consistency(p1, p2, p3, p4, p5):
    """
    Compute angle consistency using circular statistics, which is more appropriate for angular data.
    
    :param p1, p2, p3, p4, p5: Points as (x, y) tuples
    :return: A measure of angle consistency
    """
    def angle_between(v1, v2):
        v1_3d = np.append(v1, 0)
        v2_3d = np.append(v2, 0)
        return np.arctan2(np.linalg.norm(np.cross(v1_3d, v2_3d)), np.dot(v1, v2))

    vectors = [
        np.array(p2) - np.array(p1),
        np.array(p3) - np.array(p2),
        np.array(p4) - np.array(p3),
        np.array(p5) - np.array(p4)
    ]
    
    angles = [angle_between(vectors[i], vectors[i+1]) for i in range(len(vectors)-1)]
    
    # Use circular mean and standard deviation
    mean_angle = circmean(angles)
    angle_deviation = np.std(angles)
    
    # Normalize consistency measure
    consistency = 1 - (angle_deviation / np.pi)
    
    return consistency


In [13]:
#features = ['Angle', 'Distance_to_prev', 'Distance_to_next','x',
#            'y','Curvature','Slope_Change','x_prev_2','x_prev_1',
#            'y_prev_2','y_prev_1','x_next_1','x_next_2','y_next_1','y_next_2']

def Calc_Features(df2, add = np.array([]), mult = np.array([])):
    # Add features to the dataframe
    angles2 = []
    distances2 = []
    curvature2 = []
    slope_change2 = []
    angle_consistency2 = []
    local_linearity2 = []
    for i in range(len(df2)):  # Starting from the second point and ending at the second-to-last point
        p5 = (df2['x_prev_2'][i], df2['y_prev_2'][i])
        p1 = (df2['x_prev_1'][i], df2['y_prev_1'][i])
        p2 = (df2['x'][i], df2['y'][i])
        p3 = (df2['x_next_1'][i], df2['y_next_1'][i])
        p4 = (df2['x_next_2'][i], df2['y_next_2'][i])

        angles2.append(compute_angle(p1, p2, p3))
        curvature2.append(compute_curvature(p1,p2,p3))
        slope_change2.append(compute_slope_change(p1,p2,p3))
        distances2.append(compute_distance(p2, p1))
        angle_consistency2.append(compute_angle_consistency(p5,p1,p2,p3,p4))
        local_linearity2.append(compute_local_linearity(p5,p1,p2,p3,p4))

    # Add the features to the dataframe
    df2['Angle'] = angles2
    df2['Curvature'] = curvature2
    df2['Slope_Change'] = slope_change2
    df2['Distance_to_prev'] = distances2
    #df2['Distance_to_next'] = distances2
    df2['Angle_Consistency'] = angle_consistency2
    df2['Local_Linearity'] =local_linearity2

    if(add.size == 0):
        #df2 = df2.dropna()
        #df2 = df2.reset_index(drop=True)
        add = df2.mean()
        mult = df2.std()
        df2 = (df2 - add)/mult
        return df2, add, mult
    else: 
        df2 = (df2 - add)/mult 
        return df2

#### Running the above on the train data set

In [None]:
X_train = X_train.reset_index(drop=True)
X_train

In [None]:
X_train, add, mult = Calc_Features(X_train)
X_train

Drop rows with nan's

In [16]:
X_train = X_train.dropna()
y_train = y_train.reset_index(drop=True)
y_train = y_train.loc[X_train.index]
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

#### Running this on testing set with above normalised values

In [None]:
X_test = X_test.reset_index(drop=True)
X_test

In [None]:
X_test = Calc_Features(X_test, add=add, mult=mult)
X_test

## Running the model

In [None]:
# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
import joblib

# Save the model to a file
joblib.dump(model, 'random_forest_model.pkl')
print("Model saved to random_forest_model.pkl")

In [None]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

In [None]:
# Optionally, print feature importances
print("Feature Importances:", model.feature_importances_)

# Testing 4

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import circmean

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn.exceptions import ConvergenceWarning


import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Pre-Processing

#### Load Datasets

In [2]:
df1 = pd.read_csv("../Data/training-data.txt", delim_whitespace=True)
df1

  df1 = pd.read_csv("../Data/training-data.txt", delim_whitespace=True)


Unnamed: 0,id,x,y,class
0,1,-540378.9400,-1.105517e+06,0
1,1,-540364.6300,-1.105521e+06,0
2,1,-540349.3800,-1.105525e+06,1
3,1,-540340.3800,-1.105532e+06,1
4,1,-540336.0600,-1.105542e+06,1
...,...,...,...,...
2380,4260,-694708.6640,-9.701025e+05,1
2381,4260,-694703.7030,-9.700893e+05,1
2382,4260,-694702.8819,-9.700874e+05,1
2383,4260,-694696.0962,-9.700718e+05,1


In [3]:
df2 = pd.read_csv("../Data/highway_geometry_data_every_10m_india_train.csv")
df2.rename(columns={'X': 'x','Y':'y','Curve':'class'}, inplace=True)
df2

Unnamed: 0,x,y,class,Radius
0,741656.8450,3008792.392,1,0.502513
1,741647.2642,3008794.454,1,0.507614
2,741637.5908,3008796.999,1,0.531915
3,741627.9355,3008799.627,1,0.595238
4,741618.3783,3008802.263,1,0.769231
...,...,...,...,...
3995,704062.4201,2998580.345,1,0.446429
3996,704053.1835,2998576.921,1,0.476190
3997,704043.9329,2998573.270,1,0.497512
3998,704034.8737,2998569.617,1,0.515464


#### Add Spatial Measures as Features

In [4]:
def add_surrounding_points_features(df):
    # Create columns for previous two points and next two points
    df['x_prev_2'] = df['x'].shift(2)
    df['y_prev_2'] = df['y'].shift(2)
    df['x_prev_1'] = df['x'].shift(1)
    df['y_prev_1'] = df['y'].shift(1)
    df['x_next_1'] = df['x'].shift(-1)
    df['y_next_1'] = df['y'].shift(-1)
    df['x_next_2'] = df['x'].shift(-2)
    df['y_next_2'] = df['y'].shift(-2)
    
    return df

In [5]:
DF = pd.DataFrame()
for i in df1['id'].unique():
    #print(i)
    df = df1[df1['id']==i].iloc[:,1:]
    df = df.reset_index(drop=True)
    df = add_surrounding_points_features(df)
    DF = pd.concat([DF, df], ignore_index=True)
    #Y = pd.concat([Y,y])

df = add_surrounding_points_features(df2.iloc[:,:-1])
DF = pd.concat([DF, df], ignore_index=True)

DF = DF.dropna()
DF

Unnamed: 0,x,y,class,x_prev_2,y_prev_2,x_prev_1,y_prev_1,x_next_1,y_next_1,x_next_2,y_next_2
2,-540349.3800,-1105525.130,1,-540378.9400,-1105517.380,-540364.6300,-1105520.880,-540340.3800,-1105531.880,-540336.0600,-1105541.880
3,-540340.3800,-1105531.880,1,-540364.6300,-1105520.880,-540349.3800,-1105525.130,-540336.0600,-1105541.880,-540334.6300,-1105554.250
4,-540336.0600,-1105541.880,1,-540349.3800,-1105525.130,-540340.3800,-1105531.880,-540334.6300,-1105554.250,-540334.1900,-1105567.130
5,-540334.6300,-1105554.250,1,-540340.3800,-1105531.880,-540336.0600,-1105541.880,-540334.1900,-1105567.130,-540336.0600,-1105581.880
6,-540334.1900,-1105567.130,1,-540336.0600,-1105541.880,-540334.6300,-1105554.250,-540336.0600,-1105581.880,-540339.3800,-1105595.630
...,...,...,...,...,...,...,...,...,...,...,...
6378,704081.2154,2998587.073,1,704100.5248,2998592.274,704090.7949,2998589.663,704071.7641,2998583.803,704062.4201,2998580.345
6379,704071.7641,2998583.803,1,704090.7949,2998589.663,704081.2154,2998587.073,704062.4201,2998580.345,704053.1835,2998576.921
6380,704062.4201,2998580.345,1,704081.2154,2998587.073,704071.7641,2998583.803,704053.1835,2998576.921,704043.9329,2998573.270
6381,704053.1835,2998576.921,1,704071.7641,2998583.803,704062.4201,2998580.345,704043.9329,2998573.270,704034.8737,2998569.617


Adding features to check sparsity and density in a local area of the point

In [6]:
def adaptive_sparsity_measure(row, dense_spacing=10):
    distances = [
        np.linalg.norm(np.array([row['x'] - row['x_prev_1'], row['y'] - row['y_prev_1'], 0])),
        np.linalg.norm(np.array([row['x_next_1'] - row['x'], row['y_next_1'] - row['y'], 0]))
    ]
    median_distance = np.median(distances)
    return median_distance / dense_spacing

def local_density_variation(row):
    distances = [
        np.linalg.norm(np.array([row['x_prev_1'] - row['x_prev_2'], row['y_prev_1'] - row['y_prev_2'], 0])),
        np.linalg.norm(np.array([row['x'] - row['x_prev_1'], row['y'] - row['y_prev_1'], 0])),
        np.linalg.norm(np.array([row['x_next_1'] - row['x'], row['y_next_1'] - row['y'], 0])),
        np.linalg.norm(np.array([row['x_next_2'] - row['x_next_1'], row['y_next_2'] - row['y_next_1'], 0]))
    ]
    return np.std(distances) / np.mean(distances)

# Usage:
# df['adaptive_sparsity'] = df.apply(adaptive_sparsity_measure, axis=1)
# df['local_density_variation'] = df.apply(local_density_variation, axis=1)


In [7]:
DF['adaptive_sparsity'] = DF.apply(adaptive_sparsity_measure, axis=1)
DF['local_density_variation'] = DF.apply(local_density_variation, axis=1)
DF

Unnamed: 0,x,y,class,x_prev_2,y_prev_2,x_prev_1,y_prev_1,x_next_1,y_next_1,x_next_2,y_next_2,adaptive_sparsity,local_density_variation
2,-540349.3800,-1105525.130,1,-540378.9400,-1105517.380,-540364.6300,-1105520.880,-540340.3800,-1105531.880,-540336.0600,-1105541.880,1.354057,0.162731
3,-540340.3800,-1105531.880,1,-540364.6300,-1105520.880,-540349.3800,-1105525.130,-540336.0600,-1105541.880,-540334.6300,-1105554.250,1.107161,0.154616
4,-540336.0600,-1105541.880,1,-540349.3800,-1105525.130,-540340.3800,-1105531.880,-540334.6300,-1105554.250,-540334.1900,-1105567.130,1.167280,0.069377
5,-540334.6300,-1105554.250,1,-540340.3800,-1105531.880,-540336.0600,-1105541.880,-540334.1900,-1105567.130,-540336.0600,-1105581.880,1.266995,0.110967
6,-540334.1900,-1105567.130,1,-540336.0600,-1105541.880,-540334.6300,-1105554.250,-540336.0600,-1105581.880,-540339.3800,-1105595.630,1.387779,0.071059
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6378,704081.2154,2998587.073,1,704100.5248,2998592.274,704090.7949,2998589.663,704071.7641,2998583.803,704062.4201,2998580.345,0.996223,0.005559
6379,704071.7641,2998583.803,1,704090.7949,2998589.663,704081.2154,2998587.073,704062.4201,2998580.345,704053.1835,2998576.921,0.998217,0.005600
6380,704062.4201,2998580.345,1,704081.2154,2998587.073,704071.7641,2998583.803,704053.1835,2998576.921,704043.9329,2998573.270,0.990708,0.005566
6381,704053.1835,2998576.921,1,704071.7641,2998583.803,704062.4201,2998580.345,704043.9329,2998573.270,704034.8737,2998569.617,0.989792,0.007929


In [8]:
X = DF.drop(DF.columns[2], axis=1)
Y = DF['class']
X

Unnamed: 0,x,y,x_prev_2,y_prev_2,x_prev_1,y_prev_1,x_next_1,y_next_1,x_next_2,y_next_2,adaptive_sparsity,local_density_variation
2,-540349.3800,-1105525.130,-540378.9400,-1105517.380,-540364.6300,-1105520.880,-540340.3800,-1105531.880,-540336.0600,-1105541.880,1.354057,0.162731
3,-540340.3800,-1105531.880,-540364.6300,-1105520.880,-540349.3800,-1105525.130,-540336.0600,-1105541.880,-540334.6300,-1105554.250,1.107161,0.154616
4,-540336.0600,-1105541.880,-540349.3800,-1105525.130,-540340.3800,-1105531.880,-540334.6300,-1105554.250,-540334.1900,-1105567.130,1.167280,0.069377
5,-540334.6300,-1105554.250,-540340.3800,-1105531.880,-540336.0600,-1105541.880,-540334.1900,-1105567.130,-540336.0600,-1105581.880,1.266995,0.110967
6,-540334.1900,-1105567.130,-540336.0600,-1105541.880,-540334.6300,-1105554.250,-540336.0600,-1105581.880,-540339.3800,-1105595.630,1.387779,0.071059
...,...,...,...,...,...,...,...,...,...,...,...,...
6378,704081.2154,2998587.073,704100.5248,2998592.274,704090.7949,2998589.663,704071.7641,2998583.803,704062.4201,2998580.345,0.996223,0.005559
6379,704071.7641,2998583.803,704090.7949,2998589.663,704081.2154,2998587.073,704062.4201,2998580.345,704053.1835,2998576.921,0.998217,0.005600
6380,704062.4201,2998580.345,704081.2154,2998587.073,704071.7641,2998583.803,704053.1835,2998576.921,704043.9329,2998573.270,0.990708,0.005566
6381,704053.1835,2998576.921,704071.7641,2998583.803,704062.4201,2998580.345,704043.9329,2998573.270,704034.8737,2998569.617,0.989792,0.007929


In [9]:
Y

2       1
3       1
4       1
5       1
6       1
       ..
6378    1
6379    1
6380    1
6381    1
6382    1
Name: class, Length: 6253, dtype: int64

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Creating Features

In [11]:
def compute_curvature(p1, p2, p3):
    points = np.array([p1, p2, p3])
    
    # Check if points are collinear
    if np.abs(np.cross(points[1] - points[0], points[2] - points[0])) < 1e-8:
        return 0  # Return 0 curvature for straight lines
    
    # Fit a circle to the points
    x_m = np.mean(points[:, 0])
    y_m = np.mean(points[:, 1])
    u = points[:, 0] - x_m
    v = points[:, 1] - y_m
    Suv = np.sum(u*v)
    Suu = np.sum(u**2)
    Svv = np.sum(v**2)
    Suuv = np.sum(u**2 * v)
    Suvv = np.sum(u * v**2)
    Suuu = np.sum(u**3)
    Svvv = np.sum(v**3)
    
    A = np.array([[Suu, Suv], [Suv, Svv]])
    B = np.array([Suuv + Suvv, Svvv + Suuu])/2
    
    try:
        uc, vc = np.linalg.solve(A, B)
        xc = x_m + uc
        yc = y_m + vc
        R = np.sqrt((points[:, 0]-xc)**2 + (points[:, 1]-yc)**2).mean()
        return 1/R if R > 1e-10 else 0
    except np.linalg.LinAlgError:
        # If matrix is singular, points are likely collinear
        return 0



In [12]:

def compute_local_linearity(p1, p2, p3, p4, p5):
    """
    Compute local linearity using HuberRegressor, falling back to LinearRegression if convergence fails.
    
    :param p1, p2, p3, p4, p5: Points as (x, y) tuples
    :return: R-squared value indicating linearity
    """
    points = np.array([p1, p2, p3, p4, p5])
    X = points[:, 0].reshape(-1, 1)
    Y = points[:, 1]
    
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            model = HuberRegressor(max_iter=100, epsilon=1.5).fit(X, Y)
        r_squared = model.score(X, Y)
    except Exception:
        # If HuberRegressor fails, fall back to LinearRegression
        model = LinearRegression().fit(X, Y)
        r_squared = model.score(X, Y)
    
    return r_squared


def compute_angle_consistency(p1, p2, p3, p4, p5):
    """
    Compute angle consistency using circular statistics, which is more appropriate for angular data.
    
    :param p1, p2, p3, p4, p5: Points as (x, y) tuples
    :return: A measure of angle consistency
    """
    def angle_between(v1, v2):
        v1_3d = np.append(v1, 0)
        v2_3d = np.append(v2, 0)
        return np.arctan2(np.linalg.norm(np.cross(v1_3d, v2_3d)), np.dot(v1, v2))

    vectors = [
        np.array(p2) - np.array(p1),
        np.array(p3) - np.array(p2),
        np.array(p4) - np.array(p3),
        np.array(p5) - np.array(p4)
    ]
    
    angles = [angle_between(vectors[i], vectors[i+1]) for i in range(len(vectors)-1)]
    
    # Use circular mean and standard deviation
    mean_angle = circmean(angles)
    angle_deviation = np.std(angles)
    
    # Normalize consistency measure
    consistency = 1 - (angle_deviation / np.pi)
    
    return consistency


In [13]:
def compute_angle(p1, p2, p3, epsilon=1e-6):
    v1 = np.array([p2[0] - p1[0], p2[1] - p1[1], 0])
    v2 = np.array([p3[0] - p2[0], p3[1] - p2[1], 0])
    
    # Use cross product for more stable angle calculation
    cross_prod = np.cross(v1, v2)
    dot_prod = np.dot(v1, v2)
    
    angle = np.arctan2(np.linalg.norm(cross_prod), dot_prod)
    return np.degrees(angle)

def compute_distance(p1, p2):
    return np.linalg.norm(np.array(p2) - np.array(p1))

def compute_slope(p1, p2, epsilon=1e-6):
    dx = p2[0] - p1[0]
    dy = p2[1] - p1[1]
    return np.arctan2(dy, dx + epsilon)

def compute_slope_change(p1, p2, p3):
    slope1 = compute_slope(p1, p2)
    slope2 = compute_slope(p2, p3)
    return np.abs(np.arctan2(np.sin(slope2-slope1), np.cos(slope2-slope1)))

In [14]:
features = ['Angle', 'Distance','Slope_Change',
            'Curvature','Angle_Consistency','Local_Linearity']

def Calc_Features(df2, add = np.array([]), mult = np.array([])):
    # Add features to the dataframe
    angles2 = []
    distances2 = []
    curvature2 = []
    slope_change2 = []
    angle_consistency2 = []
    local_linearity2 = []
    for i in range(len(df2)):  # Starting from the second point and ending at the second-to-last point
        p5 = (df2['x_prev_2'][i], df2['y_prev_2'][i])
        p1 = (df2['x_prev_1'][i], df2['y_prev_1'][i])
        p2 = (df2['x'][i], df2['y'][i])
        p3 = (df2['x_next_1'][i], df2['y_next_1'][i])
        p4 = (df2['x_next_2'][i], df2['y_next_2'][i])

        angles2.append(compute_angle(p1, p2, p3))
        curvature2.append(compute_curvature(p1,p2,p3))
        slope_change2.append(compute_slope_change(p1,p2,p3))
        distances2.append(compute_distance(p2, p1))
        angle_consistency2.append(compute_angle_consistency(p5,p1,p2,p3,p4))
        local_linearity2.append(compute_local_linearity(p5,p1,p2,p3,p4))

    # Add the features to the dataframe
    df2['Angle'] = angles2
    df2['Curvature'] = curvature2
    df2['Slope_Change'] = slope_change2
    df2['Distance'] = distances2
    df2['Angle_Consistency'] = angle_consistency2
    df2['Local_Linearity'] =local_linearity2

    adaptive_sparsity = df2['adaptive_sparsity'].to_numpy()
    local_density_variation = df2['local_density_variation'].to_numpy()

    df2 = df2[features]
    if(add.size == 0):
        #df2 = df2.dropna()
        #df2 = df2.reset_index(drop=True)
        add = df2.mean()
        mult = df2.std()
        df2 = (df2 - add)/mult

        df2['local_density_variation'] = local_density_variation
        df2['adaptive_sparsity'] = adaptive_sparsity

        return df2, add, mult
    else: 
        df2 = (df2 - add)/mult 
        df2['local_density_variation'] = local_density_variation
        df2['adaptive_sparsity'] = adaptive_sparsity
        return df2

Running the above on the data

In [15]:
X_train = X_train.reset_index(drop=True)
X_train

Unnamed: 0,x,y,x_prev_2,y_prev_2,x_prev_1,y_prev_1,x_next_1,y_next_1,x_next_2,y_next_2,adaptive_sparsity,local_density_variation
0,-557089.1875,-1184859.000,-557066.8750,-1184825.125,-557078.4375,-1184843.625,-557099.1250,-1184870.750,-557106.8703,-1184878.105,1.707463,0.248116
1,714882.7689,3000750.259,714901.4292,3000757.134,714892.1186,3000753.683,714873.3187,3000746.798,714864.0134,3000743.385,1.001049,0.005935
2,-694296.6513,-1076342.380,-694305.7486,-1076329.972,-694297.8111,-1076341.349,-694288.2860,-1076349.816,-694280.5000,-1076356.125,0.637216,0.503171
3,709967.9559,2999519.681,709982.5220,2999533.296,709975.3239,2999526.451,709960.6162,2999512.938,709952.9966,2999506.621,0.998646,0.004035
4,-538121.0000,-1106481.500,-538244.3800,-1106472.500,-538183.4400,-1106477.250,-538062.0000,-1106485.250,-538024.3800,-1106486.750,6.085176,0.184342
...,...,...,...,...,...,...,...,...,...,...,...,...
4997,727145.5670,3005506.763,727164.2374,3005512.901,727154.9098,3005509.901,727136.3351,3005503.627,727126.9978,3005500.401,0.980285,0.005121
4998,714016.2677,3000471.445,714036.0129,3000472.538,714026.0868,3000471.816,714006.4623,3000471.137,713996.5095,3000470.811,0.981817,0.006958
4999,713668.8530,3000458.203,713688.8917,3000459.070,713678.8237,3000458.642,713658.8144,3000457.731,713649.0470,3000457.298,1.001502,0.011779
5000,712046.6611,3000391.779,712066.5763,3000392.738,712056.5963,3000392.260,712036.6535,3000391.294,712026.7738,3000390.929,0.998309,0.005042


In [16]:
X_train, add, mult = Calc_Features(X_train)
X_train

Unnamed: 0,Angle,Distance,Slope_Change,Curvature,Angle_Consistency,Local_Linearity,local_density_variation,adaptive_sparsity
0,1.432643,0.082735,1.432644,2.103039,-0.504745,0.079726,0.248116,1.707463
1,-0.452992,-0.275854,-0.452991,-0.159695,0.539171,0.074873,0.005935,1.001049
2,-0.452950,-0.618218,-0.452958,-0.159696,-4.118770,0.079438,0.503171,0.637216
3,-0.451854,-0.273855,-0.451853,-0.159636,-0.362942,0.079142,0.004035,0.998646
4,-0.361229,1.867804,-0.361228,-0.159582,0.244371,0.033109,0.184342,6.085176
...,...,...,...,...,...,...,...,...
4997,-0.382987,-0.279978,-0.382986,-0.157059,0.429297,0.074030,0.005121,0.980285
4998,-0.322661,-0.281183,-0.322661,-0.099522,-0.067317,-0.344867,0.006958,0.981817
4999,-0.392100,-0.274900,-0.392100,-0.157142,0.590501,-0.307953,0.011779,1.001502
5000,-0.452353,-0.276266,-0.452353,-0.159695,0.373533,-0.285926,0.005042,0.998309


In [17]:
add

Angle                   1.264887
Distance               16.729252
Slope_Change            0.022076
Curvature               0.002764
Angle_Consistency       0.995764
Local_Linearity     -1780.909941
dtype: float64

In [18]:
mult

Angle                    2.789991
Distance                24.550333
Slope_Change             0.048695
Curvature                0.017309
Angle_Consistency        0.006832
Local_Linearity      22345.900752
dtype: float64

Running on testing data set

In [19]:
X_test = X_test.reset_index(drop=True)
X_test

Unnamed: 0,x,y,x_prev_2,y_prev_2,x_prev_1,y_prev_1,x_next_1,y_next_1,x_next_2,y_next_2,adaptive_sparsity,local_density_variation
0,-615656.2512,-1035478.664,-615833.5224,-1035542.693,-615664.9722,-1035481.814,-615570.3387,-1035444.089,-615482.6842,-1035408.814,5.094062,0.639934
1,711462.1705,3000367.528,711482.2270,3000368.808,711472.1674,3000368.117,711452.2294,3000367.044,711442.3705,3000366.564,0.998356,0.007850
2,715264.8311,3000890.272,715283.4263,3000896.987,715274.1913,3000893.652,715255.5167,3000886.881,715246.2015,3000883.479,0.993212,0.004982
3,-686591.4443,-1088006.474,-686582.6250,-1087967.875,-686587.6875,-1087987.875,-686591.5000,-1088006.750,-686598.3103,-1088049.343,0.962809,0.731952
4,-586612.6762,-1198410.930,-586730.5483,-1198217.255,-586673.7951,-1198310.388,-586555.5261,-1198504.725,-586553.3564,-1198508.284,11.374801,0.550517
...,...,...,...,...,...,...,...,...,...,...,...,...
1246,-587184.0450,-1197458.296,-587219.1024,-1197387.784,-587203.0951,-1197420.593,-587170.9481,-1197483.431,-587161.9523,-1197500.365,3.529246,0.275405
1247,705420.1322,2998818.959,705439.8730,2998822.314,705429.9131,2998820.621,705410.3210,2998817.389,705400.5115,2998815.839,0.992856,0.007545
1248,732765.5487,3006890.824,732785.4026,3006893.536,732775.4692,3006892.180,732755.7221,3006889.481,732745.8421,3006888.187,0.996535,0.004263
1249,-773358.0789,-1001639.815,-773180.0977,-1001841.988,-773290.3455,-1001717.074,-773403.5873,-1001587.957,-773455.1327,-1001528.456,8.587041,0.364739


In [20]:
X_test = Calc_Features(X_test, add=add, mult=mult)
X_test

Unnamed: 0,Angle,Distance,Slope_Change,Curvature,Angle_Consistency,Local_Linearity,local_density_variation,adaptive_sparsity
0,0.285872,-0.303735,0.285873,-0.159649,-0.170428,0.079218,0.639934,5.094062
1,-0.243858,-0.273520,-0.243858,-0.122352,0.400911,-0.167433,0.007850,0.998356
2,-0.399704,-0.276065,-0.399704,-0.155919,0.570234,0.074749,0.004982,0.993212
3,-0.449859,0.091460,-0.449788,-0.159696,-0.367611,0.079720,0.731952,0.962809
4,-0.432171,4.111235,-0.432170,-0.159675,0.602014,0.079517,0.550517,11.374801
...,...,...,...,...,...,...,...,...
1246,-0.196615,1.039219,-0.196614,-0.158121,0.501320,0.079550,0.275405,3.529246
1247,-0.255410,-0.277314,-0.255409,-0.104789,0.427024,0.052167,0.007545,0.992856
1248,-0.453027,-0.273581,-0.453027,-0.159695,0.499108,0.041371,0.004263,0.996535
1249,-0.443474,3.503695,-0.443473,-0.159693,0.507421,0.079528,0.364739,8.587041


## Running the Model

In [21]:
# Initialize and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [22]:
# Make predictions and evaluate
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       977
           1       0.86      0.74      0.80       274

    accuracy                           0.92      1251
   macro avg       0.90      0.86      0.87      1251
weighted avg       0.92      0.92      0.92      1251



In [23]:
# Optionally, print feature importances
print("Feature Importances:", model.feature_importances_)

Feature Importances: [0.19117238 0.06709646 0.21660157 0.08870232 0.18581728 0.07972226
 0.0788759  0.09201183]


Saving normalisation coefficients

## Training Model on Full Dataset

In [24]:
X = X.reset_index(drop=True)
Y_full = DF['class'].reset_index(drop=True)

In [25]:
X_full, add_full, mult_full = Calc_Features(X)
X_full

Unnamed: 0,Angle,Distance,Slope_Change,Curvature,Angle_Consistency,Local_Linearity,local_density_variation,adaptive_sparsity
0,7.013732,-0.041593,7.013732,0.413413,-8.807065,0.080631,0.162731,1.354057
1,9.982498,-0.226178,9.982499,3.390455,-3.719959,0.081329,0.154616,1.107161
2,5.426612,-0.240553,5.426613,2.218275,-7.649587,0.081573,0.069377,1.167280
3,1.173363,-0.177731,1.173363,0.290935,-3.412441,0.081647,0.110967,1.266995
4,2.766435,-0.160199,2.766436,0.473035,-0.888423,0.081677,0.071059,1.387779
...,...,...,...,...,...,...,...,...
6248,0.934213,-0.279628,0.934214,0.499027,-0.680745,0.073794,0.005559,0.996223
6249,-0.023537,-0.276503,-0.023536,-0.064965,-0.701971,0.075237,0.005600,0.998217
6250,-0.441487,-0.278021,-0.441487,-0.165530,0.173935,0.076233,0.005566,0.990708
6251,-0.032383,-0.282554,-0.032383,1.760176,0.231372,0.076658,0.007929,0.989792


In [26]:
# Initialize and train the Random Forest Classifier
model_full = RandomForestClassifier(n_estimators=100, random_state=42)
model_full.fit(X_full, Y_full)

In [27]:
import joblib

# Save the model to a file
joblib.dump(model_full, 'random_forest_model.pkl')

#Save Coefficients
add_full.to_pickle('add_coef.pkl')
mult_full.to_pickle('mult_coef.pkl')

print("Model saved!")

Model saved!


In [None]:
add_full

Angle                   1.290686
Distance               16.863415
Slope_Change            0.022527
Curvature               0.002703
Angle_Consistency       0.995712
Local_Linearity     -1642.256571
dtype: float64

In [None]:
mult_full

Angle                    2.852500
Distance                24.818586
Slope_Change             0.049786
Curvature                0.016319
Angle_Consistency        0.006891
Local_Linearity      20109.379424
dtype: float64

In [None]:
# Make predictions and evaluate
y_pred_full = model_full.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred_full))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       977
           1       0.94      0.89      0.91       274

    accuracy                           0.96      1251
   macro avg       0.96      0.94      0.95      1251
weighted avg       0.96      0.96      0.96      1251



# Testing Py Script

In [None]:
test3 = pd.read_csv("output.csv")
test3['Curve']

0       1
1       1
2       1
3       1
4       1
       ..
3995    1
3996    1
3997    1
3998    1
3999    1
Name: Curve, Length: 4000, dtype: int64