In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Function to clean the dataset
def clean_dataset(data):
    """Return a cleaned copy of the raw driving dataset.

    The steps are applied in this order:

    1. Remove the ``'Unnamed: 0'`` column when it is present.
    2. If a ``'drivingStyle'`` column exists, discard rows where it is missing.
    3. Fill remaining gaps in numeric columns with that column's median.
    4. Replace every object-dtype column with its integer category codes.

    Args:
        data: The DataFrame to clean. It is not modified; every step
            operates on a new frame.

    Returns:
        The cleaned DataFrame.
    """
    # drop() hands back a new frame, so later column assignments never
    # touch the caller's object.
    cleaned = data.drop(columns=['Unnamed: 0'], errors='ignore')

    # Rows without a label are useless for supervised training.
    has_target = 'drivingStyle' in cleaned.columns
    if has_target:
        cleaned = cleaned.dropna(subset=['drivingStyle'])

    # Medians are computed after the label filter, i.e. only from kept rows.
    numeric_medians = cleaned.median(numeric_only=True)
    cleaned = cleaned.fillna(numeric_medians)

    # Object columns are turned into integer codes. Per pandas, a missing
    # value in such a column is given the code -1.
    text_columns = cleaned.select_dtypes(include=['object']).columns
    for name in text_columns:
        as_category = cleaned[name].astype('category')
        cleaned[name] = as_category.cat.codes

    return cleaned

# Function to preprocess the dataset
def preprocess_data(data, target_column, exclude_columns=None):
    """Encode categoricals, standardise the features and extract the target.

    Neither ``data`` nor ``exclude_columns`` is modified: encoding is done
    on a copy of the frame and the exclusion list is copied before the
    target name is added to it.

    Args:
        data: DataFrame holding both the feature columns and the target.
        target_column: Name of the column to predict. It is always left
            out of the feature matrix.
        exclude_columns: Optional iterable of further column names to leave
            out of the feature matrix. ``None`` means no extra exclusions.

    Returns:
        A ``(features, target)`` tuple. ``features`` is the output of
        ``StandardScaler.fit_transform`` on the selected columns; ``target``
        is the (encoded, if it was object/category) target column.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    # Fail before doing any work instead of after scaling.
    if target_column not in data.columns:
        raise KeyError(target_column)

    # Copy so the in-place column re-encoding below cannot leak back into
    # the caller's DataFrame.
    data = data.copy()

    # Encode categorical variables as integer category codes
    for col in data.select_dtypes(include=['object', 'category']).columns:
        data[col] = data[col].astype('category').cat.codes

    # Build a fresh list: appending to the caller's list would make it grow
    # on every call that reuses it.
    excluded = list(exclude_columns) if exclude_columns is not None else []
    excluded.append(target_column)  # Ensure target is excluded from features

    # NOTE: Index.difference returns the remaining labels sorted, so the
    # feature columns are in sorted order, not in the frame's own order.
    feature_columns = data.columns.difference(excluded)
    features = data.loc[:, feature_columns]

    # Feature scaling.
    # NOTE(review): the scaler is fitted on every row passed in. If the
    # result is split into train/test sets afterwards, the test rows have
    # already influenced the scaling statistics.
    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    # Define the target variable
    target = data[target_column]

    return features, target

# Function to train and evaluate the SVM model
def train_and_evaluate_svm(features, target, test_size=0.2, random_state=42):
    """Fit a linear, class-balanced SVM and score it on a held-out split.

    Args:
        features: Feature matrix passed to ``train_test_split``.
        target: Labels aligned with ``features``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Seed used for both the split and the ``SVC``.

    Returns:
        A ``(accuracy, report)`` tuple: the ``accuracy_score`` and the
        ``classification_report`` text, both computed on the held-out split.
    """
    # Hold out part of the data for evaluation
    split = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    train_features, held_out_features, train_labels, held_out_labels = split

    # class_weight='balanced' reweights classes inversely to their frequency
    classifier = SVC(
        kernel='linear',
        random_state=random_state,
        class_weight='balanced',
    )
    classifier.fit(train_features, train_labels)

    # Score the predictions for the held-out rows
    predicted_labels = classifier.predict(held_out_features)
    return (
        accuracy_score(held_out_labels, predicted_labels),
        classification_report(held_out_labels, predicted_labels),
    )

# Read the semicolon-separated CSV and run it through the cleaning step
file_path = 'C:\\Users\\William\\My Drive\\College\\MEng in Connected & Autonomous Vehicles\\Semester_3\\Machine Learning (COMP09012)\\Assignment_1\\opel_corsa_02.csv'
raw_data = pd.read_csv(file_path, sep=';')
cleaned_data = clean_dataset(raw_data)

# Build the scaled feature matrix and the label vector
target_column = 'drivingStyle'
exclude_columns = []  # No extra exclusions; preprocess_data leaves the target out itself
features, target = preprocess_data(
    cleaned_data,
    target_column,
    exclude_columns=exclude_columns,
)

# Fit the SVM with the default split settings and score it on the held-out rows
svm_accuracy, svm_report = train_and_evaluate_svm(features=features, target=target)

# Print the SVM metrics
print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", svm_report)


SVM Accuracy: 0.6532356532356532
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.78      0.47       158
           1       0.92      0.62      0.74       661

    accuracy                           0.65       819
   macro avg       0.63      0.70      0.60       819
weighted avg       0.81      0.65      0.69       819

