In [1]:
# Import necessary libraries
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Load the dataset
file_path = 'C:\\Users\\William\\My Drive\\College\\MEng in Connected & Autonomous Vehicles\\Semester_3\\Machine Learning (COMP09012)\\Assignment_1\\peugeot_207_02.csv'  # Replace with your dataset path
data = pd.read_csv(file_path, delimiter=';')

# Data cleaning and preparation
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].mean())

# Encode categorical labels
data['roadSurface'] = data['roadSurface'].astype('category').cat.codes
data['traffic'] = data['traffic'].astype('category').cat.codes
data['drivingStyle'] = data['drivingStyle'].astype('category').cat.codes

# Feature scaling
scaler = StandardScaler()
features = data.loc[:, data.columns.difference(['drivingStyle', 'Unnamed: 0'])]  # Exclude target and index column
features_scaled = scaler.fit_transform(features)
target = data['drivingStyle']  # Target variable is 'drivingStyle'

# Check initial class distribution
print("Initial Class Distribution in Target Variable:\n", target.value_counts())

# Apply SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(features_scaled, target)

# Check class distribution after SMOTE
print("\nClass Distribution After SMOTE:\n", pd.Series(y_resampled).value_counts())

# Split the resampled dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Train SVM with linear kernel
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output results
print("\nAccuracy:", accuracy)
print("\nClassification Report:\n", report)


Initial Class Distribution in Target Variable:
 drivingStyle
1    4259
0     187
Name: count, dtype: int64

Class Distribution After SMOTE:
 drivingStyle
1    4259
0    4259
Name: count, dtype: int64

Accuracy: 0.7423708920187794

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.81      0.76       861
           1       0.77      0.68      0.72       843

    accuracy                           0.74      1704
   macro avg       0.75      0.74      0.74      1704
weighted avg       0.75      0.74      0.74      1704

