In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = 'C:\\Users\\William\\My Drive\\College\\MEng in Connected & Autonomous Vehicles\\Semester_3\\Machine Learning (COMP09012)\\Assignment_1\\peugeot_207_01.csv'  # Update with your file path if needed
raw_dataset = pd.read_csv(file_path, delimiter=';')

# Data cleaning
# Impute missing values for numeric columns using mean
imputer = SimpleImputer(strategy="mean")
numeric_cols = raw_dataset.select_dtypes(include=["float64", "int64"]).columns
raw_dataset[numeric_cols] = imputer.fit_transform(raw_dataset[numeric_cols])

# Encode categorical variables
encoder = LabelEncoder()
for col in ['roadSurface', 'traffic', 'drivingStyle']:
    raw_dataset[col] = encoder.fit_transform(raw_dataset[col])

# Feature selection
features = raw_dataset.loc[:, raw_dataset.columns.difference(['drivingStyle', 'Unnamed: 0'])]  # Exclude target and identifier
target = raw_dataset['drivingStyle']  # Target variable

# Scale the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(features_scaled, target)

# Split the resampled dataset into training and testing sets
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

# Train Logistic Regression with class weighting
model_weighted = LogisticRegression(max_iter=200, class_weight='balanced')
model_weighted.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_resampled = model_weighted.predict(X_test_resampled)

# Evaluate the model
accuracy_resampled = accuracy_score(y_test_resampled, y_pred_resampled)
report_resampled = classification_report(y_test_resampled, y_pred_resampled)

# Output results
print("Accuracy after addressing imbalance:", accuracy_resampled)
print("Classification Report after addressing imbalance:\n", report_resampled)


Accuracy after addressing imbalance: 0.7518626498218335
Classification Report after addressing imbalance:
               precision    recall  f1-score   support

           0       0.74      0.77      0.75      1532
           1       0.76      0.74      0.75      1555

    accuracy                           0.75      3087
   macro avg       0.75      0.75      0.75      3087
weighted avg       0.75      0.75      0.75      3087

