In [None]:
from scipy.io import arff

# Data Processing
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import pandas as pd

# Replace 'your_file.arff' with the path to your ARFF file
arff_file_path = r"C:\Users\ronan\Fourth_Year\Data_Science\Data_Science\Random Forest\kick.arff"


# Load the ARFF file
data, meta = arff.loadarff(arff_file_path)
df = pd.DataFrame(data)

# Save it as CSV
df.to_csv('converted_file.csv', index=False)
print("File successfully converted to CSV and saved as 'converted_file.csv'")

File successfully converted to CSV and saved as 'converted_file.csv'


In [7]:
# Load the dataset
data = pd.read_csv('converted_file.csv')

# Display the first few rows of the dataset
print("First 5 rows of the dataset:")
print(data.head())

# Display information about the dataset
print("\nDataset Information:")
print(data.info())

First 5 rows of the dataset:
  IsBadBuy     PurchDate   Auction  VehYear  VehicleAge      Make  \
0     b'0'  1.260144e+09  b'ADESA'   2006.0         3.0  b'MAZDA'   
1     b'0'  1.260144e+09  b'ADESA'   2004.0         5.0  b'DODGE'   
2     b'0'  1.260144e+09  b'ADESA'   2005.0         4.0  b'DODGE'   
3     b'0'  1.260144e+09  b'ADESA'   2004.0         5.0  b'DODGE'   
4     b'0'  1.260144e+09  b'ADESA'   2005.0         4.0   b'FORD'   

                    Model    Trim              SubModel      Color  ...  \
0               b'MAZDA3'    b'i'         b'4D SEDAN I'     b'RED'  ...   
1  b'1500 RAM PICKUP 2WD'   b'ST'  b'QUAD CAB 4.7L SLT'   b'WHITE'  ...   
2           b'STRATUS V6'  b'SXT'   b'4D SEDAN SXT FFV'  b'MAROON'  ...   
3                 b'NEON'  b'SXT'           b'4D SEDAN'  b'SILVER'  ...   
4                b'FOCUS'  b'ZX3'       b'2D COUPE ZX3'  b'SILVER'  ...   

  MMRCurrentRetailAveragePrice MMRCurrentRetailCleanPrice PRIMEUNIT  AUCGUART  \
0                      1

In [8]:
# Drop unnecessary columns (update with the actual column names after inspecting the data)
columns_to_drop = ['PurchDate', 'SubModel', 'VNZIP1', 'BYRNO', 'AUCGUART', 'PRIMEUNIT']
data_cleaned = data.drop(columns=columns_to_drop, axis=1)

# Display the cleaned dataset to confirm columns are dropped
print("Cleaned Dataset (First 5 rows):")
print(data_cleaned.head())

Cleaned Dataset (First 5 rows):
  IsBadBuy   Auction  VehYear  VehicleAge      Make                   Model  \
0     b'0'  b'ADESA'   2006.0         3.0  b'MAZDA'               b'MAZDA3'   
1     b'0'  b'ADESA'   2004.0         5.0  b'DODGE'  b'1500 RAM PICKUP 2WD'   
2     b'0'  b'ADESA'   2005.0         4.0  b'DODGE'           b'STRATUS V6'   
3     b'0'  b'ADESA'   2004.0         5.0  b'DODGE'                 b'NEON'   
4     b'0'  b'ADESA'   2005.0         4.0   b'FORD'                b'FOCUS'   

     Trim      Color Transmission WheelTypeID  ...  \
0    b'i'     b'RED'      b'AUTO'        b'1'  ...   
1   b'ST'   b'WHITE'      b'AUTO'        b'1'  ...   
2  b'SXT'  b'MAROON'      b'AUTO'        b'2'  ...   
3  b'SXT'  b'SILVER'      b'AUTO'        b'1'  ...   
4  b'ZX3'  b'SILVER'    b'MANUAL'        b'2'  ...   

  MMRAcquisitionRetailAveragePrice  MMRAcquisitonRetailCleanPrice  \
0                          11636.0                        13600.0   
1                          108


markdown
Copy code
### **Explanation of Dropped Columns**

1. **`PurchDate`**:
   - **Reason**: The purchase date does not provide any significant information about whether the vehicle is a "bad buy" or not. If temporal information is relevant, it can be derived (e.g., age of the vehicle at purchase) rather than using the full date.

2. **`SubModel`**:
   - **Reason**: This column may contain detailed textual data about the vehicle's submodel, which is often redundant when the `Model` column already exists. Including both could lead to overfitting or unnecessary complexity.

3. **`VNZIP1`**:
   - **Reason**: This column likely represents the ZIP code where the vehicle was purchased or sold. ZIP codes can add high cardinality without meaningful information for this task. 

4. **`BYRNO`**:
   - **Reason**: This column appears to be an identifier for the buyer or transaction. Such identifiers are unique but do not contribute predictive value to the model.

5. **`AUCGUART`**:
   - **Reason**: This column contains missing or placeholder values (e.g., `b'?'`) and may not provide meaningful information.

6. **`PRIMEUNIT`**:
   - **Reason**: Similar to `AUCGUART`, this column seems to have placeholder or unclear values (`b'?'`) and lacks a clear connection to predicting the target variable.

---

### **Why Drop Columns?**
Dropping these columns simplifies the dataset by removing irrelevant, redundant, or high-cardinality data that does not meaningfully contribute to the prediction task. This reduces noise and improves the efficiency of the model training process.

In [10]:
# Identify categorical columns for one-hot encoding
categorical_columns = ['Auction', 'Make', 'Model', 'Color', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName']

# Apply one-hot encoding
data_encoded = pd.get_dummies(data_cleaned, columns=categorical_columns, drop_first=True)

# Display the dimensions (number of rows and columns) of the encoded dataset
print(f"Encoded Dataset Dimensions: {data_encoded.shape}")

Encoded Dataset Dimensions: (72983, 1156)


In [11]:
# Define features (X) and target (y)
X = data_encoded.drop('IsBadBuy', axis=1)
y = data_encoded['IsBadBuy']

# Display the shapes of X and y to confirm the split
print(f"Features Shape: {X.shape}")
print(f"Target Shape: {y.shape}")

Features Shape: (72983, 1155)
Target Shape: (72983,)


In [12]:
from sklearn.model_selection import train_test_split

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# Display the shapes of the resulting datasets
print(f"Training Features Shape: {X_train.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print(f"Testing Target Shape: {y_test.shape}")

Training Features Shape: (14596, 1155)
Training Target Shape: (14596,)
Testing Features Shape: (58387, 1155)
Testing Target Shape: (58387,)
