# Random forest regression model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

### Load the data

In [66]:
# Read both CSV files from the working directory. `df` (train.csv) is the frame
# the following cells work on; `df_original` holds the contents of used_cars.csv.
df_original, df = map(pd.read_csv, ("used_cars.csv", "train.csv"))

# Preview the first rows of the train.csv frame.
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


### Dataset size

In [67]:
# Number of rows (examples) in the loaded frame.
rows = len(df)
print(f"Number of examples: {rows}")

Number of examples: 188533


### Getting to know the data

In [68]:
# Show each column's dtype and non-null count; columns whose non-null count is
# below the total row count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


## Filling in missing values

In [69]:
# Replace missing values with the placeholder category 'missing', but only in the
# non-numeric columns. Filling the whole frame would also write the string
# 'missing' into any numeric column that happened to contain NaN, turning it into
# an object column and breaking the numeric scaling done later.
non_numeric_columns = df.select_dtypes(exclude=['number']).columns
df[non_numeric_columns] = df[non_numeric_columns].fillna('missing')

In [70]:
# Re-check the non-null counts to confirm that the fill step left no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     188533 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      188533 non-null  object
 11  clean_title   188533 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


## Encoding the categorical features

In [71]:
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
categorical_columns = ['brand', 'fuel_type', 'transmission', 'model',
                       'ext_col', 'int_col', 'accident', 'engine', 'clean_title']

# One fitted encoder per column, keyed by column name. Refitting a single shared
# encoder on every column would keep only the last column's mapping, making it
# impossible to apply the same string -> integer codes to new data or to decode
# the integers back to their original categories.
label_encoders = {}

# Apply label encoding to each categorical column
for col in categorical_columns:
    # `label_encoder` is rebound on every iteration, so after the loop it still
    # refers to a fitted LabelEncoder (the one for the last column).
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

print("Label Encoded DataFrame:")
print(df)


Label Encoded DataFrame:
            id  brand  model  model_year  milage  fuel_type  engine  \
0            0     31    495        2007  213000          2     116   
1            1     28    930        2002  143250          2     366   
2            2      9   1575        2002  136731          1     640   
3            3     16    758        2017   19500          2     863   
4            4     36   1077        2021    7388          2     259   
...        ...    ...    ...         ...     ...        ...     ...   
188528  188528      8    604        2017   49000          2     866   
188529  188529     36    206        2018   28600          2     770   
188530  188530     36    223        2021   13650          2     921   
188531  188531      3   1471        2022   13895          2     512   
188532  188532     43   1028        2016   59500          2     364   

        transmission  ext_col  int_col  accident  clean_title  price  
0                 38      312       71         1   

## Splitting and Normalizing the data

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the target ('price') and the row identifier ('id').
X = df.drop(['price', 'id'], axis=1)
y = df['price']

# Split BEFORE scaling: 80% train, then the remaining 20% is halved into
# validation and test (10% each). The fixed random_state keeps the split reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Fit the scaler on the training rows only, then apply that same transformation
# to validation and test. Fitting on the full dataset would let the mean/variance
# of the held-out rows leak into the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

## Model

In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def _report_metrics(split_name, y_true, y_pred):
    """Print and return ``(mse, mae)`` for one data split, labelled with ``split_name``."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{split_name} MSE: {mse}")
    print(f"{split_name} MAE: {mae}")
    return mse, mae


# Define a function to construct and train the Random Forest model
def train_random_forest(X_train, y_train, X_val, y_val, n_estimators=100, max_depth=None,
                        random_state=42, n_jobs=None):
    """Fit a RandomForestRegressor and report its error on the train and validation data.

    Parameters
    ----------
    X_train, y_train : training features and targets the forest is fitted on.
    X_val, y_val : validation features and targets, used only for scoring.
    n_estimators : number of trees passed to RandomForestRegressor.
    max_depth : maximum tree depth passed to RandomForestRegressor (None = no limit).
    random_state : seed passed to RandomForestRegressor; defaults to the value
        that used to be hard-coded (42), so existing calls are unaffected.
    n_jobs : number of parallel jobs passed to RandomForestRegressor. The default
        (None) is the estimator's own default; pass -1 to use all cores.

    Returns
    -------
    (model, train_mse, train_mae, val_mse, val_mae)
        The fitted model followed by mean squared / mean absolute error on the
        training data and on the validation data. The four metrics are also printed.
    """
    # Initialize the Random Forest model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=n_jobs,
    )

    # Train the model
    model.fit(X_train, y_train)

    # Score on the data the model was fitted on, then on the held-out validation data.
    train_mse, train_mae = _report_metrics("Train", y_train, model.predict(X_train))
    val_mse, val_mae = _report_metrics("Validation", y_val, model.predict(X_val))

    return model, train_mse, train_mae, val_mse, val_mae

# Fit the forest (1000 trees, depth capped at 6); the helper prints the train and
# validation errors and hands back the fitted model together with those metrics.
model, train_mse, train_mae, val_mse, val_mae = train_random_forest(
    X_train,
    y_train,
    X_val,
    y_val,
    n_estimators=1000,
    max_depth=6,
)

# Score the fitted model on the held-out test split.
y_test_pred = model.predict(X_test)
test_mse, test_mae = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error)
)

print(f"Test MSE: {test_mse}")
print(f"Test MAE: {test_mae}")



Train MSE: 5208385211.181616
Train MAE: 19958.5923162194
Validation MSE: 4013633670.0266194
Validation MAE: 19938.64673782685
Test MSE: 5457154657.38305
Test MAE: 20488.4877413339


## Save the results

In [78]:
import pandas as pd

# Load the test dataset
test_df = pd.read_csv("test.csv")

# Fill gaps in the non-numeric columns with the same placeholder that was used
# for the training data ('missing', lower-case), so that an absent value lands in
# the same category in both datasets.
non_numeric_columns = test_df.select_dtypes(exclude=['number']).columns
test_df[non_numeric_columns] = test_df[non_numeric_columns].fillna('missing')

# Encode every categorical column with the mapping learned from the TRAINING data.
# Refitting an encoder on the test set would assign different integers to the same
# category. `df` already holds integer codes at this point, so the raw training
# categories are re-read from disk; fitting a LabelEncoder on the same values
# reproduces the same codes.
train_categories = pd.read_csv("train.csv", usecols=categorical_columns).fillna('missing')
for col in categorical_columns:
    col_encoder = LabelEncoder().fit(train_categories[col])
    code_by_category = {category: code for code, category in enumerate(col_encoder.classes_)}
    # Categories that never occurred in training have no code; mark them with -1
    # instead of letting the encoder raise on an unseen label.
    test_df[col] = test_df[col].map(code_by_category).fillna(-1).astype(int)

# Select the features in exactly the column order used for training. The model
# was fitted on a plain array, so a different column order would not raise an
# error; it would silently feed the wrong feature into each position.
feature_columns = df.drop(['price', 'id'], axis=1).columns
X_test_pred = test_df[feature_columns]

# Scale with the statistics learned during training: transform only, no refit.
X_test_pred = scaler.transform(X_test_pred)

# Make predictions on the test data
predictions = model.predict(X_test_pred)

# Prepare a DataFrame to save the results
results = pd.DataFrame({
    'id': test_df['id'],
    'price': predictions
})

# Save the results to a CSV file
results.to_csv("prediction_rf.csv", index=False)