# Building a Model for Prediction


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/raw/used_cars_engineered.csv')

In [3]:
# Select features and target
features = [
    'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine_size',
    'transmission', 'ext_col', 'int_col', 'car_age', 'milage_per_year'
]
target = 'price'

# Drop rows with missing values in selected columns
df_model = df[features + [target]].dropna()

# One-hot encode categorical features
X = pd.get_dummies(df_model[features], drop_first=True)
y = df_model[target]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Linear Regression R2:", r2_score(y_test, y_pred_lr))
print("Linear Regression RMSE:", mean_squared_error(y_test, y_pred_lr))

Linear Regression R2: 0.6433030472292012
Linear Regression RMSE: 1266053486.0010257


In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

# Scale features for KNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)

print("KNN Regression R2:", r2_score(y_test, y_pred_knn))
print("KNN Regression RMSE:", mean_squared_error(y_test, y_pred_knn))

KNN Regression R2: 0.24593267099240057
KNN Regression RMSE: 2676472459.7549453


In [9]:
sample = {
    'brand': 'toyota',
    'model': 'camry se',
    'model_year': 2018,
    'milage': 30000,
    'fuel_type': 'gasoline',
    'engine_size': 2.5,
    'transmission': 'a/t',
    'ext_col': 'gray',
    'int_col': 'black',
    'car_age': 7,
    'milage_per_year': 4285.7
}

In [None]:
# Just creating a new dataframe for the sample
# I just created
sample_df = pd.DataFrame([sample])

# OneHotEncoder --- to use the same columns as training data
sample_X = pd.get_dummies(sample_df)
sample_X = sample_X.reindex(columns=X.columns, fill_value=0)

In [11]:
pred_lr = lr.predict(sample_X)
print("Linear Regression predicted price:", pred_lr[0])

Linear Regression predicted price: 35383.52864581323


In [12]:
sample_X_scaled = scaler.transform(sample_X)
pred_knn = knn.predict(sample_X_scaled)
print("KNN predicted price:", pred_knn[0])

KNN predicted price: 37878.0
