In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the training data and preview its first rows.
# NOTE: the location is an absolute, machine-specific Windows path.
train_csv_path = r'C:\Users\omarf\OneDrive\Documents\train.csv'
train = pd.read_csv(train_csv_path)
train.head()

In [None]:
# Load the test data and preview its first rows.
# NOTE: the location is an absolute, machine-specific Windows path.
test_csv_path = r'C:\Users\omarf\OneDrive\Documents\test.csv'
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Load the sample submission file and preview its first rows.
# NOTE: the location is an absolute, machine-specific Windows path.
submission_csv_path = r'C:\Users\omarf\OneDrive\Documents\sample_submission.csv'
submission = pd.read_csv(submission_csv_path)
submission.head()

In [None]:
# Print the column dtypes / non-null counts, then display the number of
# missing values per column as the cell output.
train.info()
train.isna().sum()

In [None]:
# Collect the distinct values of three categorical columns of the training set.
fuel_types, accident_types, clean_title_types = (
    train[column].unique()
    for column in ('fuel_type', 'accident', 'clean_title')
)

# Print each set of distinct values under its own heading.
for heading, distinct_values in (
    ("Fuel Types:", fuel_types),
    ("Accident Types:", accident_types),
    ("Clean Title Types:", clean_title_types),
):
    print(heading, distinct_values)

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# Preprocessing: label-encode every text column and run KNN imputation over
# train and test stacked together, so both halves share one set of integer
# codes. Produces train_imputed (with 'price') and test_imputed.

# Combine train and test for consistent encoding and imputation
train_test = pd.concat([train.drop(columns=['price']), test], axis=0, ignore_index=True)

# Encode categorical variables; the fitted encoder of each column is kept in
# label_encoders, keyed by column name.
cat_cols = train_test.select_dtypes(include=['object']).columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # NOTE(review): astype(str) appears to turn missing values into the literal
    # label 'nan' (pandas < 3), which then receives its own integer code. If so,
    # KNNImputer below has nothing left to fill in these columns -- confirm
    # whether a dedicated "missing" category is the intended behaviour.
    train_test[col] = train_test[col].astype(str)
    train_test[col] = le.fit_transform(train_test[col])
    label_encoders[col] = le

# KNN Imputation
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(train_test)

# Convert back to DataFrame, restoring the original column names
train_test_imputed = pd.DataFrame(imputed_values, columns=train_test.columns)

# Split back into train and test. The explicit copies make both halves
# independent frames, so adding columns to them is never an assignment into a
# slice of train_test_imputed (which raises SettingWithCopyWarning).
train_imputed = train_test_imputed.iloc[:len(train), :].copy()
test_imputed = train_test_imputed.iloc[len(train):, :].copy()

# Add back the price column to train (positional: rows are still in train order)
train_imputed['price'] = train['price'].values

In [None]:
# Feature engineering for the regression models.
# train_fe is built as a new frame; train_imputed itself is left unchanged.

# Steps 1-3: derived, interaction and polynomial features (column order matters
# for the models fitted later, so it is kept fixed here).
train_with_features = train_imputed.assign(
    # Age of the car relative to 2024
    car_age=lambda df: 2024 - df['model_year'],
    # Mileage per year; the +1 avoids a zero denominator for a car_age of 0
    milage_per_year=lambda df: df['milage'] / (df['car_age'] + 1),
    # Brand x Model interaction (product of the numeric codes)
    brand_model_interaction=lambda df: df['brand'] * df['model'],
    # Engine x Transmission interaction
    engine_transmission=lambda df: df['engine'] * df['transmission'],
    # Squared terms
    milage_squared=lambda df: df['milage'] ** 2,
    car_age_squared=lambda df: df['car_age'] ** 2,
)

# Step 4: mean price per brand code and per model code, computed on the
# training rows and mapped back onto every row.
brand_mean_price = train_with_features['price'].groupby(train_with_features['brand']).mean()
model_mean_price = train_with_features['price'].groupby(train_with_features['model']).mean()

# Steps 5-6: nothing is dropped (every column is numeric at this point); any
# NaN that might remain is replaced by 0.
train_fe = train_with_features.assign(
    brand_mean_price=lambda df: df['brand'].map(brand_mean_price),
    model_mean_price=lambda df: df['model'].map(model_mean_price),
).fillna(0)

# Preview the finished feature table
train_fe.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Fit two baseline regressors on the full training set and print their RMSE.
# NOTE: both scores are computed on the same rows the models were fitted on,
# so they are in-sample errors, not an estimate of generalisation error.

# Prepare features and target
X_train_full = train_fe.drop(columns=['id', 'price'])
y_train_full = train_fe['price']

# Linear Regression
lr_full = LinearRegression()
lr_full.fit(X_train_full, y_train_full)
y_pred_lr_full = lr_full.predict(X_train_full)
# RMSE is taken as sqrt(MSE): the `squared=False` shortcut was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse_lr_full = np.sqrt(mean_squared_error(y_train_full, y_pred_lr_full))
print(f"Linear Regression RMSE on train set: {rmse_lr_full:.2f}")

# Decision Tree Regressor (fixed random_state for reproducible fits)
dt_full = DecisionTreeRegressor(random_state=42)
dt_full.fit(X_train_full, y_train_full)
y_pred_dt_full = dt_full.predict(X_train_full)
rmse_dt_full = np.sqrt(mean_squared_error(y_train_full, y_pred_dt_full))
print(f"Decision Tree RMSE on train set: {rmse_dt_full:.2f}")

In [None]:
# Importing the submodule explicitly binds `IPython` and guarantees that
# `IPython.display` is loaded, instead of relying on it being imported elsewhere.
import IPython.display

# Predict prices for the test set using the trained Linear Regression model
# and write them to submission_lr.csv.

# Prepare test features (align columns with X_train_full)
test_fe = test_imputed.copy()

# Feature engineering for test set (same as train_fe)
test_fe['car_age'] = 2024 - test_fe['model_year']
test_fe['milage_per_year'] = test_fe['milage'] / (test_fe['car_age'] + 1)
test_fe['brand_model_interaction'] = test_fe['brand'] * test_fe['model']
test_fe['engine_transmission'] = test_fe['engine'] * test_fe['transmission']
test_fe['milage_squared'] = test_fe['milage'] ** 2
test_fe['car_age_squared'] = test_fe['car_age'] ** 2
test_fe['brand_mean_price'] = test_fe['brand'].map(brand_mean_price)
test_fe['model_mean_price'] = test_fe['model'].map(model_mean_price)

# Brands/models that never occur in train have no entry in the lookup and map
# to NaN. Fall back to the overall mean training price for them: filling with 0
# would tell the model that such cars typically sell for nothing.
mean_price_cols = ['brand_mean_price', 'model_mean_price']
global_mean_price = train['price'].mean()
test_fe[mean_price_cols] = test_fe[mean_price_cols].fillna(global_mean_price)

# Any other remaining NaN is replaced by 0, as for the training features
test_fe = test_fe.fillna(0)

# Select the same columns, in the same order, as X_train_full
X_test = test_fe[X_train_full.columns]

# Predict
y_test_pred_lr = lr_full.predict(X_test)

# Prepare submission DataFrame (ids come from the raw test file)
submission_lr = pd.DataFrame({
    'id': test['id'],
    'price': y_test_pred_lr
})

# Save to CSV
submission_lr.to_csv('submission_lr.csv', index=False)

# Provide a download link (for Jupyter)
IPython.display.display(IPython.display.FileLink('submission_lr.csv'))