In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the labelled training set and the unlabelled test set from CSV files
# located in the notebook's working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Count the null entries in every column of the training frame and print
# the per-column totals.
missing_values = train_data.isna().sum()
print(missing_values)

Unnamed: 0           0
id                   0
year                 0
manufacturer     11342
model                0
condition       116104
cylinders       119300
fuel              1453
odometer             0
title_status      5066
transmission         0
drive            88087
size            207684
type             62596
paint_color      87113
state                0
lat               4403
long              4403
posting_date         0
price                0
dtype: int64


In [4]:
# Fill missing values in `train_data` in place: median for numeric columns,
# most frequent value for categorical columns.
# NOTE(review): every imputer below is fitted on the full frame, i.e. before
# the train/test split performed later in the notebook, so held-out rows
# influence the fill values. Consider fitting on the training split only.

# Numeric columns that can be imputed as-is. 'cylinders' is NOT in this list;
# it is handled separately below because it has to be parsed from text first.
numerical_cols = ['lat', 'long']

# Median imputation for the directly numeric columns.
numerical_imputer = SimpleImputer(strategy='median')
train_data[numerical_cols] = numerical_imputer.fit_transform(train_data[numerical_cols])

# 'cylinders': keep the first run of digits of each entry and convert it to a
# number; entries without digits (including missing ones) become NaN.
# Casting to str first keeps this cell re-runnable: on a second run the column
# is already numeric and a bare `.str` accessor would raise AttributeError.
# `expand=False` returns a Series rather than a one-column DataFrame.
cylinders_digits = train_data['cylinders'].astype(str).str.extract(r'(\d+)', expand=False)
train_data['cylinders'] = pd.to_numeric(cylinders_digits, errors='coerce')

# Median imputation for 'cylinders'. fit_transform returns an (n_rows, 1)
# array, so flatten it before assigning it to a single column.
cylinders_imputer = SimpleImputer(strategy='median')
train_data['cylinders'] = cylinders_imputer.fit_transform(train_data[['cylinders']]).ravel()

# Categorical columns: replace missing entries with each column's mode.
categorical_cols = ['manufacturer', 'condition', 'fuel', 'title_status', 'drive', 'size', 'type', 'paint_color']
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_cols] = categorical_imputer.fit_transform(train_data[categorical_cols])


In [5]:
# One-hot encode the categorical columns into a dense matrix.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 (and the old name later removed), so try the current
# spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

encoded_values = encoder.fit_transform(train_data[categorical_cols])

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(categorical_cols)
else:
    encoded_columns = encoder.get_feature_names(categorical_cols)

# Reuse train_data's index so the column-wise concat below aligns row-for-row
# even if the frame's index is not a default 0..n-1 range.
encoded_features = pd.DataFrame(encoded_values, columns=encoded_columns, index=train_data.index)

# Final feature matrix: the numeric columns listed in `numerical_cols` plus
# the one-hot columns.
# NOTE(review): `numerical_cols` only holds 'lat' and 'long', so 'year',
# 'odometer' and the imputed 'cylinders' column are not used as features —
# confirm whether that is intended.
train_data_encoded = pd.concat([train_data[numerical_cols], encoded_features], axis=1)



In [6]:
# Target: the listing price column of the training frame.
y = train_data['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Registry of candidate regressors, keyed by the name used when reporting.
# Each candidate is registered on its own line; uncomment a line to include
# that model in the evaluation. Only SVR is active at the moment.
# NOTE(review): scikit-learn documents SVR's fit time as more than quadratic
# in the number of samples, and train.csv appears to have well over 200k rows
# (see the missing-value counts above) — this fit may not finish in
# reasonable time; consider subsampling or a different estimator.
models = {}
# models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
# models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)
models['SVR'] = SVR()
# models['Linear Regression'] = LinearRegression()
# models['Ridge Regression'] = Ridge()
# models['Lasso Regression'] = Lasso()
# models['ElasticNet Regression'] = ElasticNet()
# models['K-Nearest Neighbors'] = KNeighborsRegressor()

In [None]:
# Fit every registered model on the training split, score it on the held-out
# split, and store its metrics in `results` under the model's name.
results = {}
for model_name, model in models.items():
    # Train the model, then predict on the held-out rows.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Mean squared error and coefficient of determination on the held-out set.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    scores = {'MSE': mse, 'R-squared': r2}
    results[model_name] = scores
    # Echo the metrics as soon as each model finishes.
    print(scores)

In [None]:
# Print a short report per model: its name, MSE to two decimals, R-squared to
# four decimals, then a blank separator line.
for model_name, result in results.items():
    report_lines = [
        f"{model_name}:",
        f"  MSE: {result['MSE']:.2f}",
        f"  R-squared: {result['R-squared']:.4f}",
        "",  # yields the trailing blank line between models
    ]
    print("\n".join(report_lines))