# Essential Libraries

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy import stats
import joblib 


# Load Dataset

In [22]:
# Read the car listings CSV into a DataFrame.
file_path = 'CarsData.csv'  # relative path; adjust if the file lives elsewhere
data = pd.read_csv(file_path)

# Print the schema summary (dtypes, non-null counts), then show the first
# five rows as the cell's displayed result.
data.info()
data.head(n=5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97712 entries, 0 to 97711
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         97712 non-null  object 
 1   year          97712 non-null  int64  
 2   price         97712 non-null  int64  
 3   transmission  97712 non-null  object 
 4   mileage       97712 non-null  int64  
 5   fuelType      97712 non-null  object 
 6   tax           97712 non-null  int64  
 7   mpg           97712 non-null  float64
 8   engineSize    97712 non-null  float64
 9   Manufacturer  97712 non-null  object 
dtypes: float64(2), int64(4), object(4)
memory usage: 7.5+ MB


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Manufacturer
0,I10,2017,7495,Manual,11630,Petrol,145,60.1,1.0,hyundi
1,Polo,2017,10989,Manual,9200,Petrol,145,58.9,1.0,volkswagen
2,2 Series,2019,27990,Semi-Auto,1614,Diesel,145,49.6,2.0,BMW
3,Yeti Outdoor,2017,12495,Manual,30960,Diesel,150,62.8,2.0,skoda
4,Fiesta,2017,7999,Manual,19353,Petrol,125,54.3,1.2,ford


In [23]:
# Column groups: numeric columns get median imputation, categorical columns
# get most-frequent imputation.
numerical_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
categorical_columns = ['model', 'transmission', 'fuelType', 'Manufacturer']

# Report how many values are missing in each column before filling anything.
missing_values = data.isna().sum()
print("Missing values per column:\n", missing_values)

# One imputer per column group.
numerical_imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on its column group, then write the filled values back
# into the same columns of `data` (this overwrites the frame in place).
numerical_imputer.fit(data[numerical_columns])
data[numerical_columns] = numerical_imputer.transform(data[numerical_columns])

categorical_imputer.fit(data[categorical_columns])
data[categorical_columns] = categorical_imputer.transform(data[categorical_columns])


Missing values per column:
 model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
Manufacturer    0
dtype: int64


# Handle Outliers

In [24]:
# Absolute z-score of every value in the numeric columns.
z_scores = np.abs(stats.zscore(data[numerical_columns]))

# A cell is flagged when it lies more than 3 standard deviations from its
# column mean.
outliers = z_scores > 3

# Keep only the rows in which no numeric column is flagged.
# Note: this rebinds `data`, so re-running the cell filters the already
# filtered frame again.
is_outlier_row = outliers.any(axis=1)
data = data[~is_outlier_row]


# Features and Target

In [25]:
# Separate the predictors from the value to predict.
X = data.drop(columns=['price'])
y = data['price']

# Columns routed to the numeric (scaling) and categorical (one-hot) branches
# of the preprocessor.
numeric_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
categorical_features = ['model', 'transmission', 'fuelType', 'Manufacturer']

# Fail early, with a readable message, if a listed feature is absent from X.
assert set(numeric_features + categorical_features) <= set(X.columns), \
    "feature lists reference columns that are missing from X"

# The transformers and the ColumnTransformer are built in the next cell from
# these two lists. This cell previously held an identical copy of that code,
# which the next cell overwrote, so it is not repeated here.


# Pre-processing Pipelines

In [26]:
# Preprocessing for numeric data: fill gaps with the column median, then
# standardise. Keeping the imputer inside the pipeline means its statistics
# are learned only from the rows passed to fit() and are stored with the
# fitted pipeline, so the persisted model can also handle missing values at
# prediction time.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: fill gaps with the most frequent
# category, then one-hot encode. handle_unknown='ignore' makes categories not
# seen during fit encode as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer and concatenate the
# results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Model Pipeline

In [27]:
# Full estimator: the shared preprocessor followed by a linear-kernel
# support vector regressor.
regression_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='linear')),
]
model = Pipeline(steps=regression_steps)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and regressor together on the training split only.
model.fit(X_train, y_train)


# Predictions

In [28]:
# Predict prices for the held-out test rows with the fitted pipeline.
y_pred = model.predict(X_test)

# Model Evaluation

In [31]:
# Error metrics on the held-out test set.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target, unlike MSE
r2 = r2_score(y_test, y_pred)

# NOTE: R² is the share of target variance explained by the model, shown
# below as a percentage; it is not a classification-style accuracy.
print(
    "Model Performance:",
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R² Score (Accuracy): {r2*100:.2f}",
    sep="\n",
)

Model Performance:
Mean Absolute Error (MAE): 2573.293906662222
Mean Squared Error (MSE): 17295666.86944151
Root Mean Squared Error (RMSE): 4158.805942748652
R² Score (Accuracy): 79.35


# Save Model

In [30]:
# Persist the whole pipeline object (preprocessing + regressor) to disk.
# joblib files are pickle-based: only load them from sources you trust.
model_file = 'car_price_prediction_model.pkl'
joblib.dump(value=model, filename=model_file)

print(f"Model saved to {model_file}")

Model saved to car_price_prediction_model.pkl
