In [21]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [22]:
# Resolve the CSV location. The default is the original local path; set the
# CAR_SALES_CSV environment variable to load the data from somewhere else
# without editing the notebook.
import os

DATA_PATH = os.environ.get("CAR_SALES_CSV", r"E:\Price_Prediction\data\car_sales_data.csv")
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the leading rows of the frame loaded from the CSV.
df.head()

In [None]:
# Scatter of price against mileage, drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(df['Mileage'], df['Price'])
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Provisional feature matrix (one-column DataFrame) and target (Series),
# taken from the frame as it stands when this cell runs.
X = df[['Mileage']]
y = df['Price']

# Preview only the leading rows instead of printing both objects in full.
print(X.head())
print(y.head())

In [None]:
# Data cleaning and outlier handling

# Columns that must be numeric for modelling. Commas are stripped before
# parsing, and values that still cannot be parsed are coerced to NaN.
numeric_cols = ['Mileage', 'Price']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace(',', ''), errors='coerce')

# Discard rows where either column is missing, then renumber the index.
df.dropna(subset=numeric_cols, inplace=True)
df.reset_index(drop=True, inplace=True)
print("Data types converted and missing values dropped.")


# IQR rule: a row is an outlier when either column falls more than
# 1.5 * IQR below the first quartile or above the third quartile.
print(f"\nShape before outlier removal: {df.shape}")

quartiles = df[numeric_cols].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Per-column comparisons against the per-column bounds, combined row-wise.
values = df[numeric_cols]
below_range = values.lt(lower_bound)
above_range = values.gt(upper_bound)
is_outlier = (below_range | above_range).any(axis=1)

# Keep an independent copy of the rows that are inside the bounds.
df_cleaned = df.loc[~is_outlier].copy()

print(f"Shape after outlier removal: {df_cleaned.shape}")


# Rebuild the feature matrix and target from the outlier-filtered frame.
X = df_cleaned[['Mileage']]
y = df_cleaned['Price']

print("\nX and y variables have been updated with cleaned data.")

In [None]:
# Inspect the cleaned data: df_cleaned is the outlier-filtered frame that
# X and y are built from, so it is the one that describes the modelling data.
df_cleaned.info()
df_cleaned.describe()

In [24]:
#Splitting into train and test
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(X, y, random_state=42, test_size=0.20)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set contains {X_train.shape[0]} samples.")
print(f"Testing set contains {X_test.shape[0]} samples.")

In [None]:
# Log-transform the training target with log1p, i.e. log(1 + y).
y_train_log = np.log1p(y_train)

# Plot mileage against the transformed target on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_train, y_train_log)
ax.set_title('Transformed Data: Mileage vs. Log of Price')
ax.set_xlabel('Mileage')
ax.set_ylabel('Log of Price')
plt.show()

In [27]:
# Training a Linear Regression model on the transformed data
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression of the log-transformed price on mileage.
# fit() returns the estimator itself, so construction and fitting can be chained.
log_model = LinearRegression().fit(X_train, y_train_log)

print("Linear Regression model has been trained on the log-transformed data.")

In [None]:
# Predict on the held-out features in log space, then invert the log1p
# transform with expm1 to get predictions on the original price scale.
log_predictions = log_model.predict(X_test)
price_predictions = np.expm1(log_predictions)

print("Predictions made on the test set and converted back to the original price scale.")
print("\nFirst 5 Predicted Prices:", price_predictions[:5], sep="\n")

In [31]:
#Evaluate the model using standard metrics
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Score the back-transformed predictions against the true test prices.
r2 = r2_score(y_test, price_predictions)

# RMSE is the square root of the MSE, so it is in the same units as the price.
mse = mean_squared_error(y_test, price_predictions)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"R2 Score: {r2:.3f}")

In [None]:
# Actual test points with the fitted curve overlaid on the same Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_test, y_test, label='Actual Test Data', s=20, alpha=0.6)

# Evaluate the model on 200 evenly spaced mileages spanning the cleaned data,
# then map the log-space predictions back to prices with expm1.
mileage_min = X['Mileage'].min()
mileage_max = X['Mileage'].max()
X_range = pd.DataFrame({'Mileage': np.linspace(mileage_min, mileage_max, 200)})
log_range_pred = log_model.predict(X_range)
price_range_pred = np.expm1(log_range_pred)

ax.plot(X_range, price_range_pred, color='red', linewidth=3, label='Logarithmic Regression Curve')

ax.set_title('Logarithmic Regression Model Fit')
ax.set_xlabel('Mileage')
ax.set_ylabel('Price')
ax.legend()
plt.show()

In [None]:
# Predict the price for a user-supplied mileage value.

raw_mileage = input("Enter the mileage of the car: ")

# Parse the text the same way the training column was cleaned (commas removed),
# and fail with a clear message instead of a bare float() error.
try:
    new_mileage = float(raw_mileage.strip().replace(',', ''))
except ValueError:
    raise ValueError(f"Mileage must be a number, got {raw_mileage!r}") from None

# float() also accepts 'nan' and 'inf'; those and negative values are not
# meaningful mileages, so reject them before they reach the model.
if not np.isfinite(new_mileage) or new_mileage < 0:
    raise ValueError(f"Mileage must be a finite, non-negative number, got {raw_mileage!r}")

# Single-row frame with the same column name the model was fitted on.
new_car_data = pd.DataFrame({'Mileage': [new_mileage]})

# Predict in log space.
log_price_prediction = log_model.predict(new_car_data)

# Invert the log1p transform to get the price on its original scale.
predicted_price = np.expm1(log_price_prediction)

print(f"The predicted price for a car with {new_mileage} mileage is: ${predicted_price[0]:,.2f}")

In [36]:
import joblib

In [None]:
# Persist the fitted estimator to 'model.pkl' (a relative path).
# NOTE: log_model was fitted on np.log1p of the price, so code that loads this
# file must apply np.expm1 to its predictions to recover prices.
joblib.dump(log_model, 'model.pkl')
print("Model saved as model.pkl")