In [13]:
import pandas as pd
import numpy as np

In [4]:
# Seed NumPy's global random generator so the np.random draws below are repeatable.
np.random.seed(42)

In [5]:
# Number of synthetic car records to generate.
n_samples = 10_000

In [6]:
def _pick(options):
    """Draw n_samples values uniformly at random, with replacement, from options."""
    return np.random.choice(options, n_samples)


# Synthetic feature columns; every value is an array of length n_samples.
# The random draws run top to bottom, so reordering the entries would change
# the data generated for a given seed.
data = {
    'Make': _pick(['Toyota', 'Ford', 'BMW', 'Honda', 'Audi']),
    'Model': _pick(['Model A', 'Model B', 'Model C', 'Model D', 'Model E']),
    'Year': _pick(range(2000, 2022)),  # 2000 through 2021
    'Engine Size': np.round(np.random.uniform(1.0, 4.5, n_samples), 1),  # one decimal place
    'Mileage': np.random.randint(0, 200000, n_samples),  # upper bound is exclusive
    'Fuel Type': _pick(['Petrol', 'Diesel', 'Electric']),
    'Transmission': _pick(['Automatic', 'Manual']),
}

In [7]:
# Build the target as a linear combination of the features plus Gaussian noise.
base_price = 20000
age_discount = (2022 - data['Year']) * -500            # older cars are cheaper
engine_premium = data['Engine Size'] * 2000            # bigger engines cost more
mileage_premium = (200000 - data['Mileage']) * 0.05    # lower mileage raises the price
price_noise = np.random.normal(0, 2000, n_samples)     # mean 0, standard deviation 2000

# Terms are summed left to right in the same order as the original expression
# so the floating-point result is unchanged.
data['Price'] = base_price + age_discount + engine_premium + mileage_premium + price_noise

In [9]:
# Assemble the generated columns (including 'Price') into a DataFrame.
df = pd.DataFrame(data)

In [11]:
# Write the synthetic dataset to a CSV file; index=False omits the row index column.
df.to_csv('car_sales_data.csv', index=False)

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

# Repository of the car-price dataset on the Hugging Face Hub.
DATASET_REPO = "VarunKumarGupta2003/Car-Price-Dataset"

# Download (or reuse the cached copy of) the dataset and bind it to `ds`.
ds = load_dataset(DATASET_REPO)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

car_sales_data.csv:   0%|          | 0.00/636k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
# Show the loaded dataset object (splits, feature names and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['Make', 'Model', 'Year', 'Engine Size', 'Mileage', 'Fuel Type', 'Transmission', 'Price'],
        num_rows: 10000
    })
})

In [14]:
# Convert the 'train' split to a pandas DataFrame with the dataset's own
# converter rather than having pd.DataFrame iterate over it row by row.
df = ds['train'].to_pandas()

In [15]:
# Display the frame; pandas truncates the rendering of long frames.
df

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015,1.3,156577,Diesel,Manual,19240.996903
1,Audi,Model D,2017,1.1,192529,Electric,Manual,20281.779257
2,BMW,Model E,2000,3.7,156962,Electric,Automatic,19178.745634
3,Audi,Model D,2007,2.9,183057,Petrol,Manual,22090.033440
4,Audi,Model D,2000,3.4,108269,Petrol,Manual,21124.237648
...,...,...,...,...,...,...,...,...
9995,Ford,Model D,2008,2.4,46784,Electric,Manual,26642.475776
9996,BMW,Model D,2002,3.2,6621,Petrol,Manual,27486.488777
9997,Ford,Model B,2011,3.7,61037,Diesel,Manual,27132.213762
9998,Honda,Model B,2019,3.1,190175,Electric,Manual,23925.403607


In [16]:
# Split the frame into the regression target and the feature matrix.
target_column = 'Price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [24]:
# Hold out 20% of the rows as a test set; random_state pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
# Column groups that are routed to different preprocessing steps.
categorical_features = [
    'Make',
    'Model',
    'Fuel Type',
    'Transmission',
]
numeric_features = [
    'Year',
    'Engine Size',
    'Mileage',
]

In [29]:

# Preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones inside a single ColumnTransformer.
# NOTE(review): OneHotEncoder() is used with its default settings, so a
# category not seen during fit raises at transform time - confirm that is
# the intended behaviour for new inputs.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only, then apply the fitted transformer to the
# test split so no test-set statistics leak into the preprocessing.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [33]:
# Label the transformed training matrix with the transformer's output
# feature names so it can be inspected as a table.
feature_names = preprocessor.get_feature_names_out()
new = pd.DataFrame(X_train_preprocessed, columns=feature_names)
new

Unnamed: 0,num__Year,num__Engine Size,num__Mileage,cat__Make_Audi,cat__Make_BMW,cat__Make_Ford,cat__Make_Honda,cat__Make_Toyota,cat__Model_Model A,cat__Model_Model B,cat__Model_Model C,cat__Model_Model D,cat__Model_Model E,cat__Fuel Type_Diesel,cat__Fuel Type_Electric,cat__Fuel Type_Petrol,cat__Transmission_Automatic,cat__Transmission_Manual
0,1.016739,0.033763,0.974850,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-0.879904,1.320916,0.470128,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,-0.089636,1.419927,-0.343907,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,1.174793,1.419927,-1.632382,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.068417,1.122892,1.388986,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.542578,-0.362284,-1.224830,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
7996,0.858686,1.419927,-1.419791,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
7997,-1.037958,-0.659319,1.450480,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7998,-1.512119,0.726845,0.543600,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [34]:
# Import the one estimator that is used explicitly; the previous wildcard
# import pulled every public name of sklearn.linear_model into the notebook
# namespace and hid where LinearRegression came from.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR

# Create and train the model on the preprocessed training features.
model = LinearRegression()
model.fit(X_train_preprocessed, y_train)

In [35]:
# Predict prices for the held-out, already preprocessed test features.
y_pred = model.predict(X_test_preprocessed)

In [37]:
# Score the test-set predictions: mean squared error and R^2.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 3980100.6966516245
R^2 Score: 0.850752246107692


In [38]:
import numpy as np

# A single example car, described with the same column names as the training features.
sample_car = {
    'Make': 'Toyota',
    'Model': 'Model C',
    'Year': 2015,
    'Engine Size': 2.5,
    'Mileage': 50000,
    'Fuel Type': 'Petrol',
    'Transmission': 'Automatic',
}

# Define the input data: wrap every value in a one-element list so the dict
# maps each column name to a list of values.
input_data = {column: [value] for column, value in sample_car.items()}

In [40]:
# Echo the input dictionary to check its contents.
input_data

{'Make': ['Toyota'],
 'Model': ['Model C'],
 'Year': [2015],
 'Engine Size': [2.5],
 'Mileage': [50000],
 'Fuel Type': ['Petrol'],
 'Transmission': ['Automatic']}

In [43]:
# Turn the column -> values mapping into a DataFrame and display it.
input_df = pd.DataFrame.from_dict(input_data)
input_df

Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission
0,Toyota,Model C,2015,2.5,50000,Petrol,Automatic


In [44]:
# Apply the already fitted preprocessor to the new example (transform only, no refit),
# then display the resulting feature array.
input_preprocessed = preprocessor.transform(input_df)
input_preprocessed

array([[ 0.70063206, -0.26327221, -0.84611236,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  1.        ,  0.        ]])

In [45]:
# Predict the price for the preprocessed example with the locally trained model.
predicted_price = model.predict(input_preprocessed)

# Display the predicted price: take the first element of the prediction array and
# format it with thousands separators and two decimals.
print(f"Predicted Price: ${predicted_price[0]:,.2f}")

Predicted Price: $29,154.69


In [46]:
import joblib

# The model was trained on preprocessed features, so it can only score inputs
# that went through the same fitted preprocessor. Persist the preprocessor
# next to the model so the saved artifacts are usable outside this session.
joblib.dump(preprocessor, 'Car-Price-Prediction_preprocessor.joblib')

# Kept as the last expression so the cell still reports the model file name.
joblib.dump(model, 'Car-Price-Prediction_model.joblib')

['Car-Price-Prediction_model.joblib']

In [47]:
from huggingface_hub import notebook_login
# Start the interactive Hugging Face Hub login for this notebook session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
from huggingface_hub import hf_hub_download
import joblib

# Hub repository and file name of the serialized model.
repo_id = "VarunKumarGupta2003/Car-Sale-Prediction"
filename = "Car-Price-Prediction_model.joblib"

# Download the model file, then deserialize it.
# NOTE(security): joblib.load is pickle-based, so loading a file can execute
# arbitrary code - only load artifacts from repositories you trust.
model_path = hf_hub_download(repo_id, filename)
Model = joblib.load(model_path)

Car-Price-Prediction_model.joblib:   0%|          | 0.00/840 [00:00<?, ?B/s]

In [49]:
# Display the object loaded from the downloaded model file.
Model

In [50]:
# Predict with the downloaded model, reusing the example that was preprocessed
# earlier in this session (input_preprocessed).
predicted_price = Model.predict(input_preprocessed)

# Display the predicted price, formatted with thousands separators and two decimals.
print(f"Predicted Price: ${predicted_price[0]:,.2f}")

Predicted Price: $29,154.69
