# **TASK : CAR PRICE Predictor with ML**

Importing required Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

Loading the dataset from the CSV file

In [2]:
# Location of the car listings CSV; point this at your own copy of the file.
file_path = 'quikr_car.csv'

# Read the whole CSV into a DataFrame that the following cells clean and model.
data = pd.read_csv(filepath_or_buffer=file_path)

Cleaning missing and incorrectly formatted values in the dataset

In [3]:
# Drop every row that has a missing value in any column. This also removes rows
# with a missing Price, so a separate Price-only dropna is not needed.
# (Swap this for an imputation strategy if dropping loses too many rows.)
data = data.dropna()

# Strip thousands separators so a price such as "1,200" becomes the plain
# digit string "1200".
price_text = data['Price'].astype(str).str.replace(',', '', regex=False)

# A Price is kept only if it consists purely of digits; anything else
# (free-text placeholders, decimals, negative numbers) is treated as invalid.
has_numeric_price = price_text.str.isdigit()

# Filter and convert in one step. `assign` returns a new DataFrame, so the
# converted column is never written into a slice of the original frame, which
# is the pattern that triggers pandas' SettingWithCopyWarning.
data = data.loc[has_numeric_price].assign(
    Price=price_text[has_numeric_price].astype('int64')
)

Defining the features and target variable, and building the preprocessing and model pipeline

In [4]:
# Target is the Price column; every other column is used as a feature.
y = data['Price']
X = data.drop(columns=['Price'])

# Split the feature columns by dtype: object columns are treated as
# categorical, numeric columns as numerical.
categorical_features = list(X.select_dtypes(include='object').columns)
numerical_features = list(X.select_dtypes(include='number').columns)

# Numerical columns are standardised.
numerical_transformer = StandardScaler()

# Categorical columns are one-hot encoded; categories not seen during fitting
# are ignored at prediction time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own transformer.
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Random forest regressor with a fixed seed so runs are repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the model into one estimator. It is only defined
# here; fitting and evaluation happen in later cells.
pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
clf = Pipeline(steps=pipeline_steps)

Splitting the data into training and testing sets

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Fitting and Predicting the Model

In [6]:
# Fit the full pipeline on the training split, then predict prices for the
# held-out rows. `fit` returns the fitted pipeline, so the calls can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

Evaluating the Model

In [7]:
# Compare the held-out predictions with the true prices.
# MSE is in squared price units; R^2 is the share of price variance explained.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')

Mean Squared Error: 467827221959.1184
R^2 Score: 0.16338014842562554
