In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the used-car listings from CSV.
# The path is relative, so it resolves against the kernel's current working directory.
df = pd.read_csv("Downloads/used_car.csv")

In [3]:
# Flag every row that exactly repeats an earlier row (all columns compared)
duplicates = df.duplicated()

# Keep only the rows that were not flagged
df_no_duplicates = df.loc[~duplicates]

In [4]:
# Flag rows that repeat an earlier row on the listed key columns
duplicates = df.duplicated(
    subset=[
        'car_name',
        'car_price_in_rupees',
        'kms_driven',
        'fuel_type',
        'city',
        'year_of_manufacture',
    ]
)

# Keep only the rows that were not flagged.
# NOTE(review): the later cells visible in this notebook keep working on `df`,
# not on `df_no_duplicates`, so this de-duplicated frame is never used downstream.
# Confirm whether the modelling was meant to run on it.
df_no_duplicates = df.loc[~duplicates]

In [10]:
# Count the missing entries in each column
missing_values = df.isna().sum()

In [11]:
# Heading and per-column counts, one per line
print("Missing Values:", missing_values, sep="\n")

Missing Values:
car_name               0
car_price_in_rupees    0
kms_driven             0
fuel_type              0
city                   0
year_of_manufacture    0
dtype: int64


In [12]:
# Missing entries per column as a percentage of the total row count
missing_percentage = df.isnull().sum().div(len(df)).mul(100)
print("\nMissing Percentage:", missing_percentage, sep="\n")


Missing Percentage:
car_name               0.0
car_price_in_rupees    0.0
kms_driven             0.0
fuel_type              0.0
city                   0.0
year_of_manufacture    0.0
dtype: float64


In [13]:
# Create a LabelEncoder (maps each distinct label to an integer code)
label_encoder = LabelEncoder()

In [16]:
# Apply label encoding to categorical columns.
# A fresh LabelEncoder is fitted for each column and kept in `label_encoders`:
# refitting one shared encoder would overwrite the previous column's class
# mapping, making it impossible to decode (or consistently re-encode) that
# column later. `label_encoder` still ends up bound to the encoder fitted on
# the last column, as it did before.
categorical_columns = ['fuel_type', 'city']
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column + '_encoded'] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [17]:
# Display the DataFrame, which now also carries the fuel_type_encoded and city_encoded columns
print(df)

                                              car_name car_price_in_rupees  \
0    Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...         ₹ 4.45 Lakh   
1                           Maruti Suzuki Alto 800 Lxi         ₹ 2.93 Lakh   
2                              Tata Safari XZ Plus New        ₹ 22.49 Lakh   
3                              Maruti Suzuki Ciaz ZXI+         ₹ 6.95 Lakh   
4       Jeep Compass Sport Plus 1.4 Petrol [2019-2020]           ₹ 12 Lakh   
..                                                 ...                 ...   
941              Hyundai Verna Fluidic 1.6 VTVT SX Opt         ₹ 6.49 Lakh   
942                 Ford EcoSport Ambiente 1.5L Ti-VCT         ₹ 6.25 Lakh   
943     Land Rover Discovery Sport SE R-Dynamic Petrol           ₹ 64 Lakh   
944                         Ford Figo Titanium1.5 TDCi          ₹ 3.6 Lakh   
945                         Maruti Suzuki Ciaz VXi+ AT         ₹ 6.75 Lakh   

    kms_driven fuel_type       city  year_of_manufacture  fuel_

In [18]:
def convert_price_to_numeric(price_str):
    """Convert a price label such as '₹ 4.45 Lakh' into a rupee amount.

    The currency symbol, thousands separators and surrounding whitespace are
    removed, the leading number is parsed, and the result is scaled by the
    unit word found anywhere in the label (case-insensitive):
    'lakh' -> x 100,000, 'crore' -> x 10,000,000, 'thousand' -> x 1,000.
    A label without any of these words is returned as the plain number.

    Parameters
    ----------
    price_str : str
        Price label, e.g. '₹ 4.45 Lakh', '₹ 1.2 Crore' or '₹ 95,000'.

    Returns
    -------
    float
        The price expressed in rupees.

    Raises
    ------
    TypeError
        If ``price_str`` is not a string.
    ValueError
        If the label contains no number, or its first token is not numeric.
    """
    if not isinstance(price_str, str):
        raise TypeError(
            f"price label must be a string, got {type(price_str).__name__}"
        )

    # Remove the currency symbol by value rather than by position, so a label
    # without a leading '₹' (or with leading whitespace) does not lose a digit.
    cleaned = price_str.replace('₹', '').replace(',', '').strip()
    tokens = cleaned.split()
    if not tokens:
        raise ValueError(f"no numeric value found in price label {price_str!r}")
    value = float(tokens[0])

    # Units are checked in the same order as before: lakh, crore, thousand.
    unit_text = cleaned.lower()
    for unit, multiplier in (('lakh', 100000), ('crore', 10000000), ('thousand', 1000)):
        if unit in unit_text:
            return value * multiplier
    return value

In [19]:
# Parse every price label into a rupee amount and store it as a new column
df['numeric_price'] = df['car_price_in_rupees'].map(convert_price_to_numeric)

In [20]:
# Display the frame to check the new numeric_price column against the original price text
df

Unnamed: 0,car_name,car_price_in_rupees,kms_driven,fuel_type,city,year_of_manufacture,fuel_type_encoded,city_encoded,numeric_price
0,Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...,₹ 4.45 Lakh,"22,402 km",Petrol,Mumbai,2016,6,9,445000.0
1,Maruti Suzuki Alto 800 Lxi,₹ 2.93 Lakh,"10,344 km",Petrol,Kolkata,2019,6,8,293000.0
2,Tata Safari XZ Plus New,₹ 22.49 Lakh,"12,999 km",Diesel,Bangalore,2021,1,2,2249000.0
3,Maruti Suzuki Ciaz ZXI+,₹ 6.95 Lakh,"45,000 km",Petrol,Thane,2016,6,14,695000.0
4,Jeep Compass Sport Plus 1.4 Petrol [2019-2020],₹ 12 Lakh,"11,193 km",Petrol,Kolkata,2019,6,8,1200000.0
...,...,...,...,...,...,...,...,...,...
941,Hyundai Verna Fluidic 1.6 VTVT SX Opt,₹ 6.49 Lakh,"40,493 km",Petrol,Bangalore,2015,6,2,649000.0
942,Ford EcoSport Ambiente 1.5L Ti-VCT,₹ 6.25 Lakh,"30,723 km",Petrol,Noida,2017,6,10,625000.0
943,Land Rover Discovery Sport SE R-Dynamic Petrol,₹ 64 Lakh,"8,500 km",Petrol,Ahmedabad,2021,6,0,6400000.0
944,Ford Figo Titanium1.5 TDCi,₹ 3.6 Lakh,"42,158 km",Diesel,Kolkata,2015,1,8,360000.0


In [21]:
# Drop the raw price text now that numeric_price holds the parsed value.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
df = df.drop(columns=['car_price_in_rupees'], errors='ignore')

In [22]:
# Print the frame to confirm car_price_in_rupees has been dropped
print(df)

                                              car_name kms_driven fuel_type  \
0    Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...  22,402 km    Petrol   
1                           Maruti Suzuki Alto 800 Lxi  10,344 km    Petrol   
2                              Tata Safari XZ Plus New  12,999 km    Diesel   
3                              Maruti Suzuki Ciaz ZXI+  45,000 km    Petrol   
4       Jeep Compass Sport Plus 1.4 Petrol [2019-2020]  11,193 km    Petrol   
..                                                 ...        ...       ...   
941              Hyundai Verna Fluidic 1.6 VTVT SX Opt  40,493 km    Petrol   
942                 Ford EcoSport Ambiente 1.5L Ti-VCT  30,723 km    Petrol   
943     Land Rover Discovery Sport SE R-Dynamic Petrol   8,500 km    Petrol   
944                         Ford Figo Titanium1.5 TDCi  42,158 km    Diesel   
945                         Maruti Suzuki Ciaz VXi+ AT  64,726 km    Petrol   

          city  year_of_manufacture  fuel_type_enco

In [23]:
# Strip everything except digits and '.' (e.g. '22,402 km' -> '22402') and convert to float.
# The pattern is a raw string: '\d' inside a normal literal is an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
# Series.replace(regex=True) is kept (rather than .str.replace) so the cell can
# be re-run after the column has already become numeric.
df['kms_driven'] = df['kms_driven'].replace(r'[^\d.]', '', regex=True).astype(float)

In [24]:
# Print the frame to confirm kms_driven is now numeric
print(df)

                                              car_name  kms_driven fuel_type  \
0    Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...     22402.0    Petrol   
1                           Maruti Suzuki Alto 800 Lxi     10344.0    Petrol   
2                              Tata Safari XZ Plus New     12999.0    Diesel   
3                              Maruti Suzuki Ciaz ZXI+     45000.0    Petrol   
4       Jeep Compass Sport Plus 1.4 Petrol [2019-2020]     11193.0    Petrol   
..                                                 ...         ...       ...   
941              Hyundai Verna Fluidic 1.6 VTVT SX Opt     40493.0    Petrol   
942                 Ford EcoSport Ambiente 1.5L Ti-VCT     30723.0    Petrol   
943     Land Rover Discovery Sport SE R-Dynamic Petrol      8500.0    Petrol   
944                         Ford Figo Titanium1.5 TDCi     42158.0    Diesel   
945                         Maruti Suzuki Ciaz VXi+ AT     64726.0    Petrol   

          city  year_of_manufacture  fu

In [25]:
# Save the updated DataFrame to a new CSV file.
# The path is relative, so it resolves against the kernel's current working directory;
# index=False leaves the row index out of the file.
new_file_path = 'Downloads/output_file.csv'
df.to_csv(new_file_path, index=False)

In [26]:
# Print the frame as it was written to the CSV file
print(df)

                                              car_name  kms_driven fuel_type  \
0    Hyundai Grand i10 Magna 1.2 Kappa VTVT [2017-2...     22402.0    Petrol   
1                           Maruti Suzuki Alto 800 Lxi     10344.0    Petrol   
2                              Tata Safari XZ Plus New     12999.0    Diesel   
3                              Maruti Suzuki Ciaz ZXI+     45000.0    Petrol   
4       Jeep Compass Sport Plus 1.4 Petrol [2019-2020]     11193.0    Petrol   
..                                                 ...         ...       ...   
941              Hyundai Verna Fluidic 1.6 VTVT SX Opt     40493.0    Petrol   
942                 Ford EcoSport Ambiente 1.5L Ti-VCT     30723.0    Petrol   
943     Land Rover Discovery Sport SE R-Dynamic Petrol      8500.0    Petrol   
944                         Ford Figo Titanium1.5 TDCi     42158.0    Diesel   
945                         Maruti Suzuki Ciaz VXi+ AT     64726.0    Petrol   

          city  year_of_manufacture  fu

In [27]:
# Target: the parsed price. Features: every column left after removing the
# text columns and the target itself.
y = df['numeric_price']
X = df.drop(columns=['car_name', 'fuel_type', 'city', 'numeric_price'])

In [28]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Fit the scaler on the training split only, then apply that same scaling to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
# Linear Regression: fit on the scaled training split, score on the scaled test split
linear_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred_linear = linear_model.predict(X_test_scaled)
r2_linear = r2_score(y_test, y_pred_linear)
mse_linear = mean_squared_error(y_test, y_pred_linear)

In [31]:
# Decision Tree (seeded): fit on the scaled training split, score on the scaled test split
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred_tree = tree_model.predict(X_test_scaled)
r2_tree = r2_score(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)

In [32]:
# Random Forest (seeded): fit on the scaled training split, score on the scaled test split
forest_model = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)
y_pred_forest = forest_model.predict(X_test_scaled)
r2_forest = r2_score(y_test, y_pred_forest)
mse_forest = mean_squared_error(y_test, y_pred_forest)

In [33]:
# Support Vector Regression with default hyperparameters; the target is left unscaled
svr_model = SVR().fit(X_train_scaled, y_train)
y_pred_svr = svr_model.predict(X_test_scaled)
r2_svr = r2_score(y_test, y_pred_svr)
mse_svr = mean_squared_error(y_test, y_pred_svr)

In [34]:
# Ridge Regression (alpha=1.0): fit on the scaled training split, score on the scaled test split
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

In [35]:
# Lasso Regression (alpha=1.0): fit on the scaled training split, score on the scaled test split
lasso_model = Lasso(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

In [36]:
# K-Nearest Neighbors (k=5): fit on the scaled training split, score on the scaled test split
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)
r2_knn = r2_score(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)

In [37]:
# Gather each model's test-set metrics into one summary table (one row per model)
results_df = pd.DataFrame(
    [
        ('Linear Regression', mse_linear, r2_linear),
        ('Decision Trees', mse_tree, r2_tree),
        ('Random Forest', mse_forest, r2_forest),
        ('SVR', mse_svr, r2_svr),
        ('Ridge', mse_ridge, r2_ridge),
        ('Lasso', mse_lasso, r2_lasso),
        ('KNN', mse_knn, r2_knn),
    ],
    columns=['Model', 'Mean Squared Error', 'R-squared'],
)

In [38]:
# Heading followed by the metrics table
print("Results for Regression Models:", results_df, sep="\n")

Results for Regression Models:
               Model  Mean Squared Error  R-squared
0  Linear Regression        2.106493e+12   0.168031
1     Decision Trees        6.341971e+12  -1.504791
2      Random Forest        3.114618e+12  -0.230133
3                SVR        3.085796e+12  -0.218750
4              Ridge        2.106673e+12   0.167959
5              Lasso        2.106493e+12   0.168031
6                KNN        2.401369e+12   0.051568


In [39]:
# Pair the true test prices with each model's predictions.
# Each frame keeps y_test's row index; predictions are attached positionally.
comparison_linear_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (Linear Regression)': y_pred_linear}
)
comparison_tree_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (Decision Trees)': y_pred_tree}
)
comparison_forest_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (Random Forest)': y_pred_forest}
)
comparison_svr_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (SVR)': y_pred_svr}
)
comparison_ridge_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (Ridge)': y_pred_ridge}
)
comparison_lasso_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (Lasso)': y_pred_lasso}
)
comparison_knn_df = y_test.to_frame(name='Actual Price').assign(
    **{'Predicted Price (KNN)': y_pred_knn}
)

In [40]:
# Print each comparison table under its own heading (blank line before every heading)
print("\nComparison for Linear Regression:", comparison_linear_df, sep="\n")
print("\nComparison for Decision Trees:", comparison_tree_df, sep="\n")
print("\nComparison for Random Forest:", comparison_forest_df, sep="\n")
print("\nComparison for SVR:", comparison_svr_df, sep="\n")
print("\nComparison for Ridge Regression:", comparison_ridge_df, sep="\n")
print("\nComparison for Lasso Regression:", comparison_lasso_df, sep="\n")
print("\nComparison for KNN Regression:", comparison_knn_df, sep="\n")


Comparison for Linear Regression:
     Actual Price  Predicted Price (Linear Regression)
327      695000.0                         1.251050e+06
30       775000.0                         2.006428e+06
820      700000.0                         1.448911e+06
404      295000.0                         3.938096e+05
76      1125000.0                         1.163802e+06
..            ...                                  ...
873     1500000.0                         1.045734e+06
567      475000.0                         1.711464e+06
542     1682000.0                         1.523517e+06
377      370000.0                         4.374900e+05
109      875000.0                         1.417681e+06

[190 rows x 2 columns]

Comparison for Decision Trees:
     Actual Price  Predicted Price (Decision Trees)
327      695000.0                          895000.0
30       775000.0                         1045000.0
820      700000.0                          914000.0
404      295000.0                        