In [2]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [4]:
 # Load the 2024 dataset
dataset = pd.read_csv("dataset.csv")

# Initial exploration
dataset.head()


Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [7]:

# Splitting into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], 
                                                    dataset.iloc[:, -1], 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [9]:
# Data Preprocessing: Cleaning steps (dropping columns, filling missing values, converting types) as done previously
X_train = X_train.iloc[:, 1:]  # Removing 'Unnamed: 0' index column
X_test = X_test.iloc[:, 1:]

In [11]:
# Extract manufacturer from 'Name' column
make_train = X_train["Name"].str.split(" ", expand = True)
make_test = X_test["Name"].str.split(" ", expand = True)
X_train["Manufacturer"] = make_train[0]
X_test["Manufacturer"] = make_test[0]
X_train.drop("Name", axis = 1, inplace = True)
X_test.drop("Name", axis = 1, inplace = True)


In [13]:

# Removing the 'Location' column as it is irrelevant
X_train.drop("Location", axis = 1, inplace = True)
X_test.drop("Location", axis = 1, inplace = True)

In [15]:
# Handling 'Year' column to represent the car's age
curr_time = datetime.datetime.now()
X_train['Year'] = X_train['Year'].apply(lambda x : curr_time.year - x)
X_test['Year'] = X_test['Year'].apply(lambda x : curr_time.year - x)

In [17]:

# Handle categorical columns (Fuel_Type, Transmission, Owner_Type)
X_train = pd.get_dummies(X_train, columns=["Manufacturer", "Fuel_Type", "Transmission", "Owner_Type"], drop_first=True)
X_test = pd.get_dummies(X_test, columns=["Manufacturer", "Fuel_Type", "Transmission", "Owner_Type"], drop_first=True)

In [19]:
# Handling missing columns between train and test set
missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('dataset.csv')

# Inspect the dataset for anomalies
print(df.head())  # Ensure columns are correctly aligned and data types are valid

# Define a function to clean and extract Mileage values
def clean_Mileage(value):
    """
    Cleans and extracts numeric Mileage values from strings.
    Handles cases where Mileage is specified in 'kmpl' or 'km/kg'.
    Returns None for invalid or non-numeric entries.
    """
    try:
        if isinstance(value, str):  # Check if value is a string
            if 'kmpl' in value:
                return float(value.replace(' kmpl', ''))
            elif 'km/kg' in value:
                return float(value.replace(' km/kg', ''))
        return None  # Return None for non-matching or invalid values
    except Exception as e:
        print(f"Error processing value '{value}': {e}")
        return None

# Apply the cleaning function to the Mileage column
if 'Mileage' in df.columns:
    df['Mileage'] = df['Mileage'].apply(clean_Mileage)
else:
    raise ValueError("Mileage column not found in the dataset.")

# Drop rows with invalid or missing Mileage values
df = df.dropna(subset=['Mileage'])

# Ensure the target column is present
target_column = 'Price'  # Replace with your actual target column name
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset. Check your dataset.")

# Separate features and target variable
X = df.drop(target_column, axis=1)  # Drop target column to isolate features
y = df[target_column]  # Target variable

# Handle categorical features (convert to numeric using one-hot encoding)
categorical_columns = X.select_dtypes(include=['object']).columns
if not categorical_columns.empty:
    X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the feature data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Confirm successful processing
print("Data preprocessing completed successfully!")
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


   Unnamed: 0                              Name    Location  Year  \
0           0            Maruti Wagon R LXI CNG      Mumbai  2010   
1           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
2           2                      Honda Jazz V     Chennai  2011   
3           3                 Maruti Ertiga VDI     Chennai  2012   
4           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   

   Kilometers_Driven Fuel_Type Transmission Owner_Type     Mileage   Engine  \
0              72000       CNG       Manual      First  26.6 km/kg   998 CC   
1              41000    Diesel       Manual      First  19.67 kmpl  1582 CC   
2              46000    Petrol       Manual      First   18.2 kmpl  1199 CC   
3              87000    Diesel       Manual      First  20.77 kmpl  1248 CC   
4              40670    Diesel    Automatic     Second   15.2 kmpl  1968 CC   

       Power  Seats  New_Price  Price  
0  58.16 bhp    5.0        NaN   1.75  
1  126.2 bhp    5.0        NaN

# training model and evaluation

In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# Impute missing values in features
imputer = SimpleImputer(strategy='mean')  # Use mean to replace NaN values
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Linear Regression Model
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred_lr = linearRegression.predict(X_test)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression R2 score: {r2_lr}")

# Random Forest Regressor Model
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest R2 score: {r2_rf}")



Linear Regression R2 score: -2.2284504132231393
Random Forest R2 score: -8.0748125


# model accurracy

In [22]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib
from sklearn.model_selection import train_test_split

# Load the dataset
dataset = pd.read_csv("car predict 2025 assum.csv")  # Replace with the correct path

# Check for non-numeric data and inspect columns
print(dataset.head())  # View the first few rows to check the data

# Remove any non-numeric columns that aren't useful for prediction (e.g., 'Brand', 'Model')
dataset.drop(['Brand', 'Model'], axis=1, inplace=True)  # Drop 'Brand' and 'Model' columns

# Apply necessary preprocessing steps
dataset['Year'] = dataset['Year'].apply(lambda x: 2025 - x)  # Adjusting the year for prediction

# Handle Fuel_Type by encoding it (it’s a categorical feature)
make_train = dataset["Fuel_Type"].str.split(" ", expand=True)
dataset["Fuel_Type"] = make_train[0]  # Simplify or handle accordingly

# Apply one-hot encoding for categorical columns like 'Fuel_Type' and 'Transmission'
dataset = pd.get_dummies(dataset, columns=["Fuel_Type", "Transmission"], drop_first=True)

# Check again for non-numeric columns after transformation
print(dataset.head())  # View the dataset after encoding

# Define features (X) and target (y)
X = dataset.drop("Price", axis=1)  # All columns except 'Price'
y = dataset['Price']  # The target variable is 'Price'

# Ensure all columns are numeric
print(X.dtypes)  # Ensure no non-numeric columns are present

# Convert any remaining non-numeric columns if they exist (e.g., 'Fuel_Type_Petrol' or 'Transmission_Manual')
# This is usually handled by pd.get_dummies(), but if there are any unexpected string columns, handle them

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Save the trained Random Forest model
joblib.dump(rf, 'rf_model.pkl')

# Save the StandardScaler used for scaling (if necessary)
standardScaler = StandardScaler()
X_train_scaled = standardScaler.fit_transform(X_train)
joblib.dump(standardScaler, 'scaler.pkl')

# Evaluate the model (optional)
print("Model accuracy on test set:", rf.score(X_test, y_test))

    Brand     Model  Year  Mileage Fuel_Type Transmission  Price
0  Toyota   Corolla  2025     15.5    Petrol    Automatic  25000
1   Honda     Civic  2025     12.0    Diesel       Manual  28000
2    Ford     Focus  2025     13.0    Petrol       Manual  23000
3     BMW  3 Series  2025     10.5    Diesel    Automatic  40000
4    Audi        A4  2025     11.5    Petrol    Automatic  35000
   Year  Mileage  Price  Fuel_Type_Petrol  Transmission_Manual
0     0     15.5  25000              True                False
1     0     12.0  28000             False                 True
2     0     13.0  23000              True                 True
3     0     10.5  40000             False                False
4     0     11.5  35000              True                False
Year                     int64
Mileage                float64
Fuel_Type_Petrol          bool
Transmission_Manual       bool
dtype: object
Model accuracy on test set: -9.803025


# convert to numeric value

In [48]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the dataset from CSV file
df = pd.read_csv('dataset.csv')

# Clean the 'Mileage' column: remove non-numeric characters (e.g., ' km/kg')
df['Mileage'] = df['Mileage'].replace(r'\D', '', regex=True).astype(float)
# Now you can apply StandardScaler
scaler = StandardScaler()

# Fit the scaler on the 'Mileage' data
scaler.fit(df[['Mileage']])

# If you want to scale the dataset for prediction, for example, for X_2024
X_2024 = df[['Mileage']]  # Replace with your actual dataset columns
X_2024_scaled = scaler.transform(X_2024)

# Print the scaled data
print(X_2024_scaled)
print(df)


[[-0.68556048]
 [ 1.29098365]
 [-0.78316759]
 ...
 [-0.83197115]
 [-0.77503367]
 [ 1.96145159]]
      Unnamed: 0                              Name    Location  Year  \
0              0            Maruti Wagon R LXI CNG      Mumbai  2010   
1              1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
2              2                      Honda Jazz V     Chennai  2011   
3              3                 Maruti Ertiga VDI     Chennai  2012   
4              4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
...          ...                               ...         ...   ...   
6014        6014                  Maruti Swift VDI       Delhi  2014   
6015        6015          Hyundai Xcent 1.1 CRDi S      Jaipur  2015   
6016        6016             Mahindra Xylo D4 BSIV      Jaipur  2012   
6017        6017                Maruti Wagon R VXI     Kolkata  2013   
6018        6018             Chevrolet Beat Diesel   Hyderabad  2011   

      Kilometers_Driven Fuel_Type Trans

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Example: Train a Random Forest model
rf = RandomForestRegressor()
rf.fit(X_train, y_train)  # X_train, y_train are your training features and target labels

# Now, save the feature names
model_features = rf.feature_names_in_
joblib.dump(model_features, 'model_features.pkl')


['model_features.pkl']

# predict f0r 2025 


In [42]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Load the trained model and scaler
rf = joblib.load('rf_model.pkl')  # Load the Random Forest model
standardScaler = joblib.load('scaler.pkl')  # Load the StandardScaler if used

# Load the dataset for 2024 and 2025 (assuming you have datasets containing 2024 and 2025 data)
dataset_2024 = pd.read_csv("dataset.csv")  # Replace with actual path to the 2024 dataset
dataset_2025 = pd.read_csv("car predict 2025 assum.csv")  # Corrected filename

# Print column names to inspect the dataset
print("Columns in dataset_2024:", dataset_2024.columns)
print("Columns in dataset_2025:", dataset_2025.columns)

# Drop non-numeric columns like 'Brand' and 'Model' if they exist in both datasets
columns_to_drop = ['Brand', 'Model']
for col in columns_to_drop:
    if col in dataset_2024.columns:
        dataset_2024 = dataset_2024.drop(col, axis=1)  # Drop column and reassign to avoid chained assignment
    if col in dataset_2025.columns:
        dataset_2025 = dataset_2025.drop(col, axis=1)  # Drop column and reassign to avoid chained assignment

# Clean the 'Fuel_Efficiency' column (or any other similar column)
for col in ['Fuel_Efficiency']:  # Replace with the actual column name causing the issue
    if col in dataset_2024.columns:
        # Convert to string and clean, handling any non-numeric characters
        dataset_2024[col] = dataset_2024[col].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        dataset_2024[col] = pd.to_numeric(dataset_2024[col], errors='coerce')
        dataset_2024[col] = dataset_2024[col].fillna(dataset_2024[col].mean())  # Fill NaNs with the mean value

    if col in dataset_2025.columns:
        dataset_2025[col] = dataset_2025[col].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        dataset_2025[col] = pd.to_numeric(dataset_2025[col], errors='coerce')
        dataset_2025[col] = dataset_2025[col].fillna(dataset_2025[col].mean())  # Fill NaNs with the mean value

# Clean the 'Mileage' column in both datasets
for col in ['Mileage']:
    if col in dataset_2024.columns:
        dataset_2024[col] = dataset_2024[col].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        dataset_2024[col] = pd.to_numeric(dataset_2024[col], errors='coerce')
        dataset_2024[col] = dataset_2024[col].fillna(dataset_2024[col].mean())  # Fill NaNs with the mean value
    if col in dataset_2025.columns:
        dataset_2025[col] = dataset_2025[col].astype(str).str.replace(r'[^0-9.]', '', regex=True)
        dataset_2025[col] = pd.to_numeric(dataset_2025[col], errors='coerce')
        dataset_2025[col] = dataset_2025[col].fillna(dataset_2025[col].mean())  # Fill NaNs with the mean value

# Adjust the 'Year' column for prediction
dataset_2024['Year'] = dataset_2024['Year'].apply(lambda x: 2025 - x)  # Adjusting the year for prediction
dataset_2025['Year'] = dataset_2025['Year'].apply(lambda x: 2025 - x)  # Adjusting the year for prediction

# Process 'Fuel_Type' column (split if needed)
for dataset in [dataset_2024, dataset_2025]:
    make_train = dataset["Fuel_Type"].str.split(" ", expand=True)
    dataset["Fuel_Type"] = make_train[0]

# Apply one-hot encoding for 'Fuel_Type' and 'Transmission' for both 2024 and 2025
dataset_2024 = pd.get_dummies(dataset_2024, columns=["Fuel_Type", "Transmission"], drop_first=True)
dataset_2025 = pd.get_dummies(dataset_2025, columns=["Fuel_Type", "Transmission"], drop_first=True)

# Ensure the columns are in the same order for both datasets
model_features = joblib.load('model_features.pkl')  # Load the saved feature names

# Align the datasets with the same feature columns
dataset_2024 = dataset_2024[model_features]
dataset_2025 = dataset_2025[model_features]

# Define features (X) for 2024 and 2025 data (without the target column 'Price')
X_2024 = dataset_2024
X_2025 = dataset_2025

# Scale the features using the pre-fitted scaler
X_2024_scaled = standardScaler.transform(X_2024)
X_2025_scaled = standardScaler.transform(X_2025)

# Ensure that X_2024 and X_2025 have feature names
X_2024_scaled = pd.DataFrame(X_2024_scaled, columns=X_2024.columns)
X_2025_scaled = pd.DataFrame(X_2025_scaled, columns=X_2025.columns)

# Predict the prices for the 2024 and 2025 data
predicted_prices_2024 = rf.predict(X_2024_scaled)
predicted_prices_2025 = rf.predict(X_2025_scaled)

# Compare predicted prices for both years
comparison_2024 = pd.DataFrame({
    'Year': [2024] * len(predicted_prices_2024),
    'Predicted Price': predicted_prices_2024
})

comparison_2025 = pd.DataFrame({
    'Year': [2025] * len(predicted_prices_2025),
    'Predicted Price': predicted_prices_2025
})

# Combine both years into one DataFrame for comparison
comparison = pd.concat([comparison_2024, comparison_2025])

# Save the comparison results to a CSV file
comparison.to_csv('price_comparison_2024_2025.csv', index=False)

# Display the comparison for both years
print("Price Comparison for 2024 and 2025:")
print(comparison.head())  # Display a few rows of the comparison


Columns in dataset_2024: Index(['Unnamed: 0', 'Name', 'Location', 'Year', 'Kilometers_Driven',
       'Fuel_Type', 'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power',
       'Seats', 'New_Price', 'Price'],
      dtype='object')
Columns in dataset_2025: Index(['Brand', 'Model', 'Year', 'Mileage', 'Fuel_Type', 'Transmission',
       'Price'],
      dtype='object')
Price Comparison for 2024 and 2025:
   Year  Predicted Price
0  2024          42560.0
1  2024          42560.0
2  2024          36070.0
3  2024          42560.0
4  2024          42640.0


# comparsion in price difference of 2024 and 2025 year

In [44]:
# Merge the two DataFrames using their index
merged_comparison = pd.merge(comparison_2024, comparison_2025, left_index=True, right_index=True, suffixes=('_2024', '_2025'))

# Calculate the difference in prices between 2025 and 2024
merged_comparison['Price Difference'] = merged_comparison['Predicted Price_2025'] - merged_comparison['Predicted Price_2024']

# Calculate the percentage rise in 2025 car price
merged_comparison['Percentage Rise'] = ((merged_comparison['Predicted Price_2025'] - merged_comparison['Predicted Price_2024']) / merged_comparison['Predicted Price_2024']) * 100

# Display the DataFrame with Percentage Rise
print(merged_comparison[['Predicted Price_2024', 'Predicted Price_2025', 'Price Difference', 'Percentage Rise']])
comparison.to_csv('difference in 2024 and 2025.csv', index=False)



   Predicted Price_2024  Predicted Price_2025  Price Difference  \
0               42560.0               36730.0           -5830.0   
1               42560.0               42560.0               0.0   
2               36070.0               36070.0               0.0   
3               42560.0               42640.0              80.0   
4               42640.0               36730.0           -5910.0   
5               42560.0               42640.0              80.0   
6               42560.0               36070.0           -6490.0   
7               42640.0               36730.0           -5910.0   
8               42560.0               42560.0               0.0   
9               42560.0               36730.0           -5830.0   

   Percentage Rise  
0       -13.698308  
1         0.000000  
2         0.000000  
3         0.187970  
4       -13.860225  
5         0.187970  
6       -15.249060  
7       -13.860225  
8         0.000000  
9       -13.698308  
