In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('All_cities_cleaned_data.csv', low_memory=False)

In [3]:
# Model Development - Train-Test Split

from sklearn.model_selection import train_test_split

# Define feature matrix (X) and target variable (y)
X = df.drop('Price', axis=1)  # Replace 'target_column' with the name of your target variable
y = df['Price']  # Replace 'target_column' with the name of your target variable

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.preprocessing import OneHotEncoder

# Example of using pd.get_dummies
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Ensure that the dummy columns are aligned between train and test sets
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


In [5]:
# Check the data types of your features
print(X_train.dtypes)


Fuel type                           object
Body type                           object
Transmission                        object
Original Equipment Manufacturer     object
Model                               object
Model Year                           int64
Seats                                int64
Ownership                           object
Mileage                            float64
Color                               object
Tyre Type                           object
dtype: object


In [6]:
# # Model Selection and Training

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define which columns are categorical
categorical_features = ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 
                        'Model', 'Ownership', 'Color', 'Tyre Type']

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough',
    force_int_remainder_cols=False  # Suppress FutureWarning
)

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor())])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Print model evaluation metrics

print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")


MAE: 0.00289009934123204
MSE: 2.668734179692185e-05
R2: 0.9104554485889842


In [7]:
# Check for missing values
print(X_train.isnull().sum())
print(y_train.isnull().sum())


Fuel type                          0
Body type                          0
Transmission                       0
Original Equipment Manufacturer    0
Model                              0
Model Year                         0
Seats                              0
Ownership                          0
Mileage                            0
Color                              0
Tyre Type                          0
dtype: int64
0


In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Define categorical features
categorical_features = ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 
                        'Model', 'Ownership', 'Color', 'Tyre Type']

# Update OneHotEncoder to handle unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Define the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor())])

# Define the parameter grid
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [10, 20, 30]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Output the best parameters
print(f"Best parameters: {grid_search.best_params_}")

# Get the best model
best_model = grid_search.best_estimator_


Best parameters: {'model__max_depth': 20, 'model__n_estimators': 200}


In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define categorical features
categorical_features = ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 
                        'Model', 'Ownership', 'Color', 'Tyre Type']

# Create the ColumnTransformer with OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Define the pipeline with preprocessing and model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor())])

# Define parameter distribution for RandomizedSearchCV
param_dist = {
    'model__n_estimators': randint(50, 201),  # Randomly choose between 50 and 200 trees
    'model__max_depth': randint(10, 31)       # Randomly choose between 10 and 30 for the maximum depth of trees
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled
    cv=3,       # Number of cross-validation folds
    random_state=42,  # Seed for reproducibility
    n_jobs=-1   # Use all available cores
)

# Perform the search
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Print the best parameters and score
print(f"Best parameters: {random_search.best_params_}")
print(f"Best score: {random_search.best_score_}")

# Make predictions with the best model
y_pred_best = best_model.predict(X_test)

# Print evaluation metrics
print(f"MAE: {mean_absolute_error(y_test, y_pred_best)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_best)}")
print(f"R2: {r2_score(y_test, y_pred_best)}")


Best parameters: {'model__max_depth': 24, 'model__n_estimators': 156}
Best score: 0.870670104122695
MAE: 0.002880930849305466
MSE: 2.698276591127428e-05
R2: 0.9094642063739676


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7946 entries, 0 to 7945
Data columns (total 12 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Fuel type                        7946 non-null   object 
 1   Body type                        7946 non-null   object 
 2   Transmission                     7946 non-null   object 
 3   Original Equipment Manufacturer  7946 non-null   object 
 4   Model                            7946 non-null   object 
 5   Model Year                       7946 non-null   int64  
 6   Price                            7946 non-null   float64
 7   Seats                            7946 non-null   int64  
 8   Ownership                        7946 non-null   object 
 9   Mileage                          7946 non-null   float64
 10  Color                            7946 non-null   object 
 11  Tyre Type                        7946 non-null   object 
dtypes: float64(2), int64

In [None]:
# Don't run below code

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd

# Example feature lists
numerical_features = ['Model Year', 'Price', 'Seats', 'Mileage', 'Vehicle_Age']
categorical_features = ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 
                        'Model', 'Ownership', 'Color', 'Tyre Type']

# Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Fit the ColumnTransformer
preprocessor.fit(df)

# Retrieve and print the feature names after transformation
feature_names = preprocessor.get_feature_names_out()
print("Feature names after transformation:", feature_names)


In [12]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Print column names to debug
print("Original Columns in the DataFrame:", df.columns.tolist())

# Clean column names
df.columns = df.columns.str.strip()
print("Cleaned Columns in the DataFrame:", df.columns.tolist())

# Define feature columns
categorical_features = ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 
                        'Model', 'Ownership', 'Color', 'Tyre Type']
numerical_features = ['Model Year', 'Seats', 'Mileage', 'Vehicle_Age']

# Ensure all feature columns exist in DataFrame
missing_categorical_features = [col for col in categorical_features if col not in df.columns]
missing_numerical_features = [col for col in numerical_features if col not in df.columns]
if missing_categorical_features:
    print("Missing categorical features:", missing_categorical_features)
if missing_numerical_features:
    print("Missing numerical features:", missing_numerical_features)

# Remove missing features from the list
numerical_features = [col for col in numerical_features if col in df.columns]
categorical_features = [col for col in categorical_features if col in df.columns]

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Ensure 'Price' column exists
if 'Price' not in df.columns:
    raise ValueError("'Price' column is missing from the DataFrame")

# Prepare data
X = df.drop('Price', axis=1)
y = df['Price']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_preprocessed, y_train)
y_pred_ridge = ridge_model.predict(X_test_preprocessed)

print("Ridge Regression Results:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_ridge)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_ridge)}")
print(f"R2: {r2_score(y_test, y_pred_ridge)}")

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_preprocessed, y_train)
y_pred_lasso = lasso_model.predict(X_test_preprocessed)

print("Lasso Regression Results:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lasso)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lasso)}")
print(f"R2: {r2_score(y_test, y_pred_lasso)}")

# Retrieve and print the feature names after transformation
feature_names = preprocessor.get_feature_names_out()
print("Feature names after transformation:", feature_names)


Original Columns in the DataFrame: ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 'Model', 'Model Year', 'Price', 'Seats', 'Ownership', 'Mileage', 'Color', 'Tyre Type']
Cleaned Columns in the DataFrame: ['Fuel type', 'Body type', 'Transmission', 'Original Equipment Manufacturer', 'Model', 'Model Year', 'Price', 'Seats', 'Ownership', 'Mileage', 'Color', 'Tyre Type']
Missing numerical features: ['Vehicle_Age']
Ridge Regression Results:
MAE: 0.003941835881965429
MSE: 4.76209275854306e-05
R2: 0.8402165854185715
Lasso Regression Results:
MAE: 0.011225171023547899
MSE: 0.0002983683898183758
R2: -0.0011211991370059238
Feature names after transformation: ['num__Model Year' 'num__Seats' 'num__Mileage' 'cat__Fuel type_Cng'
 'cat__Fuel type_Diesel' 'cat__Fuel type_Electric' 'cat__Fuel type_Lpg'
 'cat__Fuel type_Petrol' 'cat__Body type_Convertibles'
 'cat__Body type_Coupe' 'cat__Body type_Hatchback' 'cat__Body type_MUV'
 'cat__Body type_Minivans' 'cat__Body type_Pick

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
import joblib

# Assuming 'categorical_features' is a list of the categorical columns in X_train
categorical_features = ['Fuel type', 'Body type', 'Transmission', 
                        'Original Equipment Manufacturer', 'Model', 
                        'Ownership', 'Color', 'Tyre Type']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), ['Model Year', 'Seats', 'Mileage'])
    ])

# Create a pipeline that first transforms the data, then fits the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Fit the pipeline on the training data
model_pipeline.fit(X_train, y_train)

# Save the pipeline (which includes both the preprocessor and the model)
joblib.dump(model_pipeline, 'best_model.pkl')


['best_model.pkl']

In [16]:
X_train_preprocessed = preprocessor.transform(X_train)
model = RandomForestRegressor()
model.fit(X_train_preprocessed, y_train)
joblib.dump(model, 'best_model.pkl')


['best_model.pkl']

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib

# Define categorical and numerical features
categorical_features = ['Fuel type', 'Body type', 'Transmission', 
                        'Original Equipment Manufacturer', 'Model', 
                        'Ownership', 'Color', 'Tyre Type']
numerical_features = ['Model Year', 'Seats', 'Mileage']

# Create a ColumnTransformer with handle_unknown='ignore' for OneHotEncoder
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor on training data and save it
preprocessor.fit(X_train)
joblib.dump(preprocessor, 'preprocessor.pkl')


['preprocessor.pkl']

In [29]:
# Convert 'Price' to lakhs and remove decimals
df['Price'] = (df['Price'] * 100).astype(int)


In [33]:

# Format 'Price in Lakhs' with commas for thousands separators
df['Price in Lakhs'] = df['Price'].apply(lambda x: f"{x:,.2f}")


In [31]:
# Convert 'Mileage' to integers by truncating decimals
df['Mileage'] = (df['Mileage'] * 1000000).astype(int)


In [34]:
# Save the cleaned DataFrame to a CSV file

df.to_csv('Final_data.csv', index=False)