In [None]:
# Structural overview of the frame: row count, per-column non-null counts and dtypes.
final_dataframe.info()


In [None]:
# Per-column dtypes only (a compact subset of what .info() reports).
final_dataframe.dtypes

In [None]:
# Summary statistics; with default arguments pandas restricts this to numeric columns when any exist.
final_dataframe.describe()

In [None]:
# Find columns holding exactly one distinct value and remove them.
# nunique() ignores NaN, so a column with one value plus gaps also counts as constant.
identical_cols = []

for col in final_dataframe.columns:
    column_values = final_dataframe[col]
    try:
        distinct_count = column_values.nunique()
    except TypeError:
        # nunique() hashes the values; unhashable entries (lists, dicts, ...) are
        # compared through their string representation instead.
        distinct_count = column_values.apply(str).nunique()

    if distinct_count == 1:
        identical_cols.append(col)

# Drop the identical columns
final_dataframe = final_dataframe.drop(columns=identical_cols)

print("Identical columns:", identical_cols)

In [None]:
# Report how many single-valued columns were collected in identical_cols.
num_columns = len(identical_cols)
print("Number of identical columns:", num_columns)

In [None]:
# value_counts() hashes every entry. The cells of this column are iterated as
# collections of dicts by extract_spec_value(), and such containers are unhashable,
# so count their string representations instead of the raw objects.
final_dataframe['specifications'].astype(str).value_counts()

In [None]:
# Function to extract specification key value
def extract_spec_value(specifications, key):
    """Return the value of the first specification entry whose key matches ``key``.

    Args:
        specifications: Iterable of dicts, each expected to carry a ``'key'`` and a
            ``'value'`` item. A non-iterable cell (``None``, float NaN, ...) is
            treated as "no specifications".
        key: Name of the specification to look up; matched case-insensitively.

    Returns:
        The ``'value'`` of the first matching entry, or ``None`` when there is no
        match, the input is not iterable, or the matching entry has no ``'value'``.
    """
    try:
        entries = iter(specifications)
    except TypeError:
        # Missing cells arrive as None / NaN rather than as an empty list.
        return None

    wanted = key.lower()
    for spec in entries:
        if not isinstance(spec, dict):
            continue  # skip malformed entries instead of failing the whole row
        name = spec.get('key')
        if isinstance(name, str) and name.lower() == wanted:
            return spec.get('value')
    return None  # Return None if the key isn't found

# Copy selected specification entries into dedicated columns.
# Maps new column name -> specification key looked up by extract_spec_value()
# (the lookup itself is case-insensitive; 'Cargo Volumn' is kept exactly as written).
extracted_spec_columns = {
    'mileage': 'Mileage',
    'engine_capacity': 'Engine',
    'max_power': 'Max Power',
    'cargo_volumn': 'Cargo Volumn',
}

for new_column, spec_key in extracted_spec_columns.items():
    # Bind spec_key as a default argument so each lambda keeps its own key.
    final_dataframe[new_column] = final_dataframe['specifications'].apply(
        lambda specs, wanted=spec_key: extract_spec_value(specs, wanted)
    )

# Check the result
final_dataframe[list(extracted_spec_columns)].info()


In [None]:
def clean_column(dataframe, column_name, unit_removal_chars=None, conversion_type=float):
    """Strip unit text from one column and convert it to numbers, in place.

    Args:
        dataframe: Frame holding the column; it is modified in place.
        column_name: Name of the column to clean.
        unit_removal_chars: Optional pattern or list of patterns (applied as
            regular expressions) removed from the values, e.g. ``['Kms']``.
        conversion_type: Not used by the conversion (``pd.to_numeric`` picks the
            dtype); kept so existing calls that pass it keep working.

    Returns:
        The same ``dataframe`` object, with ``column_name`` numeric. Values that
        still cannot be parsed after cleaning become NaN.
    """
    cleaned = dataframe[column_name]

    if unit_removal_chars:
        # Remove specified characters (like 'Kg', 'Kms', 'Lakh', etc.)
        cleaned = cleaned.replace(unit_removal_chars, '', regex=True)

    # Drop thousands separators: '1,20,000' is otherwise unparsable and would be
    # coerced to NaN below.
    cleaned = cleaned.replace(',', '', regex=True)

    # Convert to numeric, handling errors by coercing invalid parsing to NaN
    dataframe[column_name] = pd.to_numeric(cleaned, errors='coerce')

    return dataframe

# List of columns to clean and convert
# Maps column name -> list of unit strings that clean_column() removes (they are
# applied as regular expressions) before the numeric conversion.
columns_to_clean = {
    'kmDriven': ['Kms'], 
    'price': ['₹', ' Lakh'],  
    'kms_driven': ['Kms'],  # cleaned here, then dropped right after the cleaning loop
    'kerb_weight': ['Kg'],
    'gross_weight': ['Kg'],
    'top_speed': [' kmph'],
    'acceleration': [' Seconds'],
    'length': ['mm'],
    'width': ['mm'],
    'height': ['mm'],
    'wheel_base': ['mm'],
    'turning_radius': ['m'],  # the pattern 'm' removes every 'm' character in the value
}

In [None]:
# Run clean_column() over every configured column with its unit patterns.
for column in columns_to_clean:
    chars = columns_to_clean[column]
    final_dataframe = clean_column(final_dataframe, column, chars)

# Remove these four columns from the frame in place.
redundant_columns = ['owner', 'ownership', 'engine_displacement', 'kms_driven']
final_dataframe.drop(columns=redundant_columns, inplace=True)

In [None]:
# Keep only the digits of these columns, then coerce them to numbers (NaN on failure).
for digits_only_column in ('registration_year', 'seats'):
    digit_text = final_dataframe[digits_only_column].replace('[^0-9]', '', regex=True)
    final_dataframe[digits_only_column] = pd.to_numeric(digit_text, errors='coerce')

# Remove the literal word 'Engine' from the engine type description.
engine_type_text = final_dataframe['engine_type'].str
final_dataframe['engine_type'] = engine_type_text.replace('Engine', '', regex=False, case=False)

final_dataframe['displacement'] = pd.to_numeric(final_dataframe['displacement'], errors='coerce')

# Remove the 'mm' unit; the values stay strings.
clearance_text = final_dataframe['ground_clearance_unladen'].str
final_dataframe['ground_clearance_unladen'] = clearance_text.replace('mm', '', regex=False, case=False)

# Digits only for cargo volume; no numeric conversion is applied here.
final_dataframe['cargo_volumn'] = final_dataframe['cargo_volumn'].replace('[^0-9]', '', regex=True)

In [None]:
# Collapse the '<city>_cars - <city>_cars' labels to the bare city name.
city_names = ('bangalore', 'delhi', 'jaipur', 'kolkata', 'hyderabad', 'chennai')
city_mapping = {f'{name}_cars - {name}_cars': name for name in city_names}

# Values without an entry in the mapping are left unchanged by replace().
final_dataframe['city'] = final_dataframe['city'].replace(city_mapping)

In [None]:
# Dropping features with high missing values
sparse_features = [
    'gross_weight',
    'compression_ratio',
    'turning_radius',
    'top_speed',
    'borex_stroke',
    'super_charger',
    'fuel_type',
]
final_dataframe.drop(columns=sparse_features, inplace=True)

In [None]:
# Columns removed before modelling. Each label appears exactly once; the same
# set of columns is dropped as before, with the repeated labels removed.
features_to_drop = [
    'rto', 'model', 'color', 'value_configuration', 'fuel_suppy_system',
    'registration_year', 'seats', 'gear_box', 'length', 'height', 'width',
    'turbo_charger', 'front_brake_type', 'rear_brake_type', 'steering_type',
    'ground_clearance_unladen', 'kerb_weight', 'alloy_wheel_size', 'tyre_type',
    'cargo_volumn', 'rear_tread', 'front_tread', 'acceleration', 'specifications',
    'wheel_base', 'drive_type', 'no_door_numbers', 'values_per_cylinder',
    'max_torque', 'engine_type',
]

final_dataframe = final_dataframe.drop(columns=features_to_drop)


In [None]:
# Parse the first number out of each free-text specification column.
# Simply deleting every non-numeric character would glue separate numbers together
# (e.g. '88.5bhp@6000rpm' -> '88.56000') and fail on strings that end up empty, so
# the first numeric token is extracted instead; rows without one become NaN.
for spec_column in ('mileage', 'engine_capacity', 'max_power'):
    first_number = (
        final_dataframe[spec_column]
        .str.replace(',', '', regex=False)  # drop thousands separators first
        .str.extract(r'(\d*\.?\d+)', expand=False)
    )
    final_dataframe[spec_column] = pd.to_numeric(first_number, errors='coerce').astype(float)

In [None]:
# Coerce seating_capacity to a numeric dtype; unparsable entries become NaN.
seating_numeric = pd.to_numeric(final_dataframe['seating_capacity'], errors='coerce')
final_dataframe['seating_capacity'] = seating_numeric

# Show the resulting dtypes of every column.
print(final_dataframe.dtypes)

## Handling missing values

In [None]:
# Number of missing values per column.
final_dataframe.isnull().sum()

In [None]:
# Fill gaps with the column median.
for median_filled in ('price', 'year_of_manufacture', 'displacement'):
    column_median = final_dataframe[median_filled].median()
    final_dataframe[median_filled] = final_dataframe[median_filled].fillna(column_median)

# Fill gaps with the most frequent value (first entry of mode()).
for mode_filled in ('insurance_validity', 'engine_capacity', 'seating_capacity'):
    most_frequent = final_dataframe[mode_filled].mode()[0]
    final_dataframe[mode_filled] = final_dataframe[mode_filled].fillna(most_frequent)


In [None]:
# Handle missing values for 'mileage':
# median when the distribution is strongly right-skewed (skew > 1), mean otherwise.
mileage_values = final_dataframe['mileage']
mileage_fill = mileage_values.median() if mileage_values.skew() > 1 else mileage_values.mean()
final_dataframe['mileage'] = mileage_values.fillna(mileage_fill)


In [None]:
# Handle missing values for 'max_power':
# median when the distribution is strongly right-skewed (skew > 1), mean otherwise.
max_power_values = final_dataframe['max_power']
max_power_fill = max_power_values.median() if max_power_values.skew() > 1 else max_power_values.mean()
final_dataframe['max_power'] = max_power_values.fillna(max_power_fill)

In [None]:
# Handle missing values for 'no_of_cylinder' with its most frequent value
# (the column is treated as categorical).
cylinder_mode = final_dataframe['no_of_cylinder'].mode()[0]
final_dataframe['no_of_cylinder'] = final_dataframe['no_of_cylinder'].fillna(cylinder_mode)


## Label encoding

In [None]:
# Distinct manufacturer labels present in the data.
final_dataframe['oem'].unique()

In [None]:
### Replacing 'oem' values with ascending numbers
# The code of each brand is its position in this list (0-based).
oem_brands = ['Maruti', 'Ford', 'Tata', 'Hyundai', 'Jeep', 'Datsun', 'Honda',
              'Mahindra', 'Mercedes-Benz', 'BMW', 'Renault', 'Audi', 'Toyota',
              'Mini', 'Kia', 'Skoda', 'Volkswagen', 'Volvo', 'MG', 'Nissan',
              'Fiat', 'Mahindra Ssangyong', 'Mitsubishi', 'Jaguar', 'Land Rover',
              'Chevrolet', 'Citroen', 'Opel', 'Mahindra Renault', 'Isuzu',
              'Lexus', 'Porsche', 'Hindustan Motors']
oem_codes = {brand: code for code, brand in enumerate(oem_brands)}

# Assign the result back to the frame. An inplace replace on final_dataframe['oem']
# acts on an intermediate Series (chained assignment) and leaves the frame
# untouched under pandas Copy-on-Write.
final_dataframe['oem'] = final_dataframe['oem'].replace(oem_codes)


In [None]:
# Distinct fuel type labels present in the data.
final_dataframe['fuelType'].unique()

In [None]:
# Replacing 'fuelType' values with numbers starting from 0
fuel_type_codes = {'Petrol': 0, 'Diesel': 1, 'Lpg': 2, 'Cng': 3, 'Electric': 4}

# Assign back instead of replace(..., inplace=True) on the selected column, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
final_dataframe['fuelType'] = final_dataframe['fuelType'].replace(fuel_type_codes)


In [None]:
# Distinct body type labels present in the data.
final_dataframe['bodyType'].unique()

In [None]:
# Label-encode 'bodyType'; the empty string is mapped to -1.
# Only '' is mapped: NaN entries, if any, are left as NaN by replace().
body_type_codes = {
    'Hatchback': 0, 'SUV': 1, 'Sedan': 2, 'MUV': 3, 'Coupe': 4, 'Minivans': 5,
    'Pickup Trucks': 6, 'Convertibles': 7, 'Hybrids': 8, 'Wagon': 9, '': -1,
}

# Assign back instead of replace(..., inplace=True) on the selected column, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
final_dataframe['bodyType'] = final_dataframe['bodyType'].replace(body_type_codes)


In [None]:
# Distinct transmission labels present in the data.
final_dataframe['transmission'].unique()

In [None]:
# Replace 'Manual' with 0 and 'Automatic' with 1 in the 'transmission' column
transmission_codes = {'Manual': 0, 'Automatic': 1}

# Assign back instead of replace(..., inplace=True) on the selected column, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
final_dataframe['transmission'] = final_dataframe['transmission'].replace(transmission_codes)

In [None]:
# Distinct insurance labels present in the data.
final_dataframe['insurance_validity'].unique()

In [None]:
# Replace the insurance types with numeric values starting from 0
insurance_codes = {
    'Third Party insurance': 0,
    'Comprehensive': 1,
    'Third Party': 2,
    'Zero Dep': 3,
    '2': 4,
    '1': 5,
    'Not Available': 6,
}

# Assign back instead of replace(..., inplace=True) on the selected column, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
final_dataframe['insurance_validity'] = final_dataframe['insurance_validity'].replace(insurance_codes)

In [None]:
# Distinct city labels present in the data.
final_dataframe['city'].unique()

In [None]:
# Encode each city as its position in this list (0-based).
city_order = ['bangalore', 'chennai', 'delhi', 'hyderabad', 'jaipur', 'kolkata']
city_codes = {city: code for code, city in enumerate(city_order)}

# Assign back instead of replace(..., inplace=True) on the selected column, which
# is chained assignment and does not update the frame under pandas Copy-on-Write.
final_dataframe['city'] = final_dataframe['city'].replace(city_codes)

## Handling outliers

In [None]:
def outlier(final_dataframe, column, factor=1.5):
    """Clip one column to its IQR fences, in place.

    Values below ``Q1 - factor * IQR`` or above ``Q3 + factor * IQR`` are replaced
    by the respective fence.

    Args:
        final_dataframe: Frame holding the column; it is modified in place.
        column: Name of the numeric column to clip.
        factor: IQR multiplier for the fences (1.5 by default).

    Returns:
        The same ``final_dataframe`` object, so the call can be chained or
        assigned; callers that ignore the return value are unaffected.
    """
    q1 = final_dataframe[column].quantile(0.25)
    q3 = final_dataframe[column].quantile(0.75)

    iqr = q3 - q1

    upper_threshold = q3 + (factor * iqr)
    lower_threshold = q1 - (factor * iqr)

    final_dataframe[column] = final_dataframe[column].clip(lower_threshold, upper_threshold)
    return final_dataframe

## Log transformation

In [None]:
# Applying log transformation: log(1 + x) on each listed column.
# Re-running this cell applies the transform a second time.
for log_column in ("kmDriven", "max_power", "no_of_cylinder"):
    final_dataframe[log_column] = np.log1p(final_dataframe[log_column])


## Model preparation

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, train_test_split
from sklearn.tree import DecisionTreeRegressor

## Train/test split (80% train, 20% test), reproducible via random_state=42
# NOTE(review): X and Y are not assigned in any cell visible above this one —
# confirm the feature matrix and target are defined before this runs on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
# Define models for training and cross-validation
# Keys are display names; later cells reuse them as lookup keys
# (param_grids[model_name], best_models[model_name]).
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Lasso': Lasso(),  # default regularisation strength
    'Ridge': Ridge()   # default regularisation strength
}

## Cross validation

In [None]:
# Cross-validation and performance evaluation on training data
# Metric label -> scikit-learn scorer name. Error scorers are negated by
# scikit-learn (higher is better), so their sign is flipped when stored.
cv_scoring = {
    'mae': 'neg_mean_absolute_error',
    'mse': 'neg_mean_squared_error',
    'r2': 'r2',
}

cv_results = {
    'Model': [],
    'Mean CV MAE': [],
    'Mean CV MSE': [],
    'Mean CV R²': []
}

# cross-validation for each model
for model_name, model in models.items():
    print(f"Evaluating {model_name} with cross-validation...")

    # A single multi-metric run fits every fold once; three separate
    # cross_val_score calls would fit each fold three times for the same scores.
    fold_scores = cross_validate(model, X_train, y_train, cv=5, scoring=cv_scoring)

    # Store the results
    cv_results['Model'].append(model_name)
    cv_results['Mean CV MAE'].append(-fold_scores['test_mae'].mean())
    cv_results['Mean CV MSE'].append(-fold_scores['test_mse'].mean())
    cv_results['Mean CV R²'].append(fold_scores['test_r2'].mean())

# Create a DataFrame to summarize cross-validation results
cv_results_df = pd.DataFrame(cv_results)
print("Cross-Validation Results:")
print(cv_results_df)

## Hyperparameter tuning

In [None]:
# Define hyperparameter grids for each model
# Keys must match the keys of `models`: the tuning loop looks grids up by model name.
param_grids = {
    # Empty grid: nothing to tune, only the default configuration is evaluated.
    'Linear Regression': {},
    'Decision Tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]},
    
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]},
    
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 10]},
    
    'Lasso': {
        'alpha': [0.01, 0.1, 1]},
    
    'Ridge': {
        'alpha': [0.01, 0.1, 1]}}

In [None]:
# Grid-search every model over its parameter grid and keep the best estimator.
best_models = {}
for model_name, model in models.items():
    print(f"Tuning {model_name}...")

    # 5-fold search on the training split, ranked by (negated) mean absolute error.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Report the winning parameters and keep the corresponding estimator.
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    best_models[model_name] = grid_search.best_estimator_

# Empty containers for the model evaluation and comparison results.
results = {metric_name: [] for metric_name in ('Model', 'MAE', 'MSE', 'R²')}


## Model evaluation

In [None]:
# Evaluate best models on the test set
# Start from empty lists so that re-running this cell does not append a second
# copy of every row to the comparison table.
results = {'Model': [], 'MAE': [], 'MSE': [], 'R²': []}

for model_name, model in best_models.items():
    # Each entry is a GridSearchCV best_estimator_, which the search has already
    # refit on the full training split, so it can predict without another fit.
    y_pred = model.predict(X_test)

    # Append results for comparison
    results['Model'].append(model_name)
    results['MAE'].append(mean_absolute_error(y_test, y_pred))
    results['MSE'].append(mean_squared_error(y_test, y_pred))
    results['R²'].append(r2_score(y_test, y_pred))


In [None]:
# Tabulate the test-set metrics, one row per model.
results_df = pd.DataFrame(results)
print("Model Evaluation Results:", results_df, sep="\n")

# Pick the row with the highest R² (use idxmin() on MAE/MSE for an error-based choice).
top_row_label = results_df['R²'].idxmax()
best_model = results_df.loc[top_row_label]
print(f"Best Model: {best_model['Model']}")

In [None]:
import pickle

# Persist the estimator with the highest test-set R² in results_df rather than a
# hard-coded model name, so the saved file always matches the reported winner.
best_model_name = results_df.loc[results_df['R²'].idxmax(), 'Model']
best_model = best_models[best_model_name]

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"{best_model_name} model saved successfully!")