In [1]:
import pandas as pd

# Per-city preprocessed CSV exports that together make up the full dataset
file_paths = [
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\bangalore_preprocessed.csv",
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\chennai_preprocessed.csv",
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\delhi_preprocessed.csv",
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\hyderabad_preprocessed.csv",
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\jaipur_preprocessed.csv",
    r"C:\Users\The Best\Desktop\Projects\Car Dheko\kolkata_preprocessed.csv"
]

# Read every city file into its own DataFrame, in the order listed above
dataframes = [pd.read_csv(csv_path) for csv_path in file_paths]

# Stack the per-city frames vertically; ignore_index renumbers the rows 0..n-1
# so the original per-file row labels do not collide
merged_df = pd.concat(dataframes, ignore_index=True)

# Persist the combined table, without the index column, under a relative
# path (i.e. in the kernel's current working directory)
merged_df.to_csv("merged_car.csv", index=False)



In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from scipy import stats

# Load the dataset
df = pd.read_csv(r"C:\Users\The Best\Desktop\Projects\Car Dheko\VS code\merged_car.csv")

# 1. Standardising Data Formats
#
# This step runs BEFORE imputation on purpose: once 'kms_driven' is numeric it
# is picked up by the numerical (mean) imputation below, and it no longer
# appears in `categorical_cols`. If the conversion ran after imputation, values
# that fail to parse would be left as NaN and would not receive the mean.

# Example: Removing units and commas from 'kms_driven'
# Skip the text clean-up when the column is already numeric: the `.str`
# accessor raises AttributeError on non-string data.
if 'kms_driven' in df.columns and not pd.api.types.is_numeric_dtype(df['kms_driven']):
    # astype(str) keeps stray non-string cells (e.g. plain ints) parseable;
    # with `.str` alone they would be turned into NaN.
    kms_text = df['kms_driven'].astype(str)
    kms_text = kms_text.str.replace(' Kms', '', regex=False)  # Remove ' Kms'
    kms_text = kms_text.str.replace(',', '', regex=False)     # Remove commas
    # Convert to numeric; anything unparseable becomes NaN and is imputed below
    df['kms_driven'] = pd.to_numeric(kms_text, errors='coerce')

# 2. Handling Missing Values

# For numerical columns, use mean imputation
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# For categorical columns, use mode imputation
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        # A column that is entirely NaN has no mode; indexing the empty result
        # would raise, so such a column is left untouched.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Ensure all columns are of the correct data type
print("Data Types of Columns:")
print(df.dtypes)

# 3. Encoding Categorical Variables

# Show which columns were detected as categorical
print("Categorical Columns: ", categorical_cols)

# One-Hot Encoding (for nominal categorical variables)
# A categorical column is treated as nominal when it has fewer than 10
# distinct values (example criterion); drop_first=True omits the first
# level of each encoded column.
nominal_columns = []
for col in categorical_cols:
    if df[col].nunique() < 10:
        nominal_columns.append(col)
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)

# Label Encoding (for ordinal categorical variables)
# The list is intentionally empty: fill in real ordinal column names if any.
ordinal_columns = []  # Replace with actual ordinal column names if any
for col in (name for name in ordinal_columns if name in df.columns):
    # A fresh encoder per column, so each column gets its own integer mapping
    df[col] = LabelEncoder().fit_transform(df[col])

# 4. Removing Outliers
#
# Outliers are removed BEFORE the scaler is fitted. Fitting MinMaxScaler first
# lets the extreme values define each column's min/max, so after those rows are
# dropped the remaining data only occupies part of the [0, 1] range.

numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Ensure no NaN values before filtering/scaling (NaN fails every comparison
# below, so a row holding one would be dropped by the filter)
df[numerical_cols] = df[numerical_cols].fillna(0)  # Fill NaNs with 0 or another appropriate value

# Using IQR Method
# Columns are filtered one after another, so each column's quartiles are
# computed on the rows that survived the previous columns.
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    # With IQR == 0 (at least half the rows share one value, e.g. after mean
    # imputation) both fences collapse onto that value and every other row
    # would be discarded. `not IQR > 0` also covers a NaN IQR from an empty frame.
    if not IQR > 0:
        continue
    df = df[(df[col] >= (Q1 - 1.5 * IQR)) & (df[col] <= (Q3 + 1.5 * IQR))]

# Alternatively, use Z-Score Method
# df = df[(np.abs(stats.zscore(df[numerical_cols])) < 3).all(axis=1)]

# Detach from the filtered views so the column assignment below writes to an
# independent frame (avoids SettingWithCopyWarning)
df = df.copy()

# 5. Normalizing Numerical Features

# Min-Max Scaling
min_max_scaler = MinMaxScaler()
# fit_transform rejects input with zero rows or zero columns, so only scale
# when there is something to scale
if len(df) > 0 and len(numerical_cols) > 0:
    df[numerical_cols] = min_max_scaler.fit_transform(df[numerical_cols])

# Alternatively, you can use Standard Scaling
# standard_scaler = StandardScaler()
# df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

# Save the cleaned dataset
df.to_csv('cleaned_merged_car.csv', index=False)

print("Data preprocessing completed successfully.")



Data Types of Columns:
car_links                  object
fuel_type                  object
body_type                  object
kilometers_driven         float64
transmission               object
owner                      object
oem                        object
model                      object
year                        int64
variant                    object
price                      object
registration_year         float64
insurance_validity         object
fuel_type_overview         object
seats                     float64
kms_driven                  int64
rto                        object
comfort_convenience        object
interior_features          object
exterior_features          object
safety_features            object
entertainment_features     object
mileage                   float64
engine                    float64
max_power                  object
torque                    float64
wheel_size                float64
bhp                       float64
rpm                      

In [3]:
pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 837.5 kB/s eta 0:00:13
   -- ------------------------------------- 0.8/11.0 MB