In [59]:
# First step is to Import the packages

import pandas as pd
import numpy as np

In [66]:
# =====================================================================
# PHASE 1: HIGH-LEVEL INSPECTION (Getting the Landscape)
# =====================================================================

In [67]:
# Load the dataset
df = pd.read_csv(r'C:\Users\prath\FullStackDataScience\Data-Science\EDA\data_sets\quikr_car.csv')

In [68]:
# Analyst thought process (What you would run in a Jupyter cell):
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   name        892 non-null    str  
 1   company     892 non-null    str  
 2   year        892 non-null    str  
 3   Price       892 non-null    str  
 4   kms_driven  840 non-null    str  
 5   fuel_type   837 non-null    str  
dtypes: str(6)
memory usage: 41.9 KB


In [69]:
# -> Dtype observation: 'year', 'Price', and 'kms_driven' are 'object' (strings). 
# -> Conclusion: They contain garbage text and need to be forced into integers.

In [70]:
# Check for the duplicates 
df[df.duplicated()]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
14,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
15,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
20,Mahindra Scorpio S10,Mahindra,2016,350000,"43,000 kms",Diesel
24,Hyundai i20 Sportz 1.2,Hyundai,2012,100000,"55,000 kms",Petrol
25,Hyundai i20 Sportz 1.2,Hyundai,2012,100000,"55,000 kms",Petrol
...,...,...,...,...,...,...
626,Tata Sumo Gold EX BS IV,Tata,2012,210000,"75,000 kms",Diesel
641,Maruti Suzuki Swift VDi BS IV,Maruti,2012,280000,"48,006 kms",Diesel
727,Mahindra Scorpio S4,Mahindra,2015,865000,"30,000 kms",Diesel
861,Hyundai Getz Prime 1.3 GLX,Hyundai,2009,115000,"20,000 kms",Petrol


In [71]:
# Drop identical duplicate rows right at the start to reduce processing
df = df.drop_duplicates(keep="first").reset_index(drop=True)

In [72]:
# =====================================================================
# PHASE 2 & 3: INVESTIGATION & CLEANING EXECUTION (Column by Column)
# =====================================================================

In [None]:
# --- 1. The 'year' Column ---

df['year'].unique()
# Investigation: Running df['year'].unique() shows non-numeric text like 'I', 'zest', 'sale'.

<StringArray>
['2007', '2006', '2018', '2014', '2015', '2012', '2013', '2016', '2010',
 '2017', '2008', '2011', '2019', '2009', '2005', '2000',  '...', '150k',
 'TOUR', '2003', 'r 15', '2004', 'Zest', '/-Rs', 'sale', '1995', 'ara)',
 '2002', 'SELL', '2001', 'tion', 'odel', '2 bs', 'arry',  'Eon', 'o...',
 'ture',  'emi',  'car', 'able',  'no.', 'd...', 'SALE', 'digo', 'sell',
 'd Ex', 'n...', 'e...', 'D...', ', Ac', 'go .', 'k...', 'o c4', 'zire',
 'cent', 'Sumo',  'cab', 't xe',  'EV2', 'r...', 'zest']
Length: 61, dtype: str

In [74]:
# Execution: We use pd.to_numeric with errors='coerce'. 
# This forces valid strings (like '2019') into numbers and turns garbage text into NaN.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

In [None]:
# --- 2. The 'Price' Column ---

df['Price'].unique()
# Investigation: Running df['Price'].unique() shows commas ('4,25,000') and text ('Ask For Price').

<StringArray>
[       '80,000',      '4,25,000', 'Ask For Price',      '3,25,000',
      '5,75,000',      '1,75,000',      '1,90,000',      '8,30,000',
      '2,50,000',      '1,82,000',
 ...
        '42,000',      '1,89,000',      '1,62,000',        '35,999',
     '29,00,000',        '39,999',        '50,500',      '5,10,000',
      '8,60,000',      '5,00,001']
Length: 274, dtype: str

In [76]:
# Execution Step 1: Convert to string, then replace commas with nothing.
df['Price'] = df['Price'].astype(str).str.replace(',', '')

In [77]:
# Execution Step 2: Force to numeric. 'Ask For Price' will automatically become NaN.
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

In [None]:
# --- 3. The 'kms_driven' Column ---

df['kms_driven'].unique()
# Investigation: df['kms_driven'].unique() shows commas, ' kms', and shifted text like 'Petrol'.

<StringArray>
[  '45,000 kms',       '40 kms',   '22,000 kms',   '28,000 kms',
   '36,000 kms',   '59,000 kms',   '41,000 kms',   '25,000 kms',
   '24,530 kms',   '60,000 kms',
 ...
   '60,123 kms',   '38,900 kms', '1,37,495 kms',   '91,200 kms',
 '1,46,000 kms', '1,00,800 kms',    '2,100 kms',    '2,500 kms',
 '1,32,000 kms',       'Petrol']
Length: 259, dtype: str

In [79]:
# Execution Step 1: Remove commas and the word ' kms'.
df['kms_driven'] = df['kms_driven'].astype(str).str.replace(',', '').str.replace(' kms', '')

In [80]:
# Execution Step 2: Force to numeric. Text like 'Petrol' automatically becomes NaN.
df['kms_driven'] = pd.to_numeric(df['kms_driven'], errors='coerce')

In [None]:
# --- 4. The 'company' Column ---

df['company'].unique()
# Investigation: Due to messy CSV data, some company names are just numbers.

<StringArray>
[   'Hyundai',   'Mahindra',     'Maruti',       'Ford',      'Skoda',
       'Audi',     'Toyota',    'Renault',      'Honda',     'Datsun',
 'Mitsubishi',       'Tata', 'Volkswagen',          'I',  'Chevrolet',
       'Mini',        'BMW',     'Nissan',  'Hindustan',       'Fiat',
 'Commercial',     'MARUTI',      'Force',   'Mercedes',       'Land',
     'Yamaha',    'selling',     'URJENT',      'Swift',       'Used',
     'Jaguar',       'Jeep',       'tata',       'Sale',       'very',
      'Volvo',          'i',       '2012',       'Well',        'all',
          '7',          '9',    'scratch',     'urgent',       'sell',
       'TATA',        'Any',       'Tara']
Length: 48, dtype: str

In [84]:
# Execution: We find rows where the company is purely numeric and exclude them using ~ (NOT).
# .fillna(False) is required so we don't pass NaNs into the boolean filter.
df[df['company'].astype(str).str.isnumeric().fillna(False)]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
700,2012 Tata Sumo Gold f,2012,,250000.0,,
706,7 SEATER MAHINDRA BOLERO IN VERY GOOD,7,,,,
707,9 SEATER MAHINDRA BOL,9,,,,


In [85]:
df = df[~df['company'].astype(str).str.isnumeric().fillna(False)]

In [86]:
# --- 5. The 'name' Column ---
df['name'].unique()
# Investigation: Car names are too long and specific (e.g., 'Hyundai Santro Xing XO eRLX Euro III').

<StringArray>
[  'Hyundai Santro Xing XO eRLX Euro III',
                'Mahindra Jeep CL550 MDI',
             'Maruti Suzuki Alto 800 Vxi',
 'Hyundai Grand i10 Magna 1.2 Kappa VTVT',
       'Ford EcoSport Titanium 1.5L TDCi',
                              'Ford Figo',
                            'Hyundai Eon',
       'Ford EcoSport Ambiente 1.5L TDCi',
         'Maruti Suzuki Alto K10 VXi AMT',
            'Skoda Fabia Classic 1.2 MPI',
 ...
         'Mercedes Benz C Class 200 K MT',
                            'Skoda Fabia',
  'Maruti Suzuki Alto 800 Select Variant',
             'Maruti Suzuki Ritz VXI ABS',
                       'tata zest 2017 f',
              'Tata Indica V2 DLE BS III',
                                     'Ta',
                    'Tata Zest XM Diesel',
               'Honda Amaze 1.2 E i VTEC',
              'Chevrolet Sail 1.2 LT ABS']
Length: 522, dtype: str

In [87]:
# Execution: We split the string into a list of words, take the first 3 words, and join them back.
df['name'] = df['name'].str.split().str[:3].str.join(' ')

In [88]:
# --- 6. The 'fuel_type' Column ---
df.isnull().sum()
# Investigation: df.isnull().sum() shows fuel_type has missing values.

name           0
company        0
year          46
Price         31
kms_driven    49
fuel_type     50
dtype: int64

In [89]:
# Execution: We filter the dataframe to keep only rows where fuel_type is NOT missing.
df = df[~df['fuel_type'].isna()]

In [90]:
# =====================================================================
# PHASE 4: FINAL CLEANUP & DATA TYPE CASTING
# =====================================================================

In [94]:
# Step 1: Drop all the NaN values we deliberately created using 'coerce' in Phase 3
df = df.dropna().reset_index(drop=True)

In [96]:
# Check all the null values rows is gone and data frame is clean 
df.isnull().sum()

name          0
company       0
year          0
Price         0
kms_driven    0
fuel_type     0
dtype: int64

In [97]:
# Step 2: Now that the garbage text is gone and NaNs are dropped, 
# we can safely cast these columns to absolute Integers for machine learning or analysis.
df['year'] = df['year'].astype(int)
df['Price'] = df['Price'].astype(int)
df['kms_driven'] = df['kms_driven'].astype(int)

In [98]:
# =====================================================================
# VERIFICATION
# =====================================================================

In [103]:
print("Cleaned Data Info:")
df.info()

Cleaned Data Info:
<class 'pandas.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        724 non-null    object
 1   company     724 non-null    str   
 2   year        724 non-null    int64 
 3   Price       724 non-null    int64 
 4   kms_driven  724 non-null    int64 
 5   fuel_type   724 non-null    str   
dtypes: int64(3), object(1), str(2)
memory usage: 34.1+ KB


In [104]:
print("\nFirst 5 rows of Cleaned Data:")
df.head()


First 5 rows of Cleaned Data:


Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
2,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
3,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
4,Ford Figo,Ford,2012,175000,41000,Diesel
