In [1]:
# import the necessary modules
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
# read the data
data = pd.read_csv("usedCars.csv")

# take a look at first 5 rows
data.head()

Unnamed: 0,name,location,model_year,mileage,engine_type,transmission,registered_in,color,assembly,engine_capacity,body_type,other_features_list,url,price,phone_number
0,Toyota Fortuner Legender 2022,"I- 8, Islamabad Islamabad",2022,5 km,Diesel,Automatic,Un-Registered,White,Local,2800 cc,SUV,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-for...,,923334567890
1,Toyota Premio X EX Package 1.8 2018,"Askari 6, Peshawar KPK",2018,"17,000 km",Petrol,Automatic,Un-Registered,Peral White,Imported,1800 cc,Sedan,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-pre...,PKR 85 lacs,923334567890
2,Honda City Aspire 1.3 i-VTEC 2016,"I- 8, Islamabad Islamabad",2016,"59,000 km",Petrol,Manual,Islamabad,White,Local,1300 cc,Sedan,"ABS,AM/FM Radio,Air Conditioning,Alloy Rims,CD...",https://www.pakwheels.com/used-cars/honda-city...,PKR 23.75 lacs,923334567890
3,Suzuki Bolan VX Euro II 2018,"Dhok Sayedan Road, Rawalpindi Punjab",2018,"55,000 km",Petrol,Manual,Islamabad,White,Local,800 cc,Van,"AM/FM Radio,Immobilizer Key",https://www.pakwheels.com/used-cars/suzuki-bol...,PKR 10.5 lacs,923334567890
4,Suzuki Swift GLX CVT 2022,Karachi Sindh,2022,2 km,Petrol,Automatic,Sindh,Grey,Local,1200 cc,Hatchback,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/suzuki-swi...,PKR 35.5 lacs,923334567890


In [3]:
# phone number is a private thing so I will remove `phone_number` column
data.drop("phone_number", axis=1, inplace=True)

# take a look at data
data.head()

Unnamed: 0,name,location,model_year,mileage,engine_type,transmission,registered_in,color,assembly,engine_capacity,body_type,other_features_list,url,price
0,Toyota Fortuner Legender 2022,"I- 8, Islamabad Islamabad",2022,5 km,Diesel,Automatic,Un-Registered,White,Local,2800 cc,SUV,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-for...,
1,Toyota Premio X EX Package 1.8 2018,"Askari 6, Peshawar KPK",2018,"17,000 km",Petrol,Automatic,Un-Registered,Peral White,Imported,1800 cc,Sedan,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-pre...,PKR 85 lacs
2,Honda City Aspire 1.3 i-VTEC 2016,"I- 8, Islamabad Islamabad",2016,"59,000 km",Petrol,Manual,Islamabad,White,Local,1300 cc,Sedan,"ABS,AM/FM Radio,Air Conditioning,Alloy Rims,CD...",https://www.pakwheels.com/used-cars/honda-city...,PKR 23.75 lacs
3,Suzuki Bolan VX Euro II 2018,"Dhok Sayedan Road, Rawalpindi Punjab",2018,"55,000 km",Petrol,Manual,Islamabad,White,Local,800 cc,Van,"AM/FM Radio,Immobilizer Key",https://www.pakwheels.com/used-cars/suzuki-bol...,PKR 10.5 lacs
4,Suzuki Swift GLX CVT 2022,Karachi Sindh,2022,2 km,Petrol,Automatic,Sindh,Grey,Local,1200 cc,Hatchback,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/suzuki-swi...,PKR 35.5 lacs


In [4]:
# shape of data
data.shape

(75899, 14)

#### Checking for null values

In [5]:
# Check out the number of null values in column
def null(data):
    no_of_null_values = data.isnull().sum()
    percentage_of_null_values = np.round((data.isnull().sum()) / len(data), 4) * 100
    return pd.concat([no_of_null_values, percentage_of_null_values], axis=1, keys=['no_of_null_values', 'percentage'])

null(data)

Unnamed: 0,no_of_null_values,percentage
name,0,0.0
location,0,0.0
model_year,0,0.0
mileage,0,0.0
engine_type,0,0.0
transmission,0,0.0
registered_in,0,0.0
color,0,0.0
assembly,0,0.0
engine_capacity,0,0.0


#### Fixing `price` column

In [6]:
def fix_price(data):
    
    # remove the sub-string 'PKR' from prices of cars
    data.loc[:, 'price'] = data['price'].str.replace('PKR', '')
    
    # separate cars with unknown price
    price_null = data.price.isnull()
    price_null_data = data[price_null.values]
    
    # cars with prices known
    price_present = data.dropna(axis=0, subset=['price'])
    
    # separate cars with prices in lacs
    price_lacs = price_present['price'].str.contains('lacs')
    price_lacs_data = price_present[price_lacs.values]
    # remove the sub-string 'lacs' from prices
    price_lacs_data.loc[:, 'price'] = price_lacs_data['price'].str.replace('lacs', '')
    # change the data type of `price` column to float64
    price_lacs_data.loc[:, 'price'] = price_lacs_data['price'].astype(np.float64)
    # multiply each price with 1 lac i.e. 100000
    price_lacs_data.loc[:, 'price'] = price_lacs_data['price'] * 100_000
    
    # separate cars with prices in crores
    price_crores = price_present['price'].str.contains('crore')
    price_crores_data = price_present[price_crores.values]
    # remove the sub-string 'crore' from prices of cars
    price_crores_data.loc[:, 'price'] = price_crores_data['price'].str.replace('crore', '')
    # change the data type of `price` column to float64
    price_crores_data.loc[:, 'price'] = price_crores_data['price'].astype(np.float64)
    # multiply each price with 100 lac i.e. 10000000
    price_crores_data.loc[:, 'price'] = price_crores_data['price'] * 10_000_000
    
    # return the complete data by combining the cars with prices in lacs, crores and also the cars with price not available 
    return pd.concat([price_null_data, price_lacs_data, price_crores_data], axis=0).sort_index()

final_data = fix_price(data)
final_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,name,location,model_year,mileage,engine_type,transmission,registered_in,color,assembly,engine_capacity,body_type,other_features_list,url,price
0,Toyota Fortuner Legender 2022,"I- 8, Islamabad Islamabad",2022,5 km,Diesel,Automatic,Un-Registered,White,Local,2800 cc,SUV,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-for...,
1,Toyota Premio X EX Package 1.8 2018,"Askari 6, Peshawar KPK",2018,"17,000 km",Petrol,Automatic,Un-Registered,Peral White,Imported,1800 cc,Sedan,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/toyota-pre...,8500000.0
2,Honda City Aspire 1.3 i-VTEC 2016,"I- 8, Islamabad Islamabad",2016,"59,000 km",Petrol,Manual,Islamabad,White,Local,1300 cc,Sedan,"ABS,AM/FM Radio,Air Conditioning,Alloy Rims,CD...",https://www.pakwheels.com/used-cars/honda-city...,2375000.0
3,Suzuki Bolan VX Euro II 2018,"Dhok Sayedan Road, Rawalpindi Punjab",2018,"55,000 km",Petrol,Manual,Islamabad,White,Local,800 cc,Van,"AM/FM Radio,Immobilizer Key",https://www.pakwheels.com/used-cars/suzuki-bol...,1050000.0
4,Suzuki Swift GLX CVT 2022,Karachi Sindh,2022,2 km,Petrol,Automatic,Sindh,Grey,Local,1200 cc,Hatchback,"ABS,AM/FM Radio,Air Bags,Air Conditioning,Allo...",https://www.pakwheels.com/used-cars/suzuki-swi...,3550000.0


#### Saving the final prepared data to a csv file

In [7]:
final_data.to_csv("usedCarsFinal.csv", index=False)