# Pakistan Used Car Prices 2023 - Azure AutoML

## Data preparation

In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Root folder of this project's data, built from the PATH_DATA_PROJECTS environment variable.
# os.getenv returns None when the variable is unset, which would make os.path.join fail with
# an opaque TypeError, so check it explicitly and raise a readable error instead.
path_data_projects = os.getenv("PATH_DATA_PROJECTS")
if path_data_projects is None:
    raise EnvironmentError(
        "Environment variable PATH_DATA_PROJECTS is not set; "
        "it must point to the folder that contains the 'Tabular' data directory."
    )

PATH_DATA = os.path.join(path_data_projects, "Tabular", "Pakistan_Used_Car_Prices_2023")

In [3]:
# Read the cleaned PakWheels listings from the project data folder.
csv_cleaned = os.path.join(PATH_DATA, "pakwheels_used_car_data_cleaned.csv")
df = pd.read_csv(csv_cleaned)

# Report the table dimensions, then preview the first rows as the cell output.
print("Shape:", df.shape)
df.head()

Shape: (72670, 14)


Unnamed: 0,addref,city,assembly,body,make,model,year,engine,transmission,fuel,color,registered,mileage,price
0,7943732,Other,Local,Sedan,Toyota,Corolla,2013.0,1300.0,Manual,Petrol,Other,Lahore,145000,2870000.0
1,7730314,Lahore,Local,Sedan,Honda,Other,2000.0,1300.0,Manual,Petrol,Other,Lahore,230000,995000.0
2,7943737,Lahore,Local,Sedan,Toyota,Other,2021.0,1300.0,Manual,Petrol,Other,Punjab,60500,3585000.0
3,7943733,Lahore,Local,Hatchback,Suzuki,Other,2017.0,1300.0,Manual,Petrol,Other,Islamabad,87000,2250000.0
4,7923484,Lahore,Local,Sedan,Honda,Civic,2017.0,1800.0,Automatic,Petrol,Other,Lahore,86000,4850000.0


In [4]:
# Count the missing values in every column; the resulting Series is the cell output.
print("Total missing by columns:")
df.isna().sum()

Total missing by columns:


addref          0
city            0
assembly        0
body            0
make            0
model           0
year            0
engine          0
transmission    0
fuel            0
color           0
registered      0
mileage         0
price           0
dtype: int64

### Categorical

In [5]:
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Columns holding category labels; these are one-hot encoded below.
categorical_features = [
    "city",
    "assembly",
    "body",
    "make",
    "model",
    "transmission",
    "fuel",
    "color",
    "registered",
]

In [7]:
# Build the one-hot encoder.
# - sparse_output=False makes the encoder return a dense array instead of a sparse matrix.
# - drop="first" omits the first category of every feature: the remaining indicator
#   columns still identify it (all zeros), and fewer columns means less training time.
onehot_encoder = OneHotEncoder(drop="first", sparse_output=False)

# Fit on the categorical columns and encode them in a single step.
encoded_features = onehot_encoder.fit_transform(df[categorical_features])

# Wrap the encoded array in a DataFrame labelled with the generated feature names.
df_one_hot_encoder = pd.DataFrame(
    data=encoded_features,
    columns=onehot_encoder.get_feature_names_out(),
)

### Numeric

In [8]:
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Numeric columns that are rescaled with MinMaxScaler below.
numeric_features = [
    "year",
    "engine",
    "mileage",
]

In [10]:
# Fit a min-max scaler on the numeric columns and rescale them in one step.
scaler = MinMaxScaler()
ar = scaler.fit_transform(df.loc[:, numeric_features])

# Put the scaled array back into a DataFrame under the original column names.
df_numeric = pd.DataFrame(data=ar, columns=numeric_features)

### Get transformed dataset

In [11]:
# Name of the column to predict; it is appended unchanged to the transformed dataset.
target_variable: str = "price"

In [12]:
# Assemble the training table side by side: scaled numeric columns,
# one-hot indicator columns, and finally the untouched target column.
frames_to_join = [df_numeric, df_one_hot_encoder, df[target_variable]]
df_encoded = pd.concat(frames_to_join, axis="columns")

# Report the dimensions, then preview the first rows as the cell output.
print("Shape:", df_encoded.shape)
df_encoded.head()

Shape: (72670, 27)


Unnamed: 0,year,engine,mileage,city_Karachi,city_Lahore,city_Other,assembly_Local,body_Other,body_Sedan,make_Other,...,fuel_Petrol,color_Silver,color_White,registered_Karachi,registered_Lahore,registered_Other,registered_Punjab,registered_Sindh,registered_Un-Registered,price
0,0.71875,0.085447,0.144999,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2870000.0
1,0.3125,0.085447,0.229999,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,995000.0
2,0.96875,0.085447,0.060499,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3585000.0
3,0.84375,0.085447,0.086999,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2250000.0
4,0.84375,0.118825,0.085999,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4850000.0


In [13]:
# File name of the training CSV inside PATH_DATA.
path_csv_train = "pakwheels_used_car_data_train.csv"

# Write the transformed dataset without the DataFrame index column.
df_encoded.to_csv(
    path_or_buf=os.path.join(PATH_DATA, path_csv_train),
    index=False,
)

### Upload to Azure Blob Storage

In [14]:
from azure.storage.blob import BlobClient

In [16]:
# Read the storage settings from the environment; load_dotenv() is called first so that
# values declared in a local .env file are available to os.getenv.
load_dotenv()
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
storage_account_key = os.getenv("STORAGE_ACCOUNT_KEY")
storage_container_name = os.getenv("CONTAINER_NAME")

# Fail early with a readable message: an unset variable would otherwise be formatted
# as the text "None" inside the connection string and only fail later, at upload time.
required_settings = {
    "STORAGE_ACCOUNT_NAME": storage_account_name,
    "STORAGE_ACCOUNT_KEY": storage_account_key,
    "CONTAINER_NAME": storage_container_name,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise EnvironmentError("Missing environment variables: " + ", ".join(missing_settings))

connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account_name};AccountKey={storage_account_key};EndpointSuffix=core.windows.net"

# The blob is named after the local training CSV file.
blob = BlobClient.from_connection_string(
    conn_str=connection_string,
    container_name=storage_container_name,
    blob_name=path_csv_train,
)

# overwrite=True lets the notebook be re-run: without it, upload_blob raises
# ResourceExistsError as soon as a blob with this name already exists in the container.
with open(os.path.join(PATH_DATA, path_csv_train), "rb") as data:
    blob.upload_blob(data, overwrite=True)