In [1]:
# Import necessary libraries
# Pandas is used for data loading and manipulation
import pandas as pd

# Read the Car Evaluation data from a CSV file located in the working directory.
csv_path = "cars.csv"
df = pd.read_csv(csv_path)

# Show the first five rows as a quick sanity check that parsing produced
# the expected columns and values.
df.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [2]:
# Print a concise structural summary of the DataFrame: index range, column
# names, non-null count per column, dtypes, and approximate memory usage.
# info() writes its report to stdout and returns None, so the cell output is
# printed text rather than a rendered table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [3]:
# Summarise the object-dtype (string) columns only: row count, number of
# distinct values, the most frequent value, and how often that value occurs.
# NOTE: run this before the columns are converted to 'category'; once that
# conversion has happened there are no object columns left to select.
df.describe(include=["object"])

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [4]:
# Count missing entries per column (isna is the canonical name; isnull is its alias).
# All-zero counts mean no imputation step is needed at this stage.
df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [7]:
# Re-type every column as pandas 'category'. The columns hold a small set of
# repeated labels, so this is typically more compact than object strings and
# makes their discrete (non-continuous) nature explicit.
# NOTE: astype('category') infers unordered categories from the observed
# values; no ordinal ranking (e.g. low < med < high) is encoded here.
cat_cols = df.columns
as_category = df[cat_cols].astype("category")
df[cat_cols] = as_category

# Display the per-column dtypes to verify the conversion took effect.
df.dtypes

buying      category
maint       category
doors       category
persons     category
lug_boot    category
safety      category
class       category
dtype: object

In [8]:
# Import NumPy for numerical operations
import numpy as np

# List the distinct labels found in the target column ('class') using NumPy.
# np.unique returns the values sorted, giving a stable view of the label set
# that any later classification step would have to predict.
target_column = df["class"]
unique_classes = np.unique(target_column)
print("Unique classes in the target variable:", unique_classes)

Unique classes in the target variable: ['acc' 'good' 'unacc' 'vgood']


In [9]:
# Tally how many rows carry each target label, most frequent first.
# A heavily skewed tally indicates class imbalance, which needs to be kept in
# mind when training and evaluating a model on this target.
class_column = df["class"]
class_column.value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64