Importing necessary libraries

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


Loading the dataset and displaying it

In [5]:

from ucimlrepo import fetch_ucirepo

# fetch dataset
car_evaluation = fetch_ucirepo(id=19)

# data (as pandas dataframes)
X = car_evaluation.data.features
y = car_evaluation.data.targets

# metadata
print(car_evaluation.metadata)

# variable information
print(car_evaluation.variables)


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'published_in': '8th Intl Workshop on Expert Systems and their Applications, Avignon, France', 'yea

Validating datasets

In [6]:
missing_values = car_evaluation.data.features.isnull().sum()
print("Missing values:")
print(missing_values)

# Check data types
print("Data types:")
print(car_evaluation.data.features.dtypes)

# Check for duplicate entries
duplicate_rows = car_evaluation.data.features.duplicated().sum()
print("Duplicate entries:")
print(duplicate_rows)

Missing values:
buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
dtype: int64
Data types:
buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
dtype: object
Duplicate entries:
0


Summarizing the dataset statistics

In [7]:
# Data Summarization
# Calculate basic statistics
print("Basic statistics:")
print(car_evaluation.data.features.describe())

# Count the frequency of unique values for categorical columns
for column in car_evaluation.data.features.columns:
    if car_evaluation.variables.loc[car_evaluation.variables['name'] == column]['type'].values[0] == 'Categorical':
        print("Frequency of unique values for column", column)
        print(car_evaluation.data.features[column].value_counts())


Basic statistics:
       buying  maint doors persons lug_boot safety
count    1728   1728  1728    1728     1728   1728
unique      4      4     4       3        3      3
top     vhigh  vhigh     2       2    small    low
freq      432    432   432     576      576    576
Frequency of unique values for column buying
vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
Frequency of unique values for column maint
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
Frequency of unique values for column doors
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
Frequency of unique values for column persons
2       576
4       576
more    576
Name: persons, dtype: int64
Frequency of unique values for column lug_boot
small    576
med      576
big      576
Name: lug_boot, dtype: int64
Frequency of unique values for column safety
low     576
med     576
high    576
Name: safety, dtype: int64


Calculating correlation matrix

In [8]:
# Data Correlation
# Calculate correlation coefficients
correlation_matrix = car_evaluation.data.features.corr()
print("Correlation matrix:")
print(correlation_matrix)


Correlation matrix:
Empty DataFrame
Columns: []
Index: []


  correlation_matrix = car_evaluation.data.features.corr()


Splitting the dataset into Train and Test sets

In [12]:
car_evaluation = fetch_ucirepo(id=19)

# Data (as pandas dataframes)
X = car_evaluation.data.features
y = car_evaluation.data.targets

# One-hot encode categorical variables
X_encoded = pd.get_dummies(X)

# Encode the target variable if it's categorical
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Normalize or scale numerical features if necessary
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Split the dataset into train and test datasets after data cleaning
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (1382, 21)
Testing set shape: (346, 21)


  y = column_or_1d(y, warn=True)
