# Importing Libraries

In [7]:
import numpy as np
import pandas as pd
import random

# 3.3.2 The Homework Exercises

## 1. Preparation (Setting the Random State)


In [3]:
# Seed every random generator this notebook draws from, so that a
# Restart & Run All reproduces the same random choices.
# NumPy's global generator and the stdlib `random` module keep separate
# state: seeding one does NOT seed the other, and the random draws in this
# notebook are made with `random.randint` / `random.sample`.
SEED = 289456
np.random.seed(SEED)
random.seed(SEED)

## 2. Exercise 1 (Loading and Preparing the Data)

In [12]:
# Load the CSV file into a DataFrame
df_tot = pd.read_csv('cla4lsp_bikez_curated.csv')

# Report the size of the full dataset as (rows, columns)
print(f"Shape of the dataset: {df_tot.shape}")

# Draw r uniformly from {0, 1, 2} (randint is inclusive on both ends).
# NOTE: this uses the stdlib `random` generator, whose state is independent
# of NumPy's global seed; it is reproducible only if `random` is seeded too.
r = random.randint(0, 2)

# Keep only the rows whose year leaves remainder r when divided by three.
# Take an explicit copy so that later column assignments on workdf are
# unambiguous and cannot raise chained-assignment warnings.
workdf = df_tot[df_tot['Year'] % 3 == r].copy()

# Split the columns into labels and features
labels = ['Brand', 'Model', 'Year', 'Category', 'Rating']
features = [col for col in workdf.columns if col not in labels]

# Pick two distinct feature columns at random and remove them from workdf
columns_to_remove = random.sample(features, 2)
workdf = workdf.drop(columns=columns_to_remove)

# Keep `features` in sync with workdf: without this the list would still
# name the two columns that were just dropped.
features = [col for col in features if col not in columns_to_remove]




In [10]:
# Display the column labels of the full dataset (df_tot)
df_tot.columns

Index(['Brand', 'Model', 'Year', 'Category', 'Rating', 'Displacement (ccm)',
       'Power (hp)', 'Torque (Nm)', 'Engine cylinder', 'Engine stroke',
       'Gearbox', 'Bore (mm)', 'Stroke (mm)', 'Fuel capacity (lts)',
       'Fuel control', 'Cooling system', 'Transmission type',
       'Dry weight (kg)', 'Wheelbase (mm)', 'Seat height (mm)', 'Fuel system',
       'Front brakes', 'Rear brakes', 'Front tire', 'Rear tire',
       'Front suspension', 'Rear suspension'],
      dtype='object')

In [23]:
# Per-column number of missing entries in workdf
missing_values_count = workdf.isna().sum()
# Per-column number of distinct values in workdf
distinct_values_count = workdf.nunique()

# Print both summaries, separated by a dashed rule
print(
    missing_values_count,
    "---------------------------",
    distinct_values_count,
    sep="\n",
)


Brand                  0
Model                  0
Year                   0
Category               0
Rating                 0
Displacement (ccm)     0
Power (hp)             0
Torque (Nm)            0
Engine cylinder        0
Engine stroke          0
Gearbox                0
Bore (mm)              0
Fuel capacity (lts)    0
Fuel control           0
Cooling system         0
Transmission type      0
Dry weight (kg)        0
Wheelbase (mm)         0
Seat height (mm)       0
Fuel system            0
Front brakes           0
Front tire             0
Rear tire              0
Front suspension       0
Rear suspension        0
dtype: int64
---------------------------
Brand                   61
Model                  979
Year                    18
Category                15
Rating                  23
Displacement (ccm)     287
Power (hp)             339
Torque (Nm)            297
Engine cylinder         11
Engine stroke            3
Gearbox                  7
Bore (mm)              114
Fuel capac

In [22]:
# Identify the object-dtype (categorical/text) columns and the numeric columns.
# 'number' matches every numeric dtype rather than only int64/float64, so a
# numeric column stored with another width is not silently skipped.
categorical_columns = workdf.select_dtypes(include=['object']).columns
numeric_columns = workdf.select_dtypes(include='number').columns

# Fill missing categorical values with the placeholder "Unknown", then drop
# every row that is still missing a value in a numeric column.
# Both steps return new frames, so nothing is assigned into the existing
# workdf object and the cell gives the same result when re-run.
workdf = (
    workdf
    .fillna({col: "Unknown" for col in categorical_columns})
    .dropna(subset=numeric_columns)
)


In [14]:
# Display the labels of the columns selected as categorical (object dtype)
categorical_columns


Index(['Brand', 'Model', 'Category', 'Engine cylinder', 'Engine stroke',
       'Gearbox', 'Fuel control', 'Cooling system', 'Transmission type',
       'Fuel system', 'Front brakes', 'Front tire', 'Rear tire',
       'Front suspension', 'Rear suspension'],
      dtype='object')

In [17]:
# Number of distinct values in each categorical column of workdf
distinct_values_count = workdf.loc[:, categorical_columns].nunique()
print(distinct_values_count)

Brand                 450
Model                9156
Category               18
Engine cylinder        22
Engine stroke          10
Gearbox                14
Fuel control           15
Cooling system          4
Transmission type       4
Fuel system             8
Front brakes           80
Front tire             10
Rear tire               6
Front suspension        7
Rear suspension        11
dtype: int64


## 3. Exercise 2 (Encoding of Categorical Data)

In [24]:
# Preview the first five rows of the working DataFrame
workdf.head(n=5)


Unnamed: 0,Brand,Model,Year,Category,Rating,Displacement (ccm),Power (hp),Torque (Nm),Engine cylinder,Engine stroke,...,Transmission type,Dry weight (kg),Wheelbase (mm),Seat height (mm),Fuel system,Front brakes,Front tire,Rear tire,Front suspension,Rear suspension
47,adiva,ad3 400,2018,Scooter,3.3,399.0,36.7,38.1,Single cylinder,four-stroke,...,Not Given/Unknown,257.0,1560.0,770.0,injection,single disc,other,other,other,other
184,aeon,cobra 220,2009,ATV,3.1,196.0,12.1,13.6,Single cylinder,four-stroke,...,Chain,193.0,1155.0,790.0,carburettor. other,double disc,other,other,other,other
187,aeon,cobra 320,2009,ATV,2.7,272.3,19.4,19.0,Single cylinder,four-stroke,...,Chain,211.0,1300.0,790.0,carburettor. other,double disc,other,other,other,other
189,aeon,cobra 400,2012,ATV,3.4,346.0,20.1,30.0,Single cylinder,four-stroke,...,Not Given/Unknown,207.0,1300.0,790.0,carburettor. other,double disc,other,other,other,other
193,aeon,cobra 50,2009,ATV,3.5,49.0,3.6,4.5,Single cylinder,two-stroke,...,Shaft drive,130.0,1050.0,800.0,carburettor. other,expanding brake (drum brake),other,other,other,other
