In [None]:
#Let's start by creating a sample dataset for superbikes manufacturing, followed by performing machine learning tasks, including applying the K-Nearest Neighbors (KNN) algorithm.

Step 1: Create a Sample Dataset
We'll create a dataset with the following columns:

Manufacturer: The brand of the superbike.
Model: The model name of the superbike.
Engine Capacity (cc): The engine displacement in cubic centimeters.
Top Speed (km/h): The top speed of the superbike.
Price (USD): The price of the superbike.
Category: Classifying the superbike as 'Budget', 'Mid-Range', or 'Premium'.
Step 2: Perform Data Observation and Preprocessing
Step 3: Apply KNN Algorithm
Step 4: Explain Each Cell
I'll walk you through the code, explaining each step in detail. Let's start coding!

In [3]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
#pandas and numpy: Libraries used for data manipulation.


In [4]:
# Step 2: Create the Sample Dataset
data = {
    'Manufacturer': ['Yamaha', 'Honda', 'Ducati', 'Kawasaki', 'BMW', 'Suzuki', 'Harley-Davidson', 'KTM', 'Triumph', 'Aprilia'],
    'Model': ['YZF-R1', 'CBR1000RR', 'Panigale V4', 'Ninja H2', 'S1000RR', 'GSX-R1000', 'Street Glide', 'RC 390', 'Daytona 675', 'RSV4'],
    'Engine Capacity (cc)': [998, 999, 1103, 998, 999, 999, 1868, 373, 675, 1078],
    'Top Speed (km/h)': [299, 299, 299, 400, 299, 300, 177, 167, 265, 305],
    'Price (USD)': [17399, 16199, 21995, 29000, 16995, 15499, 21999, 5599, 13200, 24999],
    'Category': ['Premium', 'Mid-Range', 'Premium', 'Premium', 'Premium', 'Mid-Range', 'Premium', 'Budget', 'Mid-Range', 'Premium']
}

df = pd.DataFrame(data)
print(df)


#Sample Dataset: We created a DataFrame with hypothetical data for superbikes from different manufacturers, including specifications like engine capacity, top speed, price, and category.

      Manufacturer         Model  Engine Capacity (cc)  Top Speed (km/h)  \
0           Yamaha        YZF-R1                   998               299   
1            Honda     CBR1000RR                   999               299   
2           Ducati   Panigale V4                  1103               299   
3         Kawasaki      Ninja H2                   998               400   
4              BMW       S1000RR                   999               299   
5           Suzuki     GSX-R1000                   999               300   
6  Harley-Davidson  Street Glide                  1868               177   
7              KTM        RC 390                   373               167   
8          Triumph   Daytona 675                   675               265   
9          Aprilia          RSV4                  1078               305   

   Price (USD)   Category  
0        17399    Premium  
1        16199  Mid-Range  
2        21995    Premium  
3        29000    Premium  
4        16995    Premi

In [11]:
# Step 3: Data Observation
print("Dataset Overview:\n", df.describe())
print("\nDataset Info:\n")
df.info()
#df.describe(): Gives a summary of statistics for numerical columns.
#df.info(): Provides information about the dataset, including data types and non-null counts.


Dataset Overview:
        Engine Capacity (cc)  Top Speed (km/h)   Price (USD)  Category
count             10.000000         10.000000     10.000000       0.0
mean            1009.000000        281.000000  18288.400000       NaN
std              375.362343         67.050561   6572.678385       NaN
min              373.000000        167.000000   5599.000000       NaN
25%              998.000000        273.500000  15674.000000       NaN
50%              999.000000        299.000000  17197.000000       NaN
75%             1058.250000        299.750000  21998.000000       NaN
max             1868.000000        400.000000  29000.000000       NaN

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Manufacturer          10 non-null     object 
 1   Model                 10 non-null     object 
 2   Engine Capacity (cc)  10 

In [14]:
# Step 4: Preprocessing the Data
# Encoding categorical data
df['Category'] = df['Category'].map({'Budget': 0, 'Mid-Range': 1, 'Premium': 2})

# Features and Labels
X = df[['Engine Capacity (cc)', 'Top Speed (km/h)', 'Price (USD)']]
y = df['Category']

# Splitting the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the Data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Encoding: The 'Category' column is encoded numerically for machine learning.
#Features and Labels: Features (X) are selected as engine capacity, top speed, and price. Labels (y) are the categories.
#Data Splitting: The dataset is split into training and testing sets.
#Standardization: The features are standardized to ensure they have a mean of 0 and variance of 1.

In [27]:
# Step 5: Handling Missing Values in the Target Variable

# Check for missing values in the target variable
print("Checking for missing values in y_train:", y_train.isnull().sum())

# If there are missing values, drop them (or handle them accordingly)
y_train = y_train.dropna()

# Ensure X_train matches the length of y_train after dropping any rows with missing y values
X_train = X_train[:len(y_train)]

# Implementing KNN after handling missing values
knn = KNeighborsClassifier(n_neighbors=3)  # Initialize KNN with 3 neighbors
knn.fit(X_train, y_train)  # Train the model with the training data

# Check if the model was trained successfully by printing a confirmation
print("KNN model trained successfully!")


Checking for missing values in y_train: 8


ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required by KNeighborsClassifier.

## 

In [28]:
# Step 6: Predicting and Evaluating the Model
y_pred = knn.predict(X_test)

# Accuracy Score
print("The accuracy score is : ", accuracy_score(y_test, y_pred))

# Classification Report
print(classification_report(y_test, y_pred))


NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.