In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used in
# later cells live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the motorbike listings from the mounted Drive into the working frame.
csv_path = '/content/drive/MyDrive/Bike Price Prediction(bikroy.com-NSU)/Data/motorbike_data_final.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Summarise the frame's columns, dtypes and non-null counts.
df.info()

In [None]:
# Price spread per brand, drawn on an explicitly sized square canvas.
fig_dims = (10, 10)
fig, ax = plt.subplots(figsize=fig_dims)
sns.boxplot(data=df, x="Brand", y="Price", ax=ax)

In [None]:
# Create a function to bin years into 5-year intervals
def year_group(year):
    """Map a manufactured year to the label of its 5-year bin.

    The value is coerced with ``float``, clamped to the 1970-2024 range and
    assigned to the bin that starts at a multiple of five, e.g.
    2018 -> "2015-2019".

    Args:
        year: Anything ``float()`` accepts (a number or a numeric string).

    Returns:
        A "start-end" label, or 'Unknown' when the value is missing (NaN) or
        cannot be converted to a number.
    """
    import math  # local import keeps the helper self-contained

    # Convert to numeric, handling any potential non-numeric values
    try:
        year = float(year)
    except (ValueError, TypeError):
        return 'Unknown'

    # NaN has to be rejected before clamping: every comparison with NaN is
    # False, so max(1970, min(nan, 2024)) evaluates to 1970 and a missing year
    # would silently be counted in the "1970-1974" bin.
    if math.isnan(year):
        return 'Unknown'

    # Ensure years are between 1970 and 2024
    year = max(1970, min(year, 2024))
    start = int(year // 5) * 5
    return f"{start}-{start + 4}"

# Derive the 5-year bucket on a separate frame so df itself is left untouched.
dfm = df.assign(Year_Group=df['Manufactured_year'].apply(year_group))


def _unknown_last(label):
    """Sort key: real year ranges in text order, 'Unknown' after all of them."""
    return (label == 'Unknown', label)


# Plot
plt.figure(figsize=(15, 7))
sorted_groups = sorted(dfm['Year_Group'].unique(), key=_unknown_last)

sns.boxplot(data=dfm, x="Year_Group", y="Price", order=sorted_groups)

plt.title('Bike Prices by 5-Year Manufactured Year Intervals (1970-2024)', fontsize=16)
plt.xlabel('Manufactured Year Group', fontsize=16)
plt.ylabel('Price', fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Number of listings per brand, most frequent first.
class_distribution = df['Brand'].value_counts()
print(class_distribution)

# Bar chart of the same counts on a 10x6 figure.
plt.figure(figsize=(10, 6))
bar_ax = class_distribution.plot.bar()
bar_ax.set_xlabel('Brand')
bar_ax.set_ylabel('Count')
bar_ax.set_title('Class Distribution')
plt.xticks(rotation=45)  # slant the brand names so they stay legible
plt.show()



```
# This is formatted as code
```

## Label encoding


In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns whose values are replaced in place by integer codes.
categorical_columns = ['Brand', 'Model', 'Type']

# List of categorical columns to be encoded
cate_to_num = categorical_columns

# Keep one fitted encoder per column. Refitting a single LabelEncoder in the
# loop overwrites its classes_ each time, so only the last column's mapping
# would survive and the Brand/Model codes could never be decoded again
# (label_encoders['Brand'].classes_ gives the code -> brand name lookup).
label_encoders = {}
for col in cate_to_num:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc

In [None]:
# Preview the frame again now that Brand, Model and Type hold integer codes.
df.head()

## Outlier removal

In [None]:
# Box plot of the frame's columns, used to eyeball outliers before capping.
df.boxplot(figsize=(14, 5))
# Title previously read "Outliner"; the plot is about outliers.
plt.title(label='Outlier Distribution',
          fontsize=25,
          color="Red")

In [None]:
# Cap Distance(km) at the 1.5 * IQR fences instead of dropping rows.
distance = df['Distance(km)']
Q1, Q3 = distance.quantile(0.25), distance.quantile(0.75)
IQR = Q3 - Q1
upper_limit = Q3 + 1.5 * IQR
# The lower fence is never allowed below the smallest observed distance.
lower_limit = max(Q1 - 1.5 * IQR, distance.min())

# Values above the upper fence take priority; otherwise raise anything below
# the lower fence, and leave everything else unchanged.
floored = np.where(distance < lower_limit, lower_limit, distance)
df['Distance(km)'] = np.where(distance > upper_limit, upper_limit, floored)

In [None]:
# Cap Price at the 1.5 * IQR fences instead of dropping rows.
Q1 = df['Price'].quantile(0.25)
Q3 = df['Price'].quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
# Ensure that the lower limit is not less than the minimum value in the data.
# This must be the Price minimum: flooring with the Distance(km) minimum mixes
# units and could raise genuine low prices up to an unrelated mileage value.
lower_limit = max(lower_limit, df['Price'].min())
upper_limit = Q3 + 1.5 * IQR
df['Price'] = np.where(
    df['Price'] > upper_limit,
    upper_limit,
    np.where(
        df['Price'] < lower_limit,
        lower_limit,
        df['Price']
    )
)

In [None]:
# Cap Engine(cc) at the 1.5 * IQR fences instead of dropping rows.
engine_cc = df['Engine(cc)']
Q1, Q3 = engine_cc.quantile(0.25), engine_cc.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Values above the upper fence take priority; otherwise raise anything below
# the lower fence, and leave everything else unchanged.
floored = np.where(engine_cc < lower_limit, lower_limit, engine_cc)
df['Engine(cc)'] = np.where(engine_cc > upper_limit, upper_limit, floored)

In [None]:
# Same box plot again, now that Distance(km), Price and Engine(cc) are capped.
df.boxplot(figsize=(14, 5))
# Title previously read "Outliner"; the plot is about outliers.
plt.title(label='Outlier Distribution',
          fontsize=25,
          color="Red")

In [None]:
# Predictors fed to every model below; the column order is the order expected
# by the hand-written predict([[...]]) spot checks.
selected_features = ['Brand', 'Manufactured_year', 'Distance(km)', 'Engine(cc)']

# Feature matrix and regression target.
x, y = df[selected_features], df["Price"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

## ML Model/Algorithm Implementations

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.model_selection import GridSearchCV

#### Create and Train the Linear Regression Model


In [None]:
# Baseline model: linear regression fitted on the training split.
lr = LinearRegression().fit(x_train, y_train)
predictions = lr.predict(x_test)

# Hold-out metrics; R2 is reported as a percentage.
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = 100 * r2_score(y_test, predictions)

for template, value in (
    ('MAE (Mean Absolute Error): %s', mae),
    ('MSE (Mean Squared Error): %s', mse),
    ('RMSE (Root mean squared error): %s', rmse),
    ('R2 score: %s', r2),
):
    print(template % value)

#### Create and Train the Decision Tree Regression Model



In [None]:
# Define the hyperparameters and their possible values.
# max_features: None (consider every feature) replaces the former 'auto'
# option, which meant the same thing for regression trees but is deprecated /
# rejected by current scikit-learn. With warnings globally silenced, those
# failed candidates would otherwise drop out of the search without notice.
param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Create the Decision Tree Regressor
regressor = DecisionTreeRegressor(random_state=0)

# Create a GridSearchCV object (5-fold CV, lower MSE is better)
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the grid search to your training data
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model. GridSearchCV refits the winning configuration on the
# whole training split by default, so no additional fit call is needed.
best_regressor = grid_search.best_estimator_

# Now you can use the best model for predictions
y_pred_tr = best_regressor.predict(x_test)

In [None]:
# Hold-out metrics for the tuned decision tree; R2 is reported as a percentage.
mse = mean_squared_error(y_test, y_pred_tr)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_tr)
r2 = 100 * r2_score(y_test, y_pred_tr)

print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

In [None]:
# Spot check on one hand-written bike, in selected_features order:
# Brand code, Manufactured_year, Distance(km), Engine(cc).
sample_bike = [[0, 2018, 20000, 150]]
y_pred1 = best_regressor.predict(sample_bike)
y_pred1

#### Create and Train the Random Forest Regression Model


In [None]:
# Random forest with 100 trees and a fixed seed.
rf_model = RandomForestRegressor(n_estimators=100, random_state=0).fit(x_train, y_train)
y_pred_rf = rf_model.predict(x_test)

# Hold-out metrics; R2 is reported as a percentage.
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_rf)
r2 = 100 * r2_score(y_test, y_pred_rf)

print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

In [None]:
# Spot check on one hand-written bike, in selected_features order:
# Brand code, Manufactured_year, Distance(km), Engine(cc).
sample_bike = [[0, 2018, 20000, 150]]
y_pred1 = rf_model.predict(sample_bike)
y_pred1

### Create and Train the KNeighbors Regression Model

In [None]:
# Candidate neighbourhood sizes: k = 3 .. 19.
param_grid = {'n_neighbors': range(3, 20)}

# NOTE(review): the features are passed to KNN unscaled here, so distances are
# presumably dominated by the largest-magnitude column — consider standardizing.
knn_regressor = KNeighborsRegressor()

# 5-fold cross-validated search, scored by (negated) mean squared error.
grid_search = GridSearchCV(
    knn_regressor,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(x_train, y_train)

# Pull out the winning k and report it.
best_k = grid_search.best_params_['n_neighbors']
print(f"The best value of k is {best_k}")

# Train a fresh regressor with that k on the full training split.
best_knn_regressor = KNeighborsRegressor(n_neighbors=best_k).fit(x_train, y_train)

# Hold-out predictions used by the metrics cell.
y_pred_knn = best_knn_regressor.predict(x_test)

In [None]:
# Hold-out metrics for the tuned KNN model; R2 is reported as a percentage.
mae = mean_absolute_error(y_test, y_pred_knn)
mse = mean_squared_error(y_test, y_pred_knn)
rmse = np.sqrt(mse)
r2 = 100 * r2_score(y_test, y_pred_knn)

print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

In [None]:
import pickle

# Destination on the mounted Drive for the serialized KNN model.
model_filename = "/content/drive/MyDrive/Bike Price Prediction(bikroy.com-NSU)/Model/knn_model.pkl"

# Write the fitted regressor to disk; the context manager closes the handle
# even if pickling fails.
with open(model_filename, mode='wb') as model_file:
    pickle.dump(best_knn_regressor, model_file)


In [None]:
# Spot check on one hand-written bike, in selected_features order:
# Brand code, Manufactured_year, Distance(km), Engine(cc).
sample_bike = [[0, 2017, 46000, 150]]
y_pred1 = best_knn_regressor.predict(sample_bike)
y_pred1

### Create and Train the Gradient Boosting Regression model

In [None]:
# Gradient boosting: 100 stages, learning rate 0.1, trees up to depth 7, fixed seed.
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
).fit(x_train, y_train)
y_pred_gb = gb_regressor.predict(x_test)

# Hold-out metrics; R2 is reported as a percentage.
mae = mean_absolute_error(y_test, y_pred_gb)
mse = mean_squared_error(y_test, y_pred_gb)
rmse = np.sqrt(mse)
r2 = 100 * r2_score(y_test, y_pred_gb)

print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

# Create and Train the SVM Regression Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import numpy as np

# Standardize the features for SVR. The scaler is fitted on the training split
# only and then reused for the test split and the example input.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Search space: 3 kernels x 4 values of C x 4 values of epsilon.
param_grid = dict(
    kernel=['rbf', 'linear', 'poly'],
    C=[0.1, 1, 10, 100],
    epsilon=[0.01, 0.1, 0.5, 1],
)

# 5-fold cross-validated search over all cores, scored by (negated) MSE.
svr = SVR()
grid_search = GridSearchCV(
    svr,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train_scaled, y_train)

# Winning configuration, already refitted on the whole training split.
best_svr = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Hold-out predictions and metrics; R2 is reported as a percentage.
y_pred_svr = best_svr.predict(x_test_scaled)
mse = mean_squared_error(y_test, y_pred_svr)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_svr)
r2 = 100 * r2_score(y_test, y_pred_svr)

print("\nSupport Vector Regression Results:")
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"Root Mean Squared Error: {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

# Example prediction: the raw input must go through the same scaler as the
# training data before it reaches the model.
example_input = [[0, 2018, 20000, 150]]
example_input_scaled = scaler.transform(example_input)
example_prediction = best_svr.predict(example_input_scaled)
print("\nExample Prediction:")
print(f"Predicted Price for the given input: {example_prediction[0]}")

# Importance of Features in Decision Tree and Gradient Boosting


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Feature set for the importance study: the four modelling features plus
# 'Model' ('Type' is still left out). The column order here is the order of
# the feature_importances_ arrays read below.
feature_names = ['Brand', 'Model', 'Manufactured_year', 'Distance(km)', 'Engine(cc)']
selected_features = feature_names
x = df[selected_features]

# NOTE: this rebinds x, x_train, x_test, y_train and y_test, as well as
# best_regressor and gb_regressor, to the five-feature versions.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)

# Decision tree rebuilt with the hyperparameters found by the earlier grid search.
best_regressor = DecisionTreeRegressor(**best_params, random_state=0).fit(x_train, y_train)

# Gradient boosting with the same settings as before.
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
).fit(x_train, y_train)

dt_importances = best_regressor.feature_importances_
gb_importances = gb_regressor.feature_importances_

# For each model: print the per-feature percentages, then draw the bar chart.
for heading, plot_title, importances in (
    ("Decision Tree Feature Importance:",
     "Decision Tree - Feature Importances",
     dt_importances),
    ("\nGradient Boosting Feature Importance:",
     "Gradient Boosting - Feature Importances",
     gb_importances),
):
    print(heading)
    for name, importance in zip(feature_names, importances):
        print(f"{name}: {importance * 100:.2f}%")

    plt.figure(figsize=(10,5))
    plt.title(plot_title)
    plt.bar(feature_names, importances)
    plt.xticks(rotation=45)
    plt.xlabel("Features")
    plt.ylabel("Importance")
    plt.tight_layout()
    plt.show()

# Side-by-side table of both models' importances, in percent.
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Decision Tree Importance': dt_importances*100,
    'Gradient Boosting Importance': gb_importances*100
})
print("\nComparative Feature Importance:")
print(importance_df)