**Introduction:** In CS82A, we were tasked with obtaining a dataset of interest from Kaggle and utilizing the skills we had learned to gain insights into our data. In my first outing, I built linear regression models to showcase performance changes over time and relationships between features using various visualizations. In my second outing with this dataset, I focus on the following areas: 

1. **A more in-depth data cleaning & preprocessing stage:** Address null values with means, medians and modes, standardize categories where necessary, transform data, and perform unit conversions. 
2. **Perform Pandas profiling to explore our data**
3. **Build a classification model** How well can a classification model distinguish between coupes, cabriolets, and targas?
4. **K Means clustering Machine Learning Model to predict body type (Coupe, Cabriolet, and Targa)**
5. **Regression Machine Learning model to predict top speeds**

# In-depth data cleaning & preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import the original Kaggle data; 'porsche_911.csv' is read relative to the working directory.
Kaggle_data = pd.read_csv('porsche_911.csv')
# Uncomment to list the available column names:
#Kaggle_data.columns

# Data cleaning & PreProcessing

In [None]:
#Data Cleaning
#
# Every fill below assigns the result back to the column instead of calling
# `Kaggle_data[col].method(..., inplace=True)`. The chained in-place form is deprecated
# in pandas 2.x and does not modify the DataFrame under copy-on-write.

#Get rid of white spacing
Kaggle_data['body_type'] = Kaggle_data['body_type'].str.strip()

# Body type dictionary - need to have standardized body type classes for K Means clustering ML
body_typedict = {'Off-road vehicle, Coupe': 'Coupe', 'Roadster': 'Coupe'}
Kaggle_data['body_type'] = Kaggle_data['body_type'].replace(body_typedict)

# Acceleration 0-62mph - eliminate ranges of data and use the midpoint of the range,
# then fill anything that is still not a number with the column median
Acceleration_dict = {'4.0-4.2': '4.1', '3.5-3.7': '3.6', '4.4-4.6': '4.5', '4.5-4.7': '4.6', '3.4-3.6': '3.5'}
Kaggle_data['acceleration_0-62mph'] = pd.to_numeric(
    Kaggle_data['acceleration_0-62mph'].replace(Acceleration_dict), errors='coerce')
median_value = Kaggle_data['acceleration_0-62mph'].median()
Kaggle_data['acceleration_0-62mph'] = Kaggle_data['acceleration_0-62mph'].fillna(median_value).astype(float)

#Engine oil capacity - median
Kaggle_data['engine_oil_capacity'] = Kaggle_data['engine_oil_capacity'].fillna(
    Kaggle_data['engine_oil_capacity'].median())

# Kerb weight - replace range of kerb weight to average of the top and bottom ends of the range
# Kerb weight nulls - replace kerb weight nulls with median
Kerb_weightdict = {'1140-1195': '1168'}
Kaggle_data['kerb_weight'] = pd.to_numeric(
    Kaggle_data['kerb_weight'].replace(Kerb_weightdict), errors='coerce')
medweight = Kaggle_data['kerb_weight'].median()
Kaggle_data['kerb_weight'] = Kaggle_data['kerb_weight'].fillna(medweight)

# Width - replace range of width to average of top and bottom ends of range, fill nulls with mode.
# `Series.mode()` returns a Series (there can be ties); passing it straight to `fillna`
# aligns on the index and leaves most nulls untouched, so the first mode is taken as a scalar.
widthdict = {'2042-2048': '2045'}
Kaggle_data['width'] = pd.to_numeric(Kaggle_data['width'].replace(widthdict), errors='coerce')
modewidth = Kaggle_data['width'].mode().iloc[0]
Kaggle_data['width'] = Kaggle_data['width'].fillna(modewidth)

# Height - replace range of height to average of top and bottom ends of range.
# Convert to numbers BEFORE taking the median: the column still holds strings at this point.
heightdict = {'1304-1320': '1312'}
Kaggle_data['height'] = pd.to_numeric(Kaggle_data['height'].replace(heightdict), errors='coerce')
medHeight = Kaggle_data['height'].median()
Kaggle_data['height'] = Kaggle_data['height'].fillna(medHeight).astype(float)

# Front track - replace range of front track to average of top and bottom ends of range
# (remaining nulls are handled by the KNN imputer further down)
ftrack = {'1372-1398': '1385'}
Kaggle_data['front_track'] = Kaggle_data['front_track'].replace(ftrack).astype(float)

# Rear track - replace range of rear track to average of top and bottom ends of range
rtrack = {'1357-1408': '1383'}
Kaggle_data['rear_track'] = Kaggle_data['rear_track'].replace(rtrack).astype(float)

#maximum_speed 12 - median
MaxspeedMed = Kaggle_data['maximum_speed'].median()
Kaggle_data['maximum_speed'] = Kaggle_data['maximum_speed'].fillna(MaxspeedMed)

#power_per_litre 2 - mode
ModePPL = Kaggle_data['power_per_litre'].mode().iloc[0]
Kaggle_data['power_per_litre'] = Kaggle_data['power_per_litre'].fillna(ModePPL)

#torque 4 - mode
TorqueMode = Kaggle_data['torque_NM'].mode().iloc[0]
Kaggle_data['torque_NM'] = Kaggle_data['torque_NM'].fillna(TorqueMode)

#engine_displacement 2 - mode
engdisp = Kaggle_data['engine_displacement'].mode().iloc[0]
Kaggle_data['engine_displacement'] = Kaggle_data['engine_displacement'].fillna(engdisp)

#cylinder_bore - 71 - mean
cylbore = Kaggle_data['cylinder_bore'].mean()
Kaggle_data['cylinder_bore'] = Kaggle_data['cylinder_bore'].fillna(cylbore)

#piston_stroke 71 - mean
pstroke = Kaggle_data['piston_stroke'].mean()
Kaggle_data['piston_stroke'] = Kaggle_data['piston_stroke'].fillna(pstroke)

#fuel_tank_capacity 32 - heavily skewed - use median
FTCMed = Kaggle_data['fuel_tank_capacity'].median()
Kaggle_data['fuel_tank_capacity'] = Kaggle_data['fuel_tank_capacity'].fillna(FTCMed)

#drag_coefficient          116 - replace drag coefficient values with mode - Pandas profiling shows values center around 0.3
Dragcoeff = Kaggle_data['drag_coefficient'].mode().iloc[0]
Kaggle_data['drag_coefficient'] = Kaggle_data['drag_coefficient'].fillna(Dragcoeff)

#performance ratios - median
# ('torque_NM' was already mode-filled above, so it passes through this loop unchanged)
features_to_impute = ['weighttotorque_kgpernm', 'weighttotorque_nmpertonne', 'torque_NM', 'torque_RPM']

for feature in features_to_impute:
    Kaggle_data[feature] = Kaggle_data[feature].fillna(Kaggle_data[feature].median())

# Remaining gaps: estimate each missing value from the 5 most similar rows
from sklearn.impute import KNNImputer
othmissingnos = ['engine_oil_capacity', 'front_track', 'rear_track','drag_coefficient','compression_ratio',
                'WeighttoPower_kgperhp','WeighttoPower_hppertonne']
knn_imputer = KNNImputer(n_neighbors=5)
Kaggle_data[othmissingnos] = knn_imputer.fit_transform(Kaggle_data[othmissingnos])

# Integer-encode the body type for the clustering / regression cells.
# The lookup is case-insensitive: the labels standardized above are capitalised ('Coupe'),
# so a mapping keyed on 'coupe' / 'cabriolet' would silently produce NaN for those rows.
body_type_mapping = {'coupe': 0, 'cabriolet': 1, 'targa': 2}
Kaggle_data['body_type_encoded'] = Kaggle_data['body_type'].str.lower().map(body_type_mapping)

In [None]:
# Pandas Profiling
# Build an exploratory profile of the cleaned DataFrame; leaving `report` as the last
# expression of the cell lets the notebook render it.
# NOTE(review): newer releases of this package are published as `ydata_profiling` - confirm
# against the installed version before changing the import.

from pandas_profiling import ProfileReport
report = ProfileReport(Kaggle_data)
report

# Data transformation - conversions to imperial units 

Let's convert our units to imperial units so that we can better address an American audience. 

In [None]:
# Conversion constants. Read the trailing comment on each one: some are divisors
# (metric units per imperial unit), the others are multipliers.
kmhtomph = 1.609344  # km/h per mph  -> mph = km/h ÷ 1.609344
mmtoin = 25.4  # mm per inch  -> inches = millimeters ÷ 25.4
kgtolbs = 2.2046  # lbs per kg  -> lbs = kg × 2.2046
mettontotons = 1.102311  # short tons per metric tonne  -> tons = tonnes × 1.102311
nmtoftlbs = 0.7376  # ft-lbs per Nm  -> ft-lbs = Nm × 0.7376
ccmtocin = 0.061024  # cubic inches per cm^3  -> cu in = cm^3 × 0.061024

columns_to_convert = ['maximum_speed', 'wheelbase', 'WeighttoPower_kgperhp',
                      'WeighttoPower_hppertonne', 'weighttotorque_kgpernm',
                      'weighttotorque_nmpertonne', 'torque_NM', 'kerb_weight',
                      'length', 'width', 'height', 'front_track', 'rear_track',
                      'cylinder_bore', 'piston_stroke', 'engine_displacement']

# Make sure every source column is numeric; anything unparseable becomes NaN
for column in columns_to_convert:
    Kaggle_data[column] = pd.to_numeric(Kaggle_data[column], errors='coerce')


# Maximum speed (km/h to mph)
Kaggle_data['max_mph'] = Kaggle_data['maximum_speed'] / kmhtomph

# Wheelbase (mm to inches)
Kaggle_data['wheelb_in_inches'] = Kaggle_data['wheelbase'] / mmtoin

# Weight to power ratio conversion: kg/Hp to lbs/Hp and Hp/tonne to Hp/ton.
# The mass unit is in the DENOMINATOR of Hp/tonne, so the tonne -> ton factor divides:
# X Hp per tonne = X Hp per 1.102311 tons = X / 1.102311 Hp per ton.
Kaggle_data['WTPow_lbsperhp'] = Kaggle_data['WeighttoPower_kgperhp'] * kgtolbs
Kaggle_data['WTPow_hpperton'] = Kaggle_data['WeighttoPower_hppertonne'] / mettontotons

# Weight to torque ratio conversion: kg/Nm to lbs/ft-lbs and Nm/tonne to ft-lbs/ton.
# Same rule: whichever unit sits in the denominator has its factor applied as a divisor.
Kaggle_data['weighttotorque_lbsperftlbs'] = Kaggle_data['weighttotorque_kgpernm'] * kgtolbs / nmtoftlbs
Kaggle_data['weighttotorque_ftlbsperton'] = Kaggle_data['weighttotorque_nmpertonne'] * nmtoftlbs / mettontotons

# Torque (Nm to ft-lbs)
Kaggle_data['ftlbstorque'] = Kaggle_data['torque_NM'] * nmtoftlbs

# Kerb weight (kg to lbs)
Kaggle_data['kerb_weight_lbs'] = Kaggle_data['kerb_weight'] * kgtolbs

# Remaining linear dimensions (mm to inches): source column -> new column
mm_to_inch_columns = {
    'length': 'length_inin',
    'width': 'width_inin',
    'height': 'height_inin',
    'front_track': 'FT_inin',
    'rear_track': 'RT_inin',
    'cylinder_bore': 'cylbore_inin',
    'piston_stroke': 'pistrk_inin',
}
for metric_column, inch_column in mm_to_inch_columns.items():
    Kaggle_data[inch_column] = Kaggle_data[metric_column] / mmtoin

# Engine displacement (cm^3 to cubic inches)
Kaggle_data['engine_displacement_incin'] = Kaggle_data['engine_displacement'] * ccmtocin

**Visualize how weight-to-power and weight-to-torque ratios affect acceleration times and maximum speed** 




In [None]:
# Weight-to-torque ratio (kg/Nm) against 0-62 mph time, one line per body type
import seaborn as sns
sns.lineplot(
    data=Kaggle_data,
    x="weighttotorque_kgpernm",
    y="acceleration_0-62mph",
    hue="body_type",
)

In [None]:
# Weight-to-torque ratio (kg/Nm) against top speed, one line per body type
sns.lineplot(
    data=Kaggle_data,
    x="weighttotorque_kgpernm",
    y="maximum_speed",
    hue="body_type",
)

In [None]:
# Weight-to-power ratio (kg/hp) against 0-62 mph time, one line per body type
sns.lineplot(
    data=Kaggle_data,
    x="WeighttoPower_kgperhp",
    y="acceleration_0-62mph",
    hue="body_type",
)

In [None]:
# Weight-to-power ratio (kg/hp) against top speed, one line per body type
sns.lineplot(
    data=Kaggle_data,
    x="WeighttoPower_kgperhp",
    y="maximum_speed",
    hue="body_type",
)

In [None]:
# Weight-to-power ratio (kg/hp) against weight-to-torque ratio (kg/Nm), one line per body type
sns.lineplot(
    data=Kaggle_data,
    x="WeighttoPower_kgperhp",
    y="weighttotorque_kgpernm",
    hue="body_type",
)

In [None]:
# Side-by-side distributions of the two imperial ratios, split by body type
plt.figure(figsize=(14, 6))

# Left panel: weight-to-torque
plt.subplot(1, 2, 1)
sns.boxenplot(data=Kaggle_data, x='body_type', y='weighttotorque_lbsperftlbs')
plt.title('Distribution of Weight-to-Torque Ratio (lbs/ft-lbs) by Body Type')
plt.xlabel('Body Type')
plt.ylabel('Weight-to-Torque Ratio (lbs/ft-lbs)')
plt.xticks(rotation=45)

# Right panel: weight-to-power. The plotted column is WTPow_lbsperhp, so the unit in the
# title and axis label is lbs/hp (it previously read "ft-lbs/ton").
plt.subplot(1, 2, 2)
sns.boxenplot(data=Kaggle_data, x='body_type', y='WTPow_lbsperhp')
plt.title('Distribution of Weight-to-Power Ratio (lbs/hp) by Body Type')
plt.xlabel('Body Type')
plt.ylabel('Weight-to-Power Ratio (lbs/hp)')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Visualizations of relationships between factors

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Kaggle_data.replace([np.inf, -np.inf], np.nan, inplace=True)

Kaggle_data['kerb_weight_lbs'].fillna(Kaggle_data['kerb_weight_lbs'].median(), inplace=True)
Kaggle_data['drive_wheel'].fillna('Unknown', inplace=True)  
Kaggle_data['body_type'].fillna('Unknown', inplace=True)  

ax = sns.swarmplot(data=Kaggle_data, x="kerb_weight_lbs", y="drive_wheel", hue="body_type")
ax.set(xlabel="Kerb weight (in lbs)")
ax.set(ylabel="Drive Wheel")
plt.show()


In [None]:
# Pairwise correlations between the torque ratios, acceleration, top speed and kerb weight
heatmap_columns = [
    'weighttotorque_lbsperftlbs',
    'weighttotorque_ftlbsperton',
    'acceleration_0-62mph',
    'max_mph',
    'kerb_weight',
]
corr_matrix = Kaggle_data[heatmap_columns].corr()

plt.figure(figsize=(10, 8))
# Fix the colour scale to [-1, 1] so the diverging palette is centred on zero correlation
sns.heatmap(corr_matrix, annot=True, cmap='vlag', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Selected Features')
plt.show()

**Drive wheel vs. Kerb weight:** From this visualization, we can conclude that the Porsche 911 has primarily been a rear-wheel-drive coupe. What is interesting to note is that the distributions of rear-wheel-drive and all-wheel-drive models appear nearly identical between 2,900 lbs and 3,400 lbs across all body types and drive wheel formats. 

**Performance correlation heat map:** There are a few interesting correlations that I noticed from this visualization: kerb weight and max_mph are highly correlated, whereas acceleration time and kerb weight have a moderately inverse relationship. Acceleration time and max speed also appear to be inversely related. 

# Using K Means Clustering to predict body type 

Use Physical features to determine body type

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score


# Select features for machine learning - let's see how well our model can distinguish between 3 different body styles.
# NOTE(review): 'body_type_encoded' is the label the clusters are later compared against, yet
# it is also fed to KMeans as a feature here - consider excluding it and re-checking the results.
ML_Features = Kaggle_data[['start_of_production', 'body_type_encoded', 'acceleration_0-62mph',
                           'max_mph', 'power_per_litre', 'engine_displacement_incin', 'cylbore_inin',
                           'pistrk_inin', 'kerb_weight_lbs', 'fuel_tank_capacity', 'length_inin', 'width_inin',
                           'height_inin', 'wheelb_in_inches', 'drive_wheel', 'compression_ratio', 'engine_oil_capacity',
                           'FT_inin', 'RT_inin', 'drag_coefficient', 'WTPow_lbsperhp', 'WTPow_hpperton',
                           'weighttotorque_lbsperftlbs', 'weighttotorque_ftlbsperton', 'Power_HP', 'Power_RPM',
                           'ftlbstorque', 'torque_RPM']]

# KMeans cannot handle missing values: keep complete rows only
ML_Features = ML_Features.dropna()

# One-hot encoding 'drive_wheel' (categorical variable)
ML_Features = pd.get_dummies(ML_Features, columns=['drive_wheel'])

# Standardize so that no feature dominates the Euclidean distance because of its unit
scaler = StandardScaler()
scaled_MLFeat = scaler.fit_transform(ML_Features)

# Sweep k and record three quality metrics per k
k_values = range(2, 11)
wcss = []
silhouette_scores = []
calinski_scores = []

for k in k_values:
    # n_init is pinned: scikit-learn changed its default (10 -> 'auto'), so leaving it
    # implicit gives version-dependent clusters and a FutureWarning on older releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_MLFeat)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares
    silhouette_scores.append(silhouette_score(scaled_MLFeat, kmeans.labels_))
    calinski_scores.append(calinski_harabasz_score(scaled_MLFeat, kmeans.labels_))

In [None]:
# Three panels, one per clustering metric, each plotted against the number of clusters
metric_panels = [
    (wcss, 'WCSS', 'WCSS vs. Number of clusters'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs. Number of clusters'),
    (calinski_scores, 'Calinski-Harabasz Score', 'Calinski-Harabasz Score vs. Number of clusters'),
]

plt.figure(figsize=(15, 5))

for panel_position, (metric_values, metric_label, panel_title) in enumerate(metric_panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.plot(k_values, metric_values, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel(metric_label)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

**Interpretation of clustering metrics** - without eliminating features

1. **WCSS vs. Number of clusters:** With 10 clusters, we have the lowest residuals from the center value. 
2. **Silhouette score vs. Number of clusters:** The best formed clusters are with 4 clusters. Possibly being indicative of the model separating between the early years of the 911 and more recent years.  
3. **Calinski-Harabasz score vs. Number of clusters:** Four clusters produce far tighter clusters than three clusters.

In [None]:
# Principal component analysis: check how much of the variance a handful of components carries.

from sklearn.decomposition import PCA

# PCA can return at most min(n_samples, n_features) components
row_count, column_count = ML_Features.shape
n_components = min(row_count, column_count)

pca = PCA(n_components=n_components).fit(scaled_MLFeat)

# Running total of the variance explained as components are added
running_variance = pca.explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 6))
plt.plot(running_variance, marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA - Explained Variance')
plt.show()

# Loadings: one row per component, one column per original feature
pca_components = pd.DataFrame(pca.components_, columns=ML_Features.columns)
pca_components.head()

In [None]:
# Keep the smallest number of components that explains at least 90% of the variance
variance_threshold = 0.90
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# np.where gives zero-based positions, hence the + 1 to turn the first hit into a count
num_components = np.where(cumulative_variance >= variance_threshold)[0][0] + 1

print(f"Number of components explaining {variance_threshold*100}% variance: {num_components}")

# Re-fit PCA with that many components and project the scaled features onto them
pca = PCA(n_components=num_components)
reduced_features = pca.fit_transform(scaled_MLFeat)

# Repeat the k sweep on the reduced feature space
k_values = range(2, 11)
wcss = []
silhouette_scores = []
calinski_scores = []

for k in k_values:
    # n_init is pinned: scikit-learn changed its default (10 -> 'auto'), so leaving it
    # implicit gives version-dependent clusters and a FutureWarning on older releases.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(reduced_features)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(reduced_features, kmeans.labels_))
    calinski_scores.append(calinski_harabasz_score(reduced_features, kmeans.labels_))

# One panel per metric, same layout as the sweep on the full feature set
plt.figure(figsize=(15, 5))

reduced_metric_panels = [
    (wcss, 'WCSS', 'WCSS vs. Number of clusters'),
    (silhouette_scores, 'Silhouette Score', 'Silhouette Score vs. Number of clusters'),
    (calinski_scores, 'Calinski-Harabasz Score', 'Calinski-Harabasz Score vs. Number of clusters'),
]
for panel_position, (metric_values, metric_label, panel_title) in enumerate(reduced_metric_panels, start=1):
    plt.subplot(1, 3, panel_position)
    plt.plot(k_values, metric_values, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel(metric_label)
    plt.title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# comparing 3 and 4 clusters

def plot_clusters(data, labels, title):
    """Scatter the first two columns of *data*, coloured by cluster label.

    data   -- 2-D array; column 0 is drawn on x and column 1 on y
    labels -- one value per row of *data*, used for the point colours
    title  -- text shown above the plot
    """
    first_axis, second_axis = data[:, 0], data[:, 1]

    plt.figure(figsize=(10, 6))
    points = plt.scatter(first_axis, second_axis, c=labels, cmap='viridis', marker='o')
    plt.colorbar(points)  # colour scale for the scatter just drawn
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title(title)
    plt.show()

# Clusters are fitted on the full scaled feature set and drawn on the PCA projection
# (plot_clusters uses the first two PCA components as axes).
# n_init is pinned because scikit-learn changed its default (10 -> 'auto'); leaving it
# implicit gives version-dependent clusters and a FutureWarning on older releases.
kmeans_3 = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_3 = kmeans_3.fit_predict(scaled_MLFeat)
plot_clusters(reduced_features, labels_3, 'Clusters with k=3')

kmeans_4 = KMeans(n_clusters=4, n_init=10, random_state=42)
labels_4 = kmeans_4.fit_predict(scaled_MLFeat)
plot_clusters(reduced_features, labels_4, 'Clusters with k=4')

# Using Linear Regression to predict top speed


In [None]:
# Columns considered for the top-speed regression (the target 'max_mph' is included)
lr_feature_columns = [
    'start_of_production', 'body_type_encoded', 'acceleration_0-62mph',
    'max_mph', 'power_per_litre', 'engine_displacement_incin', 'cylbore_inin',
    'pistrk_inin', 'kerb_weight_lbs', 'fuel_tank_capacity', 'length_inin', 'width_inin',
    'height_inin', 'wheelb_in_inches', 'drive_wheel', 'compression_ratio', 'engine_oil_capacity',
    'FT_inin', 'RT_inin', 'drag_coefficient', 'WTPow_lbsperhp', 'WTPow_hpperton',
    'weighttotorque_lbsperftlbs', 'weighttotorque_ftlbsperton', 'Power_HP', 'Power_RPM',
    'ftlbstorque', 'torque_RPM',
]
LR_Features = Kaggle_data[lr_feature_columns]

# Inspect the data type of every selected column
LR_Features.dtypes

In [None]:
# Summary statistics for the regression feature set
LR_Features.describe()

In [None]:
# Select features for machine learning
ML_Features = Kaggle_data[['start_of_production', 'body_type_encoded', 'acceleration_0-62mph',
                           'max_mph', 'power_per_litre', 'engine_displacement_incin', 'cylbore_inin',
                           'pistrk_inin', 'kerb_weight_lbs', 'fuel_tank_capacity', 'length_inin', 'width_inin',
                           'height_inin', 'wheelb_in_inches', 'drive_wheel', 'compression_ratio', 'engine_oil_capacity',
                           'FT_inin', 'RT_inin', 'drag_coefficient', 'WTPow_lbsperhp', 'WTPow_hpperton',
                           'weighttotorque_lbsperftlbs', 'weighttotorque_ftlbsperton', 'Power_HP', 'Power_RPM',
                           'ftlbstorque', 'torque_RPM']]

ML_Features = ML_Features.dropna()

# Correlate the numeric columns only: LR_Features still carries the text column
# 'drive_wheel', and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_lr_features = LR_Features.select_dtypes(include='number')

plt.figure(figsize=(16, 12))
sns.heatmap(numeric_lr_features.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Re-check the column data types before modelling
LR_Features.dtypes

In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Columns that must not be predictors: the target itself and every drive-wheel column.
# `errors='ignore'` skips names that are absent (LR_Features holds the raw 'drive_wheel'
# text column rather than its encoded / dummy variants). The raw text column has to go too:
# to_numeric would turn it into all-NaN and the dropna below would then remove every row.
excluded_columns = ['max_mph', 'drive_wheel', 'drive_wheel_encoded',
                    'drive_wheel_All wheel drive (4x4)', 'drive_wheel_Rear wheel drive']
Features = LR_Features.drop(columns=excluded_columns, errors='ignore')
Target = LR_Features['max_mph']

# Force numeric dtypes, then keep complete rows only
Features = Features.apply(pd.to_numeric, errors='coerce')

Features = Features.dropna()
Target = Target.loc[Features.index]  # keep the target aligned with the surviving rows
x_train, x_test, y_train, y_test = train_test_split(Features, Target, train_size=0.7, random_state=50)

# Ordinary least squares with an explicit intercept term
sm_lr = sm.OLS(y_train, sm.add_constant(x_train))
res = sm_lr.fit()

print(res.summary())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


# Columns that must not be predictors: the target itself and every drive-wheel column.
# `errors='ignore'` skips names that are absent (LR_Features holds the raw 'drive_wheel'
# text column rather than its encoded / dummy variants). The raw text column has to go too:
# to_numeric would turn it into all-NaN and the dropna below would then remove every row.
excluded_columns = ['max_mph', 'drive_wheel', 'drive_wheel_encoded',
                    'drive_wheel_All wheel drive (4x4)', 'drive_wheel_Rear wheel drive']
Features = LR_Features.drop(columns=excluded_columns, errors='ignore')
Target = LR_Features['max_mph']

# Force numeric dtypes, then keep complete rows only
Features = Features.apply(pd.to_numeric, errors='coerce')

Features = Features.dropna()
Target = Target.loc[Features.index]  # keep the target aligned with the surviving rows

x_train, x_test, y_train, y_test = train_test_split(Features, Target, train_size=0.7, random_state=50)

# L1-regularised regression; the penalty strength is chosen by 5-fold cross-validation
lasso = LassoCV(cv=5, random_state=50)
lasso.fit(x_train, y_train)

y_pred_lasso = lasso.predict(x_test)

# L2-regularised regression, also with 5-fold cross-validation
ridge = RidgeCV(cv=5)
ridge.fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)


print("Lasso Coefficients:", lasso.coef_)
print("Ridge Coefficients:", ridge.coef_)

# Hold-out performance of each model
print("Lasso R^2 Score:", r2_score(y_test, y_pred_lasso))
print("Lasso Mean Squared Error:", mean_squared_error(y_test, y_pred_lasso))
print("Lasso Mean Absolute Error:", mean_absolute_error(y_test, y_pred_lasso))

print("Ridge R^2 Score:", r2_score(y_test, y_pred_ridge))
print("Ridge Mean Squared Error:", mean_squared_error(y_test, y_pred_ridge))
print("Ridge Mean Absolute Error:", mean_absolute_error(y_test, y_pred_ridge))




In [None]:
coefficients_lasso = pd.Series(lasso.coef_, index=x_train.columns)
coefficients_ridge = pd.Series(ridge.coef_, index=x_train.columns)

# Put both models in one frame and sort ONCE so each feature's Lasso and Ridge bars share
# a row. Sorting the two series separately and drawing them on the same axes pairs the
# Lasso bars with the Ridge tick labels, i.e. with the wrong feature names.
coefficient_table = pd.DataFrame({'Lasso': coefficients_lasso, 'Ridge': coefficients_ridge})
coefficient_table = coefficient_table.sort_values('Ridge')

fig, ax = plt.subplots(figsize=(10, 6))
coefficient_table.plot(kind='barh', alpha=0.7, color=['blue', 'red'], ax=ax)
ax.set_title('Lasso and Ridge Coefficients')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Features')
ax.legend()
plt.show()


# Predicted vs. actual top speed on the hold-out set; the dashed diagonal is a perfect prediction
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred_lasso, alpha=0.75, edgecolors='k', label='Lasso Predictions')
plt.scatter(y_test, y_pred_ridge, alpha=0.75, edgecolors='r', label='Ridge Predictions')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.title('Predicted vs Actual Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.show()