<a href="https://colab.research.google.com/github/Suryaprasadindra/research/blob/main/research_with_k_folds_with_A_approach_%26_with_new_k_fold_strategy_with_B_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# Load your dataset
data = pd.read_csv('/content/52 CO2_Emissions_Canada (1).csv')  # Make sure to change this to your actual file path

# Preliminary preprocessing steps
data.drop_duplicates(inplace=True)

# Defining categorical and numerical features
categorical_features = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']
numerical_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
                      'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
                      'Fuel Consumption Comb (mpg)']

# Preprocessor for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='drop')

# Split data into features and target
X = data.drop('CO2 Emissions(g/km)', axis=1)
y = data['CO2 Emissions(g/km)']

# Approach A: Linear Regression on preprocessed data
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_A_transformed = preprocessor.fit_transform(X_train_A)
X_test_A_transformed = preprocessor.transform(X_test_A)

model_A = LinearRegression()
model_A.fit(X_train_A_transformed, y_train_A)
y_pred_A = model_A.predict(X_test_A_transformed)

mse_A = mean_squared_error(y_test_A, y_pred_A)
rmse_A = np.sqrt(mse_A)
r2_score_A = r2_score(y_test_A, y_pred_A)

print(f"Approach A - MSE: {mse_A}, RMSE: {rmse_A}, R²: {r2_score_A}")

# Approach B: k-means clustering then Linear Regression
# Function to perform clustering, training, and evaluation
def cluster_and_train(X, y, k):
    X_transformed = preprocessor.fit_transform(X)
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_transformed)
    clusters = kmeans.labels_

    # Selecting samples for training from each cluster
    training_indices = []
    np.random.seed(42)
    for i in range(k):
        cluster_indices = np.where(clusters == i)[0]
        training_sample = np.random.choice(cluster_indices, size=int(len(cluster_indices) * 0.7), replace=False)
        training_indices.extend(training_sample)

    # Creating training and test datasets
    X_train_B = X_transformed[training_indices]
    y_train_B = y.iloc[training_indices]
    test_indices = list(set(range(X_transformed.shape[0])) - set(training_indices))
    X_test_B = X_transformed[test_indices]
    y_test_B = y.iloc[test_indices]

    # Training and evaluating the model
    model_B = LinearRegression()
    model_B.fit(X_train_B, y_train_B)
    y_pred_B = model_B.predict(X_test_B)

    mse_B = mean_squared_error(y_test_B, y_pred_B)
    rmse_B = np.sqrt(mse_B)
    r2_score_B = r2_score(y_test_B, y_pred_B)

    return mse_B, rmse_B, r2_score_B

# Example usage of Approach B with k=5
mse_B, rmse_B, r2_score_B = cluster_and_train(X, y, 5)
print(f"Approach B with k=5 - MSE: {mse_B}, RMSE: {rmse_B}, R²: {r2_score_B}")

# You can loop through different k values to find the best performing model
k_values = [2, 3, 4, 5, 6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
for k in k_values:
    mse_B, rmse_B, r2_score_B = cluster_and_train(X, y, k)
    print(f"Approach B with k={k} - MSE: {mse_B}, RMSE: {rmse_B}, R²: {r2_score_B}")


Approach A - MSE: 31.754488232790134, RMSE: 5.6351120869766325, R²: 0.9912006271036701




Approach B with k=5 - MSE: 31.604736321259793, RMSE: 5.621808990108059, R²: 0.9912207966405573




Approach B with k=2 - MSE: 37.410911437335585, RMSE: 6.116445980905544, R²: 0.9891382728992398




Approach B with k=3 - MSE: 31.185156407454013, RMSE: 5.584367144758125, R²: 0.9909891770790025




Approach B with k=4 - MSE: 30.475277372716622, RMSE: 5.520441773329071, R²: 0.9913111299496573




Approach B with k=5 - MSE: 31.604736321259793, RMSE: 5.621808990108059, R²: 0.9912207966405573




Approach B with k=6 - MSE: 33.35686112442519, RMSE: 5.775539898955351, R²: 0.9909369604963032




Approach B with k=7 - MSE: 32.90400725755271, RMSE: 5.7362014659138945, R²: 0.9905346941659998




Approach B with k=8 - MSE: 30.096165449297864, RMSE: 5.48599721557511, R²: 0.9914471879953244




Approach B with k=9 - MSE: 31.193862459422213, RMSE: 5.585146592473846, R²: 0.9913611416199818




Approach B with k=10 - MSE: 37.29333812763576, RMSE: 6.106827173552217, R²: 0.9893275804068892




Approach B with k=11 - MSE: 32.12863718632476, RMSE: 5.668212874118681, R²: 0.9908102148582631




Approach B with k=12 - MSE: 29.938940385302665, RMSE: 5.4716487812452534, R²: 0.9916568667816666




Approach B with k=13 - MSE: 28.770952244978016, RMSE: 5.363856098459206, R²: 0.9916771019813561




Approach B with k=14 - MSE: 28.6943815678096, RMSE: 5.356713691043194, R²: 0.991616494201934




Approach B with k=15 - MSE: 28.72844666555683, RMSE: 5.359892411752015, R²: 0.991997374198427




Approach B with k=16 - MSE: 34.290561245905856, RMSE: 5.855814311084826, R²: 0.9903791030126484




Approach B with k=17 - MSE: 29.43461119322476, RMSE: 5.425367378641262, R²: 0.9916297272320092




Approach B with k=18 - MSE: 29.056519304130262, RMSE: 5.390409938411945, R²: 0.9916071549564425




Approach B with k=19 - MSE: 30.826771541373894, RMSE: 5.5521861947681375, R²: 0.991528158193866




Approach B with k=20 - MSE: 36.49218704229128, RMSE: 6.0408763472108316, R²: 0.9896637634565009




Approach B with k=21 - MSE: 33.7516001620269, RMSE: 5.809612737698348, R²: 0.9904871564975297




Approach B with k=22 - MSE: 34.61001763467291, RMSE: 5.883027930808497, R²: 0.9904445584967598




Approach B with k=23 - MSE: 29.772053080640596, RMSE: 5.4563772854010555, R²: 0.9916972875587894




Approach B with k=24 - MSE: 32.92731038622916, RMSE: 5.738232339861219, R²: 0.9907876014204129




Approach B with k=25 - MSE: 30.93264953511107, RMSE: 5.561712823862004, R²: 0.9909671038155501




Approach B with k=26 - MSE: 34.112939101757746, RMSE: 5.840628313953709, R²: 0.9903962857491072




Approach B with k=27 - MSE: 34.22037709399834, RMSE: 5.849818552228636, R²: 0.9902707190379556




Approach B with k=28 - MSE: 34.35171136646485, RMSE: 5.861033301941292, R²: 0.9904243562076069




Approach B with k=29 - MSE: 35.194496688620006, RMSE: 5.932494980075416, R²: 0.9899890973052615




Approach B with k=30 - MSE: 34.3704999270403, RMSE: 5.862635919707133, R²: 0.9908098237722889


# **applying k-folds**

In [13]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline

# Define the model pipeline
pipeline_A = make_pipeline(preprocessor, LinearRegression())

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
scores_A = cross_val_score(pipeline_A, X, y, cv=kf, scoring='neg_mean_squared_error')

# Calculate average MSE and RMSE across all folds
average_mse_A = -scores_A.mean()
average_rmse_A = np.sqrt(average_mse_A)

print(f"Approach A - Average MSE: {average_mse_A}, Average RMSE: {average_rmse_A}")


Approach A - Average MSE: 24.396251025108608, Average RMSE: 4.93925612062268


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans

# Load your dataset
data = pd.read_csv('/content/52 CO2_Emissions_Canada (1).csv')  # Make sure to adjust this to your actual file path

# Split data into features (X) and target (y)
X = data.drop('CO2 Emissions(g/km)', axis=1)
y = data['CO2 Emissions(g/km)']

# Defining categorical and numerical features based on your dataset
categorical_features = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']
numerical_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
                      'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
                      'Fuel Consumption Comb (mpg)']

# Setup the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='drop')

# Approach A: Linear Regression with preprocessed data and k-fold cross-validation
pipeline_A = make_pipeline(preprocessor, LinearRegression())

# Perform k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores_A = cross_val_score(pipeline_A, X, y, cv=kf, scoring='neg_mean_squared_error')

# Calculate average MSE and RMSE across all folds for Approach A
average_mse_A = -scores_A.mean()
average_rmse_A = np.sqrt(average_mse_A)
print(f"Approach A - Average MSE: {average_mse_A}, Average RMSE: {average_rmse_A}")



Approach A - Average MSE: 24.396251025108608, Average RMSE: 4.93925612062268


In [19]:
def approach_B_with_kfold(X, y, k_clusters, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    mse_scores = []
    fold_number = 1  # To track the fold number

    for train_index, test_index in kf.split(X):
        print(f"Processing fold {fold_number}/{n_splits}")
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Preprocessing
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        # KMeans Clustering
        kmeans = KMeans(n_clusters=k_clusters, random_state=42)
        clusters = kmeans.fit_predict(X_train_transformed)

        sampled_indices = get_sampled_indices(clusters, proportion=0.7, k_clusters=k_clusters)

        X_train_sampled = X_train_transformed[sampled_indices]
        y_train_sampled = y_train.iloc[sampled_indices].values

        # Linear Regression
        model_B = LinearRegression()
        model_B.fit(X_train_sampled, y_train_sampled)
        y_pred_B = model_B.predict(X_test_transformed)

        mse = mean_squared_error(y_test, y_pred_B)
        mse_scores.append(mse)
        print(f"Fold {fold_number} MSE: {mse}")

        fold_number += 1

    average_mse_B = np.mean(mse_scores)
    average_rmse_B = np.sqrt(average_mse_B)

    print(f"Final Average MSE: {average_mse_B}, Average RMSE: {average_rmse_B}")
    return average_mse_B, average_rmse_B

def get_sampled_indices(clusters, proportion, k_clusters):
    sampled_indices = []
    for cluster in range(k_clusters):
        cluster_indices = np.where(clusters == cluster)[0]
        sampled_cluster_indices = np.random.choice(cluster_indices, size=int(len(cluster_indices) * proportion), replace=False)
        sampled_indices.extend(sampled_cluster_indices)
    return np.array(sampled_indices)


In [27]:
average_mse_B, average_rmse_B = approach_B_with_kfold(X, y, k_clusters=6, n_splits=5)


Processing fold 1/5




Fold 1 MSE: 33.00189277952316
Processing fold 2/5




Fold 2 MSE: 22.308176763214643
Processing fold 3/5




Fold 3 MSE: 30.9338298864135
Processing fold 4/5




Fold 4 MSE: 27.53429054954948
Processing fold 5/5




Fold 5 MSE: 36.91139092490228
Final Average MSE: 30.137916180720612, Average RMSE: 5.489801105752431


In [29]:
# Define a range of k values to test
k_values = range(2, 30)  # For example, testing k values from 2 to 15

# Initialize a dictionary to store results for each k
results_for_k = {}

# Loop over each k value
for k in k_values:
    print(f"Processing k={k}")
    average_mse_B, average_rmse_B = approach_B_with_kfold(X, y, k_clusters=k, n_splits=5)
    results_for_k[k] = {'Average MSE': average_mse_B, 'Average RMSE': average_rmse_B}
    print(f"Results for k={k}: MSE - {average_mse_B}, RMSE - {average_rmse_B}\n")

# After the loop, results_for_k will hold the MSE and RMSE for each k value
print("Final Results Across Different k Values:")
for k, metrics in results_for_k.items():
    print(f"k={k}: {metrics}")


Processing k=2
Processing fold 1/5




Fold 1 MSE: 30.980053065875516
Processing fold 2/5




Fold 2 MSE: 28.427586683918445
Processing fold 3/5




Fold 3 MSE: 26.067076065620856
Processing fold 4/5




Fold 4 MSE: 25.408511708060793
Processing fold 5/5




Fold 5 MSE: 27.74508441306826
Final Average MSE: 27.725662387308773, Average RMSE: 5.265516345745095
Results for k=2: MSE - 27.725662387308773, RMSE - 5.265516345745095

Processing k=3
Processing fold 1/5




Fold 1 MSE: 32.03676396028341
Processing fold 2/5




Fold 2 MSE: 24.841628182514363
Processing fold 3/5




Fold 3 MSE: 26.701422287195502
Processing fold 4/5




Fold 4 MSE: 26.0981884280837
Processing fold 5/5




Fold 5 MSE: 26.34709030570464
Final Average MSE: 27.20501863275632, Average RMSE: 5.215843041422577
Results for k=3: MSE - 27.20501863275632, RMSE - 5.215843041422577

Processing k=4
Processing fold 1/5




Fold 1 MSE: 32.53271271820495
Processing fold 2/5




Fold 2 MSE: 24.918866032980198
Processing fold 3/5




Fold 3 MSE: 31.505083854996343
Processing fold 4/5




Fold 4 MSE: 25.12295682761137
Processing fold 5/5




Fold 5 MSE: 25.016433651280096
Final Average MSE: 27.81921061701459, Average RMSE: 5.274391966569662
Results for k=4: MSE - 27.81921061701459, RMSE - 5.274391966569662

Processing k=5
Processing fold 1/5




Fold 1 MSE: 34.23567080292542
Processing fold 2/5




Fold 2 MSE: 23.907864719933926
Processing fold 3/5




Fold 3 MSE: 32.42069873548386
Processing fold 4/5




Fold 4 MSE: 31.01775636186664
Processing fold 5/5




Fold 5 MSE: 24.08102193878971
Final Average MSE: 29.13260251179991, Average RMSE: 5.397462599388708
Results for k=5: MSE - 29.13260251179991, RMSE - 5.397462599388708

Processing k=6
Processing fold 1/5




Fold 1 MSE: 32.13647164725514
Processing fold 2/5




Fold 2 MSE: 23.879214495508897
Processing fold 3/5




Fold 3 MSE: 29.126306984080763
Processing fold 4/5




Fold 4 MSE: 25.16250754613327
Processing fold 5/5




Fold 5 MSE: 28.995678397170398
Final Average MSE: 27.860035814029693, Average RMSE: 5.278260680757412
Results for k=6: MSE - 27.860035814029693, RMSE - 5.278260680757412

Processing k=7
Processing fold 1/5




Fold 1 MSE: 33.820864379029366
Processing fold 2/5




Fold 2 MSE: 23.4888895710528
Processing fold 3/5




Fold 3 MSE: 29.496915957158098
Processing fold 4/5




Fold 4 MSE: 29.17995380739007
Processing fold 5/5




Fold 5 MSE: 24.99000677038705
Final Average MSE: 28.195326097003477, Average RMSE: 5.309927127278065
Results for k=7: MSE - 28.195326097003477, RMSE - 5.309927127278065

Processing k=8
Processing fold 1/5




Fold 1 MSE: 33.14180613988263
Processing fold 2/5




Fold 2 MSE: 24.800898946883194
Processing fold 3/5




Fold 3 MSE: 30.415871400109047
Processing fold 4/5




Fold 4 MSE: 26.675531298524103
Processing fold 5/5




Fold 5 MSE: 26.678346421241265
Final Average MSE: 28.34249084132805, Average RMSE: 5.3237666028224835
Results for k=8: MSE - 28.34249084132805, RMSE - 5.3237666028224835

Processing k=9
Processing fold 1/5




Fold 1 MSE: 33.41292730665722
Processing fold 2/5




Fold 2 MSE: 24.794744138777517
Processing fold 3/5




Fold 3 MSE: 27.721331751431084
Processing fold 4/5




Fold 4 MSE: 26.945390371879707
Processing fold 5/5




Fold 5 MSE: 31.186369156953102
Final Average MSE: 28.812152545139725, Average RMSE: 5.367695273126048
Results for k=9: MSE - 28.812152545139725, RMSE - 5.367695273126048

Processing k=10
Processing fold 1/5




Fold 1 MSE: 33.20985170150059
Processing fold 2/5




Fold 2 MSE: 22.827600119883446
Processing fold 3/5




Fold 3 MSE: 33.47819848964701
Processing fold 4/5




Fold 4 MSE: 27.891953107276553
Processing fold 5/5




Fold 5 MSE: 26.885471591324052
Final Average MSE: 28.85861500192633, Average RMSE: 5.372021500508568
Results for k=10: MSE - 28.85861500192633, RMSE - 5.372021500508568

Processing k=11
Processing fold 1/5




Fold 1 MSE: 39.634516194837296
Processing fold 2/5




Fold 2 MSE: 26.173349526213237
Processing fold 3/5




Fold 3 MSE: 30.907071470477028
Processing fold 4/5




Fold 4 MSE: 28.971704897403953
Processing fold 5/5




Fold 5 MSE: 28.64823129835625
Final Average MSE: 30.866974677457552, Average RMSE: 5.555805493126766
Results for k=11: MSE - 30.866974677457552, RMSE - 5.555805493126766

Processing k=12
Processing fold 1/5




Fold 1 MSE: 31.771789839138904
Processing fold 2/5




Fold 2 MSE: 23.29509362684599
Processing fold 3/5




Fold 3 MSE: 28.17769153307797
Processing fold 4/5




Fold 4 MSE: 27.371387067301853
Processing fold 5/5




Fold 5 MSE: 27.151769589962132
Final Average MSE: 27.55354633126537, Average RMSE: 5.249147200380779
Results for k=12: MSE - 27.55354633126537, RMSE - 5.249147200380779

Processing k=13
Processing fold 1/5




Fold 1 MSE: 32.7949053909643
Processing fold 2/5




Fold 2 MSE: 27.5264654567935
Processing fold 3/5




Fold 3 MSE: 28.833699581663232
Processing fold 4/5




Fold 4 MSE: 31.955550605440592
Processing fold 5/5




Fold 5 MSE: 24.451770302919332
Final Average MSE: 29.112478267556195, Average RMSE: 5.395598045402956
Results for k=13: MSE - 29.112478267556195, RMSE - 5.395598045402956

Processing k=14
Processing fold 1/5




Fold 1 MSE: 33.002924261727486
Processing fold 2/5




Fold 2 MSE: 23.594771438395092
Processing fold 3/5




Fold 3 MSE: 27.90170281705718
Processing fold 4/5




Fold 4 MSE: 27.92308150313482
Processing fold 5/5




Fold 5 MSE: 26.485874493668863
Final Average MSE: 27.78167090279669, Average RMSE: 5.270832088275692
Results for k=14: MSE - 27.78167090279669, RMSE - 5.270832088275692

Processing k=15
Processing fold 1/5




Fold 1 MSE: 34.2462225833624
Processing fold 2/5




Fold 2 MSE: 21.204818858774107
Processing fold 3/5




Fold 3 MSE: 27.828395283790087
Processing fold 4/5




Fold 4 MSE: 30.641069064253397
Processing fold 5/5




Fold 5 MSE: 25.82948540958419
Final Average MSE: 27.94999823995284, Average RMSE: 5.286775788697005
Results for k=15: MSE - 27.94999823995284, RMSE - 5.286775788697005

Processing k=16
Processing fold 1/5




Fold 1 MSE: 36.083083356003115
Processing fold 2/5




Fold 2 MSE: 21.886002355706808
Processing fold 3/5




Fold 3 MSE: 29.174497831865512
Processing fold 4/5




Fold 4 MSE: 27.022149298867905
Processing fold 5/5




Fold 5 MSE: 25.908464535087955
Final Average MSE: 28.014839475506257, Average RMSE: 5.292904635028507
Results for k=16: MSE - 28.014839475506257, RMSE - 5.292904635028507

Processing k=17
Processing fold 1/5




Fold 1 MSE: 32.482443201473856
Processing fold 2/5




Fold 2 MSE: 22.376489565485603
Processing fold 3/5




Fold 3 MSE: 30.87717863418434
Processing fold 4/5




Fold 4 MSE: 29.80688699051738
Processing fold 5/5




Fold 5 MSE: 25.202454235523643
Final Average MSE: 28.149090525436964, Average RMSE: 5.305571649260517
Results for k=17: MSE - 28.149090525436964, RMSE - 5.305571649260517

Processing k=18
Processing fold 1/5




Fold 1 MSE: 32.70418692980977
Processing fold 2/5




Fold 2 MSE: 26.691746373293995
Processing fold 3/5




Fold 3 MSE: 29.432761923137576
Processing fold 4/5




Fold 4 MSE: 29.655818019891175
Processing fold 5/5




Fold 5 MSE: 27.38140771185358
Final Average MSE: 29.173184191597215, Average RMSE: 5.40122062052618
Results for k=18: MSE - 29.173184191597215, RMSE - 5.40122062052618

Processing k=19
Processing fold 1/5




Fold 1 MSE: 35.661263078898855
Processing fold 2/5




Fold 2 MSE: 25.300030708727224
Processing fold 3/5




Fold 3 MSE: 29.97549446535432
Processing fold 4/5




Fold 4 MSE: 31.022846837521715
Processing fold 5/5




Fold 5 MSE: 27.571589918031574
Final Average MSE: 29.906245001706736, Average RMSE: 5.468660256562546
Results for k=19: MSE - 29.906245001706736, RMSE - 5.468660256562546

Processing k=20
Processing fold 1/5




Fold 1 MSE: 33.614450179689946
Processing fold 2/5




Fold 2 MSE: 22.46697323328993
Processing fold 3/5




Fold 3 MSE: 27.573723810014855
Processing fold 4/5




Fold 4 MSE: 25.64194295822562
Processing fold 5/5




Fold 5 MSE: 32.41636564782204
Final Average MSE: 28.342691165808475, Average RMSE: 5.323785416957419
Results for k=20: MSE - 28.342691165808475, RMSE - 5.323785416957419

Processing k=21
Processing fold 1/5




Fold 1 MSE: 33.89886271799884
Processing fold 2/5




Fold 2 MSE: 23.16687986904476
Processing fold 3/5




Fold 3 MSE: 31.101826444414577
Processing fold 4/5




Fold 4 MSE: 30.292229164201053
Processing fold 5/5




Fold 5 MSE: 29.0830415889197
Final Average MSE: 29.508567956915783, Average RMSE: 5.4321789327042405
Results for k=21: MSE - 29.508567956915783, RMSE - 5.4321789327042405

Processing k=22
Processing fold 1/5




Fold 1 MSE: 35.48042308369855
Processing fold 2/5




Fold 2 MSE: 24.18753147544955
Processing fold 3/5




Fold 3 MSE: 26.32640865973438
Processing fold 4/5




Fold 4 MSE: 28.46900269469322
Processing fold 5/5




Fold 5 MSE: 24.90054679790981
Final Average MSE: 27.872782542297102, Average RMSE: 5.27946801697833
Results for k=22: MSE - 27.872782542297102, RMSE - 5.27946801697833

Processing k=23
Processing fold 1/5




Fold 1 MSE: 32.374053242268204
Processing fold 2/5




Fold 2 MSE: 24.372215823452994
Processing fold 3/5




Fold 3 MSE: 28.998029779782538
Processing fold 4/5




Fold 4 MSE: 24.641988942872544
Processing fold 5/5




Fold 5 MSE: 24.363985021899396
Final Average MSE: 26.950054562055136, Average RMSE: 5.1913441960685995
Results for k=23: MSE - 26.950054562055136, RMSE - 5.1913441960685995

Processing k=24
Processing fold 1/5




Fold 1 MSE: 36.50087085698358
Processing fold 2/5




Fold 2 MSE: 26.502932536215358
Processing fold 3/5




Fold 3 MSE: 31.71787258292534
Processing fold 4/5




Fold 4 MSE: 25.337743897269135
Processing fold 5/5




Fold 5 MSE: 28.628216407326992
Final Average MSE: 29.73752725614408, Average RMSE: 5.453212562897588
Results for k=24: MSE - 29.73752725614408, RMSE - 5.453212562897588

Processing k=25
Processing fold 1/5




Fold 1 MSE: 33.41914042204752
Processing fold 2/5




Fold 2 MSE: 23.164097758812
Processing fold 3/5




Fold 3 MSE: 31.33501108938654
Processing fold 4/5




Fold 4 MSE: 25.01330381137651
Processing fold 5/5




Fold 5 MSE: 26.697455236965453
Final Average MSE: 27.925801663717607, Average RMSE: 5.284486887458195
Results for k=25: MSE - 27.925801663717607, RMSE - 5.284486887458195

Processing k=26
Processing fold 1/5




Fold 1 MSE: 33.45613982792179
Processing fold 2/5




Fold 2 MSE: 22.687102885771637
Processing fold 3/5




Fold 3 MSE: 28.243627311694908
Processing fold 4/5




Fold 4 MSE: 24.874201534097384
Processing fold 5/5




Fold 5 MSE: 25.067508810113743
Final Average MSE: 26.86571607391989, Average RMSE: 5.183214839645362
Results for k=26: MSE - 26.86571607391989, RMSE - 5.183214839645362

Processing k=27
Processing fold 1/5




Fold 1 MSE: 34.299720793004305
Processing fold 2/5




Fold 2 MSE: 28.42500654141802
Processing fold 3/5




Fold 3 MSE: 31.265703089010017
Processing fold 4/5




Fold 4 MSE: 25.81201168155022
Processing fold 5/5




Fold 5 MSE: 32.11631412824133
Final Average MSE: 30.383751246644778, Average RMSE: 5.5121457933045255
Results for k=27: MSE - 30.383751246644778, RMSE - 5.5121457933045255

Processing k=28
Processing fold 1/5




Fold 1 MSE: 34.466058419655454
Processing fold 2/5




Fold 2 MSE: 21.865025294285548
Processing fold 3/5




Fold 3 MSE: 28.959209462280334
Processing fold 4/5




Fold 4 MSE: 25.053266018904502
Processing fold 5/5




Fold 5 MSE: 28.378962722431076
Final Average MSE: 27.744504383511384, Average RMSE: 5.267305229765158
Results for k=28: MSE - 27.744504383511384, RMSE - 5.267305229765158

Processing k=29
Processing fold 1/5




Fold 1 MSE: 32.59655879328586
Processing fold 2/5




Fold 2 MSE: 26.15688391073512
Processing fold 3/5




Fold 3 MSE: 33.7125994920955
Processing fold 4/5




Fold 4 MSE: 30.92284431839394
Processing fold 5/5




Fold 5 MSE: 26.141400846834856
Final Average MSE: 29.90605747226906, Average RMSE: 5.468643110705713
Results for k=29: MSE - 29.90605747226906, RMSE - 5.468643110705713

Final Results Across Different k Values:
k=2: {'Average MSE': 27.725662387308773, 'Average RMSE': 5.265516345745095}
k=3: {'Average MSE': 27.20501863275632, 'Average RMSE': 5.215843041422577}
k=4: {'Average MSE': 27.81921061701459, 'Average RMSE': 5.274391966569662}
k=5: {'Average MSE': 29.13260251179991, 'Average RMSE': 5.397462599388708}
k=6: {'Average MSE': 27.860035814029693, 'Average RMSE': 5.278260680757412}
k=7: {'Average MSE': 28.195326097003477, 'Average RMSE': 5.309927127278065}
k=8: {'Average MSE': 28.34249084132805, 'Average RMSE': 5.3237666028224835}
k=9: {'Average MSE': 28.812152545139725, 'Average RMSE': 5.367695273126048}
k=10: {'Average MSE': 28.85861500192633, 'Average RMSE': 5.372021500508568}
k=11: {'Average MSE': 30.866974677457552, 'Average RMSE': 5.555805493126766}
k=12: {'Average MSE': 27.553546

# select different observations from each k-fold than previous k fold
# **approach_B_with_varied_sampling**

---



In [40]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
from scipy import sparse as sp

# Load the dataset (ensure the correct path)
data = pd.read_csv('/content/52 CO2_Emissions_Canada (1).csv')

# Define features and target variable
X = data.drop(columns=['CO2 Emissions(g/km)'])
y = data['CO2 Emissions(g/km)']

# Define the preprocessor
categorical_features = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']  # Update as per your dataset
numerical_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
                      'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
                      'Fuel Consumption Comb (mpg)']  # Update as per your dataset

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop')

# Approach B Function with varied sampling and sparse matrix handling
def approach_B_with_varied_sampling(X, y, k_clusters, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    mse_scores = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)

        # Ensure handling of sparse matrices
        if sp.issparse(X_train_transformed):
            X_train_transformed = X_train_transformed.toarray()
        if sp.issparse(X_test_transformed):
            X_test_transformed = X_test_transformed.toarray()

        kmeans = KMeans(n_clusters=k_clusters, random_state=fold)
        clusters = kmeans.fit_predict(X_train_transformed)

        sampled_indices = get_varied_sampled_indices(clusters, k_clusters, fold)

        X_train_sampled = X_train_transformed[sampled_indices]
        y_train_sampled = y_train.iloc[sampled_indices].values

        model_B = LinearRegression()
        model_B.fit(X_train_sampled, y_train_sampled)
        y_pred_B = model_B.predict(X_test_transformed)

        mse = mean_squared_error(y_test, y_pred_B)
        mse_scores.append(mse)

    average_mse_B = np.mean(mse_scores)
    average_rmse_B = np.sqrt(average_mse_B)

    return average_mse_B, average_rmse_B

def get_varied_sampled_indices(clusters, k_clusters, fold):
    np.random.seed(42 + fold)  # Change seed for each fold
    sampled_indices = []
    for cluster in range(k_clusters):
        cluster_indices = np.where(clusters == cluster)[0]
        np.random.shuffle(cluster_indices)  # Shuffle indices for variety
        sampled_cluster_indices = cluster_indices[:int(len(cluster_indices) * 0.7)]
        sampled_indices.extend(sampled_cluster_indices)
    return np.array(sampled_indices)

# Example call to the function with k=5 clusters for demonstration (adjust as needed)
average_mse_B, average_rmse_B = approach_B_with_varied_sampling(X, y, k_clusters=2, n_splits=5)
print(f"Average MSE for Approach B with varied sampling: {average_mse_B}")
print(f"Average RMSE for Approach B with varied sampling: {average_rmse_B}")




Average MSE for Approach B with varied sampling: 3.2227236893918746e+22
Average RMSE for Approach B with varied sampling: 179519461045.08765


check with different k values

In [39]:
# Assuming all necessary imports and the approach_B_with_varied_sampling function are already defined

# Load your dataset
data = pd.read_csv('/content/52 CO2_Emissions_Canada (1).csv')

# Define features and target variable
X = data.drop(columns=['CO2 Emissions(g/km)'])
y = data['CO2 Emissions(g/km)']

# Define the preprocessor with the appropriate columns based on your dataset
categorical_features = ['Make', 'Model', 'Vehicle Class', 'Transmission', 'Fuel Type']  # Update as needed
numerical_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100 km)',
                      'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
                      'Fuel Consumption Comb (mpg)']  # Update as needed

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop')

# Loop to check with different k cluster values
k_values = [2, 3, 4, 5]  # Adjust the range of k as needed
results = {}

for k in k_values:
    print(f"Evaluating Approach B with k={k} clusters...")
    average_mse_B, average_rmse_B = approach_B_with_varied_sampling(X, y, k_clusters=k, n_splits=5)
    results[k] = {'Average MSE': average_mse_B, 'Average RMSE': average_rmse_B}
    print(f"Results for k={k}: Average MSE - {average_mse_B}, Average RMSE - {average_rmse_B}")

# Displaying final results
print("\nFinal Results Across Different k Values:")
for k, metrics in results.items():
    print(f"k={k}: {metrics}")


Evaluating Approach B with k=2 clusters...




Results for k=2: Average MSE - 3.2227236893918746e+22, Average RMSE - 179519461045.08765
Evaluating Approach B with k=3 clusters...




Results for k=3: Average MSE - 7.692056082265815e+22, Average RMSE - 277345562110.98486
Evaluating Approach B with k=4 clusters...




Results for k=4: Average MSE - 9.211304337927644e+23, Average RMSE - 959755403106.8356
Evaluating Approach B with k=5 clusters...




Results for k=5: Average MSE - 5.186921551916985e+21, Average RMSE - 72020285697.27411

Final Results Across Different k Values:
k=2: {'Average MSE': 3.2227236893918746e+22, 'Average RMSE': 179519461045.08765}
k=3: {'Average MSE': 7.692056082265815e+22, 'Average RMSE': 277345562110.98486}
k=4: {'Average MSE': 9.211304337927644e+23, 'Average RMSE': 959755403106.8356}
k=5: {'Average MSE': 5.186921551916985e+21, 'Average RMSE': 72020285697.27411}
