<a href="https://colab.research.google.com/github/Suryaprasadindra/research/blob/main/research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# Read the emissions table from disk (adjust the path for your environment)
data = pd.read_csv('/content/52 CO2_Emissions_Canada (1).csv')

# Basic cleaning: discard rows that are exact duplicates of an earlier row
data = data.drop_duplicates()

# Column groups: numeric columns get scaled, categorical columns get one-hot encoded
numerical_features = [
    'Engine Size(L)',
    'Cylinders',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (L/100 km)',
    'Fuel Consumption Comb (mpg)',
]
categorical_features = [
    'Make',
    'Model',
    'Vehicle Class',
    'Transmission',
    'Fuel Type',
]

# Shared feature transformer; any column not listed above is dropped.
# handle_unknown='ignore' lets categories unseen during fit pass through
# transform without raising.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='drop',
)

# Separate the regression target from the predictor columns
y = data['CO2 Emissions(g/km)']
X = data.drop(columns=['CO2 Emissions(g/km)'])

# Approach A: plain random 80/20 split followed by linear regression.
# The preprocessor is fitted on the training portion only and then applied
# unchanged to the held-out portion.
X_train_A, X_test_A, y_train_A, y_test_A = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train_A_transformed = preprocessor.fit_transform(X_train_A)
X_test_A_transformed = preprocessor.transform(X_test_A)

model_A = LinearRegression()
model_A.fit(X_train_A_transformed, y_train_A)
y_pred_A = model_A.predict(X_test_A_transformed)

# Held-out error metrics for Approach A
mse_A = mean_squared_error(y_test_A, y_pred_A)
rmse_A = np.sqrt(mse_A)
r2_score_A = r2_score(y_test_A, y_pred_A)

print(f"Approach A - MSE: {mse_A}, RMSE: {rmse_A}, R²: {r2_score_A}")

# Approach B: k-means clustering then Linear Regression
# Function to perform clustering, training, and evaluation
def cluster_and_train(X, y, k, train_fraction=0.7, random_state=42):
    """Split the data per k-means cluster, then fit and score a linear regression.

    The rows of ``X`` are clustered with k-means on their preprocessed
    features. From every cluster, ``int(cluster_size * train_fraction)`` rows
    are drawn without replacement for training; all remaining rows form the
    test set. A linear regression is fitted on the training rows and scored
    on the test rows.

    The module-level ``preprocessor`` is never fitted here: unfitted clones
    are used instead, so calling this function does not disturb any state
    fitted elsewhere. The clone used for the regression features is fitted on
    the training rows only, so test rows do not influence scaling statistics
    or the one-hot vocabulary.

    Args:
        X: Feature table. Indexed positionally via ``.iloc``, so it is
            expected to be a pandas DataFrame holding the columns that
            ``preprocessor`` selects.
        y: Target values aligned row-for-row with ``X``; indexed via ``.iloc``.
        k: Number of k-means clusters.
        train_fraction: Share of each cluster assigned to training; must lie
            strictly between 0 and 1.
        random_state: Seed used for both k-means and the per-cluster sampling.

    Returns:
        Tuple ``(mse, rmse, r2)`` computed on the test rows.

    Raises:
        ValueError: If ``train_fraction`` is outside ``(0, 1)``, or if the
            resulting training or test partition is empty.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of this file.
    from sklearn.base import clone

    if not 0.0 < train_fraction < 1.0:
        raise ValueError(
            f"train_fraction must be strictly between 0 and 1, got {train_fraction!r}"
        )

    # Cluster on a private copy of the transformer. Clustering only decides
    # which rows go where, so it may look at every row's features.
    clustering_features = clone(preprocessor).fit_transform(X)
    kmeans = KMeans(n_clusters=k, random_state=random_state).fit(clustering_features)
    clusters = kmeans.labels_

    # A dedicated generator yields the same draws as seeding the global NumPy
    # RNG with the same value, without altering global random state.
    rng = np.random.RandomState(random_state)
    sampled_per_cluster = []
    for cluster_id in range(k):
        cluster_indices = np.flatnonzero(clusters == cluster_id)
        n_train = int(len(cluster_indices) * train_fraction)
        sampled_per_cluster.append(
            rng.choice(cluster_indices, size=n_train, replace=False)
        )
    training_indices = np.concatenate(sampled_per_cluster).astype(np.intp)

    # Every row not drawn for training is held out; setdiff1d returns the
    # positions in sorted order.
    test_indices = np.setdiff1d(np.arange(len(clusters)), training_indices)

    if training_indices.size == 0 or test_indices.size == 0:
        raise ValueError(
            "cluster-stratified split produced an empty partition "
            f"(train={training_indices.size}, test={test_indices.size})"
        )

    y_train_B = y.iloc[training_indices]
    y_test_B = y.iloc[test_indices]

    # Fit the feature transformer on training rows only, then reuse it
    # unchanged on the held-out rows.
    feature_transformer = clone(preprocessor)
    X_train_B = feature_transformer.fit_transform(X.iloc[training_indices])
    X_test_B = feature_transformer.transform(X.iloc[test_indices])

    # Train and evaluate the regression model
    model_B = LinearRegression()
    model_B.fit(X_train_B, y_train_B)
    y_pred_B = model_B.predict(X_test_B)

    mse_B = mean_squared_error(y_test_B, y_pred_B)
    rmse_B = np.sqrt(mse_B)
    r2_score_B = r2_score(y_test_B, y_pred_B)

    return mse_B, rmse_B, r2_score_B

# Single illustrative run of Approach B with five clusters
mse_B, rmse_B, r2_score_B = cluster_and_train(X, y, 5)
print(f"Approach B with k=5 - MSE: {mse_B}, RMSE: {rmse_B}, R²: {r2_score_B}")

# Sweep the cluster count from 2 through 30 and report the held-out metrics
# for each setting so the runs can be compared side by side
k_values = list(range(2, 31))
for k in k_values:
    mse_B, rmse_B, r2_score_B = cluster_and_train(X, y, k)
    print(f"Approach B with k={k} - MSE: {mse_B}, RMSE: {rmse_B}, R²: {r2_score_B}")


Approach A - MSE: 31.754488232790134, RMSE: 5.6351120869766325, R²: 0.9912006271036701




Approach B with k=5 - MSE: 31.604736321259793, RMSE: 5.621808990108059, R²: 0.9912207966405573




Approach B with k=2 - MSE: 37.410911437335585, RMSE: 6.116445980905544, R²: 0.9891382728992398




Approach B with k=3 - MSE: 31.185156407454013, RMSE: 5.584367144758125, R²: 0.9909891770790025




Approach B with k=4 - MSE: 30.475277372716622, RMSE: 5.520441773329071, R²: 0.9913111299496573




Approach B with k=5 - MSE: 31.604736321259793, RMSE: 5.621808990108059, R²: 0.9912207966405573




Approach B with k=6 - MSE: 33.35686112442519, RMSE: 5.775539898955351, R²: 0.9909369604963032




Approach B with k=7 - MSE: 32.90400725755271, RMSE: 5.7362014659138945, R²: 0.9905346941659998




Approach B with k=8 - MSE: 30.096165449297864, RMSE: 5.48599721557511, R²: 0.9914471879953244




Approach B with k=9 - MSE: 31.193862459422213, RMSE: 5.585146592473846, R²: 0.9913611416199818




Approach B with k=10 - MSE: 37.29333812763576, RMSE: 6.106827173552217, R²: 0.9893275804068892




Approach B with k=11 - MSE: 32.12863718632476, RMSE: 5.668212874118681, R²: 0.9908102148582631




Approach B with k=12 - MSE: 29.938940385302665, RMSE: 5.4716487812452534, R²: 0.9916568667816666




Approach B with k=13 - MSE: 28.770952244978016, RMSE: 5.363856098459206, R²: 0.9916771019813561




Approach B with k=14 - MSE: 28.6943815678096, RMSE: 5.356713691043194, R²: 0.991616494201934




Approach B with k=15 - MSE: 28.72844666555683, RMSE: 5.359892411752015, R²: 0.991997374198427




Approach B with k=16 - MSE: 34.290561245905856, RMSE: 5.855814311084826, R²: 0.9903791030126484




Approach B with k=17 - MSE: 29.43461119322476, RMSE: 5.425367378641262, R²: 0.9916297272320092




Approach B with k=18 - MSE: 29.056519304130262, RMSE: 5.390409938411945, R²: 0.9916071549564425




Approach B with k=19 - MSE: 30.826771541373894, RMSE: 5.5521861947681375, R²: 0.991528158193866




Approach B with k=20 - MSE: 36.49218704229128, RMSE: 6.0408763472108316, R²: 0.9896637634565009




Approach B with k=21 - MSE: 33.7516001620269, RMSE: 5.809612737698348, R²: 0.9904871564975297




Approach B with k=22 - MSE: 34.61001763467291, RMSE: 5.883027930808497, R²: 0.9904445584967598




Approach B with k=23 - MSE: 29.772053080640596, RMSE: 5.4563772854010555, R²: 0.9916972875587894




Approach B with k=24 - MSE: 32.92731038622916, RMSE: 5.738232339861219, R²: 0.9907876014204129




Approach B with k=25 - MSE: 30.93264953511107, RMSE: 5.561712823862004, R²: 0.9909671038155501




Approach B with k=26 - MSE: 34.112939101757746, RMSE: 5.840628313953709, R²: 0.9903962857491072




Approach B with k=27 - MSE: 34.22037709399834, RMSE: 5.849818552228636, R²: 0.9902707190379556




Approach B with k=28 - MSE: 34.35171136646485, RMSE: 5.861033301941292, R²: 0.9904243562076069




Approach B with k=29 - MSE: 35.194496688620006, RMSE: 5.932494980075416, R²: 0.9899890973052615




Approach B with k=30 - MSE: 34.3704999270403, RMSE: 5.862635919707133, R²: 0.9908098237722889
