In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import pickle

In [None]:
# Read the emissions dataset from the notebook's working directory.
data = pd.read_csv('Carbon_Footprint_Emission.csv')

# Trim leading/trailing whitespace from every header name, then show the
# resulting column index.
data.columns = list(map(str.strip, data.columns))
print(data.columns)

Index(['Energy_Source', 'Transport', 'Frequency of Transport',
       'Waste Production', 'How Long Machine Works Daily',
       'Machine repairing(in Months)', 'Energy efficiency', 'Recycling Waste',
       'CarbonEmission'],
      dtype='object')


In [18]:
# Display the column names as a plain Python list.
list(data.columns)

['Energy_Source',
 'Transport',
 'Frequency of Transport',
 'Waste Production',
 'How Long Machine Works Daily',
 'Machine repairing(in Months)',
 'Energy efficiency',
 'Recycling Waste',
 'CarbonEmission']

In [None]:
# Column to predict; every other column is used as a model input.
target = 'CarbonEmission'
features = [col for col in data.columns if col != target]

# Infer column types from the feature columns only. Inspecting the full frame
# would let an object-typed target slip into `categorical_cols`, and the
# preprocessor would then ask for a column that is absent from X.
feature_frame = data[features]
categorical_cols = feature_frame.select_dtypes(include=['object']).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include=[np.number]).columns.tolist()

print('Categorical columns:', categorical_cols)
print('Numerical columns:', numerical_cols)


Categorical columns: ['Energy_Source', 'Transport', 'Frequency of Transport', 'Waste Production', 'Energy efficiency', 'Recycling Waste']
Numerical columns: ['How Long Machine Works Daily', 'Machine repairing(in Months)']


In [None]:
# Preprocessing: standardise the numeric inputs and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])
lr_model = LinearRegression()

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Chaining preprocessing and the regressor means the scaler and encoder are
# fitted on the training split only.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lr_model)
])

pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics that were just computed so the cell's output documents
# the model's quality instead of discarding the values.
print('Model: Linear Regression')
print('MSE:', mse)
print('R2:', r2)

# NOTE(review): only one model is evaluated in this cell, so "best performing"
# is not the result of a comparison.
print('Best performing model: Linear Regression')
print('done')

Model: Linear Regression
MSE: 598163.065
R2: 0.42468185812503045
Best performing model: Linear Regression
done


In [None]:
# Optional persistence step, currently disabled: the first line would serialise
# the fitted `pipeline` to 'pipe.pkl', the second would load it back as `pipe`.
# NOTE(review): if this is re-enabled, open the files with `with open(...)` so
# the handles are closed, and only unpickle files from a trusted source
# (unpickling can execute arbitrary code).
# pickle.dump(pipeline,open('pipe.pkl','wb'))
# pipe = pickle.load(open('pipe.pkl','rb'))