In [12]:
import os
import pandas as pd

# Load the data
data = pd.read_csv('CO2 Emissions_Canada.csv')

# Data cleaning process.....
data = data.dropna()  # Remove rows with missing values
data.columns = data.columns.str.lower().str.replace(' ', '').str.replace('[^a-z0-9]', '', regex=True)

# Define output file path
output_dir = r'Vehicle_Emission_Predictor/data'
output_file = os.path.join(output_dir, 'cleaned_emissions.csv')

# Ensure the directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the cleaned dataset
try:
    data.to_csv(output_file, index=False)
    print(f"Cleaned data saved to: {output_file}")
except Exception as e:
    print(f"An error occurred: {e}")


Cleaned data saved to: Vehicle_Emission_Predictor/data\cleaned_emissions.csv


In [13]:
import pandas as pd

# Path to the cleaned data
data_path = r'Vehicle_Emission_Predictor/data/cleaned_emissions.csv'

# Load the cleaned data
try:
    data = pd.read_csv(data_path)
    print(f"Data loaded successfully from: {data_path}")
except FileNotFoundError:
    print(f"File not found at the specified path: {data_path}")
    exit()
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    exit()

# Inspect the dataset
print("First few rows of the dataset:\n", data.head())
print("\nColumns in the dataset:", data.columns)

# Data cleaning (if necessary)
print("\nMissing values in each column:\n", data.isnull().sum())
data = data.dropna()  # Drop rows with missing values
print("\nData after cleaning:\n", data.head())

# Select features and target
try:
    X = data[['engine_size', 'year', 'fuel_type']]
    y = data['emissions']
    print("\nFeatures and target selected successfully.")
except KeyError as e:
    print(f"Error: {e}")
    print("Please check the column names in the dataset.")


Data loaded successfully from: Vehicle_Emission_Predictor/data/cleaned_emissions.csv
First few rows of the dataset:
     make       model vehicleclass  enginesizel  cylinders transmission  \
0  ACURA         ILX      COMPACT          2.0          4          AS5   
1  ACURA         ILX      COMPACT          2.4          4           M6   
2  ACURA  ILX HYBRID      COMPACT          1.5          4          AV7   
3  ACURA     MDX 4WD  SUV - SMALL          3.5          6          AS6   
4  ACURA     RDX AWD  SUV - SMALL          3.5          6          AS6   

  fueltype  fuelconsumptioncityl100km  fuelconsumptionhwyl100km  \
0        Z                        9.9                       6.7   
1        Z                       11.2                       7.7   
2        Z                        6.0                       5.8   
3        Z                       12.7                       9.1   
4        Z                       12.1                       8.7   

   fuelconsumptioncombl100km  fuelc

In [18]:
import pandas as pd
import random

# Load the dataset
project_dir = os.getcwd()  # Assuming you're running from the project directory
file_path = os.path.join(project_dir, 'Vehicle_Emission_Predictor/data/cleaned_emissions.csv')

data = pd.read_csv(file_path)

# List of valid fuel types
fuel_types = ['Petrol', 'Diesel', 'Electric', 'Hybrid']

# Randomly assign a fuel type from the list for each row in the 'fueltype' column
data['fueltype'] = [random.choice(fuel_types) for _ in range(len(data))]

# Optionally, you can save this modified dataset to a new CSV file
modified_file_path = os.path.join(project_dir, 'Vehicle_Emission_Predictor/data/modified_emissions.csv')
data.to_csv(modified_file_path, index=False)

# Check the first few rows of the modified dataset
print(data.head())


    make       model vehicleclass  enginesizel  cylinders transmission  \
0  ACURA         ILX      COMPACT          2.0          4          AS5   
1  ACURA         ILX      COMPACT          2.4          4           M6   
2  ACURA  ILX HYBRID      COMPACT          1.5          4          AV7   
3  ACURA     MDX 4WD  SUV - SMALL          3.5          6          AS6   
4  ACURA     RDX AWD  SUV - SMALL          3.5          6          AS6   

  fueltype  fuelconsumptioncityl100km  fuelconsumptionhwyl100km  \
0   Petrol                        9.9                       6.7   
1   Petrol                       11.2                       7.7   
2   Hybrid                        6.0                       5.8   
3   Hybrid                       12.7                       9.1   
4   Hybrid                       12.1                       8.7   

   fuelconsumptioncombl100km  fuelconsumptioncombmpg  co2emissionsgkm  
0                        8.5                      33              196  
1       

In [25]:
print(os.getcwd())

C:\Users\kumar\Desktop\Aaradhya's Project


In [26]:
import os
import pandas as pd

# Get the current working directory
project_dir = os.getcwd()
print
# Build the path to the cleaned_emissions.csv file based on the current directory
file_path = os.path.join(project_dir, 'Vehicle_emission_Predictor','data', 'cleaned_emissions.csv')

# Print the file path to ensure it's correct
print(f"File path: {file_path}")

# Check if the file exists
if not os.path.exists(file_path):
    print(f"Error: The file at {file_path} does not exist.")
else:
    # Load the cleaned dataset
    try:
        data = pd.read_csv(file_path)
        # Print the first few rows of the dataset to check
        print(data.head())

        # Rename columns for better clarity
        data = data.rename(columns={
            'enginesizel': 'engine_size',
            'fueltype': 'fuel',
            'co2emissionsgkm': 'co2_emissions',
            'fuelconsumptioncityl100km': 'fuel_consumption_city',
            'fuelconsumptionhwyl100km': 'fuel_consumption_highway'
        })

        # Print the modified dataset columns
        print(data.head())

    except Exception as e:
        print(f"Error loading the file: {e}")


File path: C:\Users\kumar\Desktop\Aaradhya's Project\Vehicle_emission_Predictor\data\cleaned_emissions.csv
    make       model vehicleclass  enginesizel  cylinders transmission  \
0  ACURA         ILX      COMPACT          2.0          4          AS5   
1  ACURA         ILX      COMPACT          2.4          4           M6   
2  ACURA  ILX HYBRID      COMPACT          1.5          4          AV7   
3  ACURA     MDX 4WD  SUV - SMALL          3.5          6          AS6   
4  ACURA     RDX AWD  SUV - SMALL          3.5          6          AS6   

  fueltype  fuelconsumptioncityl100km  fuelconsumptionhwyl100km  \
0        Z                        9.9                       6.7   
1        Z                       11.2                       7.7   
2        Z                        6.0                       5.8   
3        Z                       12.7                       9.1   
4        Z                       12.1                       8.7   

   fuelconsumptioncombl100km  fuelconsumption

In [27]:
# Select the features (X) and target (y)
X = data[['engine_size', 'fuel', 'fuel_consumption_city', 'fuel_consumption_highway', 'cylinders']]  # Example features
y = data['co2_emissions']  # Target variable

# One-hot encode the 'fuel' column and drop the first category to avoid multicollinearity
X = pd.get_dummies(X, columns=['fuel'], drop_first=True)

# Check if one-hot encoding worked
print(X.head())


   engine_size  fuel_consumption_city  fuel_consumption_highway  cylinders  \
0          2.0                    9.9                       6.7          4   
1          2.4                   11.2                       7.7          4   
2          1.5                    6.0                       5.8          4   
3          3.5                   12.7                       9.1          6   
4          3.5                   12.1                       8.7          6   

   fuel_E  fuel_N  fuel_X  fuel_Z  
0   False   False   False    True  
1   False   False   False    True  
2   False   False   False    True  
3   False   False   False    True  
4   False   False   False    True  


In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the split data shapes
print(X_train.shape, X_test.shape)


(5908, 8) (1477, 8)


In [29]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the scaler on the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Ensure the scaling was done correctly
print(X_train_scaled[:5])


[[-0.12149345 -0.33509065 -0.42937884  0.20534188 -0.22802259  0.
  -0.98822103  1.14162772]
 [ 0.6141664   0.6072775   0.56440556  0.20534188 -0.22802259  0.
   1.01191937 -0.87594229]
 [-0.12149345  0.350268   -0.02283068  0.20534188 -0.22802259  0.
  -0.98822103  1.14162772]
 [ 0.17277049 -0.36364727 -0.51972288  0.20534188 -0.22802259  0.
  -0.98822103  1.14162772]
 [-0.85715331 -0.70632659 -0.92627105 -0.88513888 -0.22802259  0.
  -0.98822103  1.14162772]]


In [30]:
# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Check the R-squared score on the training data
print(f"Training R-squared: {model.score(X_train_scaled, y_train):.4f}")

# Check the R-squared score on the testing data
print(f"Testing R-squared: {model.score(X_test_scaled, y_test):.4f}")


Training R-squared: 0.9915
Testing R-squared: 0.9884


In [31]:
from sklearn.metrics import mean_squared_error

# Predict the values on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")


Mean Squared Error: 39.86


In [32]:
# Create a directory for saving the model and scaler
model_dir = os.path.join(project_dir, 'model')
os.makedirs(model_dir, exist_ok=True)

# Save the trained model
with open(os.path.join(model_dir, 'emissions_model.pkl'), 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the scaler
with open(os.path.join(model_dir, 'scaler.pkl'), 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [33]:
# Load the saved model and scaler
with open(os.path.join(model_dir, 'emissions_model.pkl'), 'rb') as model_file:
    model = pickle.load(model_file)

with open(os.path.join(model_dir, 'scaler.pkl'), 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Example input for prediction (engine_size, year, fuel_type, fuel_consumption_city, fuel_consumption_highway, cylinders)
example_input = [2.0, 2022, 'Diesel', 8.0, 7.5, 4]

# One-hot encode the fuel type in the input
fuel_encoded = {'Petrol': [1, 0, 0], 'Diesel': [0, 1, 0], 'Electric': [0, 0, 1], 'Hybrid': [0, 0, 0]}[example_input[2]]

# Combine the input features
input_features = [example_input[0], example_input[1]] + fuel_encoded + example_input[3:5] + [example_input[5]]

# Scale the input features using the saved scaler
input_scaled = scaler.transform([input_features])

# Make the prediction
predicted_emissions = model.predict(input_scaled)

# Output the prediction
print(f"Predicted CO2 Emissions: {predicted_emissions[0]:.2f} kg/year")


Predicted CO2 Emissions: 25171.06 kg/year


