# Car Fuel Efficiency Prediction System

This notebook implements a complete data science lifecycle for predicting car fuel efficiency (MPG) using the Auto MPG dataset.

In [None]:
# Import libraries
from io import StringIO
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xgboost as xgb
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Notebook-wide plotting defaults: seaborn grid theme and a default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 1. Data Collection

In [None]:
# Download the raw Auto MPG file from the UCI repository.
# The file is whitespace-delimited and has no header row, so column names are supplied here.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = StringIO(response.text)

# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
df = pd.read_csv(data, sep=r'\s+', names=column_names)

# Save to data folder, creating it first so a fresh checkout does not fail here.
raw_csv_path = Path('data/auto-mpg.csv')
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_csv_path, index=False)
print('Dataset downloaded and saved to data/auto-mpg.csv')

## 2. Data Cleaning

In [None]:
# Lookup from the numeric origin code stored in the file to a readable label
origin_map = {1: 'USA', 2: 'Europe', 3: 'Japan'}

# Continuous measurements that must end up as float
float_columns = ['horsepower', 'weight', 'displacement', 'acceleration']

# Build the cleaned frame in one pass, starting from the saved raw CSV:
#   1. '?' is turned into NaN and every row with a missing value is dropped
#   2. the measurement columns are cast to float
#   3. the car_name column is removed
#   4. origin codes are replaced by their labels
df = (
    pd.read_csv('data/auto-mpg.csv')
    .replace('?', np.nan)
    .dropna()
    .astype({column: float for column in float_columns})
    .drop(columns='car_name')
    .assign(origin=lambda frame: frame['origin'].map(origin_map))
)

print('Data cleaned. Shape:', df.shape)
print(df.head())

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Descriptive statistics for the numeric columns
print('Summary Statistics:')
print(df.describe())

# Histogram of the target with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['mpg'], kde=True, ax=ax)
ax.set_title('Distribution of MPG')
plt.show()

# Pairwise correlations between the numeric columns, computed once and reused below
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Scatter-matrix of every numeric column against every other
sns.pairplot(numeric_df)
plt.show()

# MPG spread per group, one boxplot for each grouping column
for group_column, plot_title in (('cylinders', 'MPG vs Cylinders'), ('origin', 'MPG vs Origin')):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=group_column, y='mpg', data=df, ax=ax)
    ax.set_title(plot_title)
    plt.show()

# Rank columns by absolute correlation with mpg; the first entry after sorting
# is skipped when printing (mpg correlates perfectly with itself, so it sorts to the top)
corr_with_mpg = corr_matrix['mpg'].abs().sort_values(ascending=False)
print('Features most correlated with MPG:')
print(corr_with_mpg.iloc[1:])

## 4. Feature Engineering

In [None]:
# Split the cleaned frame into the feature matrix and the target,
# adding one derived feature: horsepower divided by weight.
X = df.drop(columns='mpg').assign(
    power_to_weight=lambda frame: frame['horsepower'] / frame['weight']
)
y = df['mpg']

# Columns routed to each preprocessing branch
numeric_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'power_to_weight']
categorical_features = ['origin']

# Numeric columns are standardised; origin is one-hot encoded.
# handle_unknown='ignore' makes a category that was not present at fit time encode
# as all zeros instead of raising, so the pipeline cannot crash at predict time
# on an origin value missing from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Hold out 20% of the rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Feature engineering completed. Training set shape:', X_train.shape)

## 5. Model Training

In [None]:
# Candidate regressors; the stochastic ones get a fixed seed
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42)
}

# Fit every candidate inside its own pipeline and score it on the held-out split.
# Fitted pipelines are kept so the winner can be reused without training it again.
results = {}
fitted_pipelines = {}
for name, model in models.items():
    # clone() gives each pipeline a private, unfitted copy of the preprocessor,
    # so the stored pipelines do not share one mutable transformer instance.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])
    pipeline.fit(X_train, y_train)
    fitted_pipelines[name] = pipeline

    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}
    print(f'{name}: MSE={mse:.2f}, RMSE={rmse:.2f}, MAE={mae:.2f}, R2={r2:.2f}')

# Select the candidate with the highest R2.
# NOTE(review): selection uses the same test split the metrics are reported on,
# so the winner's reported score is optimistic; consider cross-validation on the
# training split for model selection.
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model = models[best_model_name]
best_pipeline = fitted_pipelines[best_model_name]
print(f'Best model: {best_model_name}')

## 6. Model Evaluation

In [None]:
# One row per model, one column per metric
results_df = pd.DataFrame.from_dict(results, orient='index')
print('Model Performance Summary:')
print(results_df)

# Predictions of the selected pipeline on the held-out split
y_pred_best = best_pipeline.predict(X_test)

# Actual vs predicted scatter; the dashed red diagonal marks a perfect prediction
mpg_low, mpg_high = y_test.min(), y_test.max()
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred_best)
ax.plot([mpg_low, mpg_high], [mpg_low, mpg_high], 'r--')
ax.set_xlabel('Actual MPG')
ax.set_ylabel('Predicted MPG')
ax.set_title(f'Actual vs Predicted MPG ({best_model_name})')
plt.show()

# Histogram of the prediction errors (actual minus predicted)
residuals = y_test - y_pred_best
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title(f'Residual Distribution ({best_model_name})')
ax.set_xlabel('Residuals')
plt.show()

## 7. Model Saving

In [None]:
# Output locations for the fitted pipeline and the cleaned dataset
model_path = Path('models/mpg_predictor.joblib')
cleaned_csv_path = Path('data/auto_mpg_cleaned.csv')

# Create the target folders up front; without this, joblib.dump / to_csv raise
# FileNotFoundError when the folder does not exist yet.
for output_path in (model_path, cleaned_csv_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Save best model (the full pipeline: preprocessing + regressor)
joblib.dump(best_pipeline, model_path)

# Save cleaned dataset
df.to_csv(cleaned_csv_path, index=False)

# The check-mark was stored as mis-decoded UTF-8 ('âœ…'); restored to the intended character.
print('✅ Car Fuel Efficiency Prediction System executed successfully. Model saved at models/mpg_predictor.joblib')