# **Title : Mielage_Prediction**

**Importing modules**

In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, r2_score


**Load the dataset**

In [42]:
data = pd.read_csv('/content/MPG.csv')

In [43]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [45]:
data.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [46]:
data.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [47]:
data.isnull().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,6
weight,0
acceleration,0
model_year,0
origin,0
name,0


In [48]:
data.shape

(398, 9)

**Drop and 'name' columns from the input features**

In [49]:
X = data.drop(columns=['mpg', 'name'])
y = data['mpg']

In [50]:
X.isnull().sum()

Unnamed: 0,0
cylinders,0
displacement,0
horsepower,6
weight,0
acceleration,0
model_year,0
origin,0



# Handle missing values in numerical features


In [51]:
from sklearn.impute import SimpleImputer
# Separate numeric columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
# Create the imputer for filling missing values with mean
imputer = SimpleImputer(strategy='mean')
# Apply imputer only to numeric columns
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

**Handling Categorical Variables**

In [52]:
X = pd.get_dummies(X, columns=['origin'], drop_first=True)

# Scale numerical features

In [53]:
from sklearn.preprocessing import StandardScaler

In [54]:
ss = StandardScaler()
X = ss.fit_transform(X)

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define regression models to try

In [56]:
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Support Vector Regression (SVR)': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(n_neighbors=5)
}

# Function to evaluate models

In [57]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on test set

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mae, r2


# Iterate through models, train and evaluate them

In [58]:
for name, model in models.items():
    print(f"Training {name}...")
    mae, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    print(f"{name} - MAE: {mae:.4f}, R^2: {r2:.4f}")


Training Linear Regression...
Linear Regression - MAE: 2.2876, R^2: 0.8449
Training Random Forest...
Random Forest - MAE: 1.5989, R^2: 0.9128
Training Gradient Boosting...
Gradient Boosting - MAE: 1.7352, R^2: 0.9022
Training Support Vector Regression (SVR)...
Support Vector Regression (SVR) - MAE: 1.8140, R^2: 0.8743
Training K-Nearest Neighbors...
K-Nearest Neighbors - MAE: 1.8078, R^2: 0.9069
