In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear-regression baseline on the Auto MPG CSV: load the file, drop rows
# with missing values, fit on an 80/20 split, and print MSE and R2 measured
# on the held-out rows.

# Step 1: Load dataset
# Forward slashes keep this relative path usable on Windows, Linux and macOS;
# the earlier backslash form only resolved on Windows.
data = pd.read_csv("data/auto-mpg.csv")
print("First 5 rows of data:\n", data.head(), "\n")

# Step 2: Rename columns for easier handling
# Reassign instead of mutating in place so each step yields a fresh frame.
data = data.rename(columns={'model year': 'model_year', 'car name': 'car_name'})

# Step 3: Replace '?' with a missing-value marker and drop incomplete rows
# Note that dropna() looks at every column, not only the model features.
data = data.replace('?', pd.NA).dropna()

# Step 4: Convert horsepower to float
# Presumably the '?' placeholders kept pandas from parsing this column as
# numeric on load, hence the explicit cast once they are gone.
data['horsepower'] = data['horsepower'].astype(float)

# Step 5: Select features and target
# NOTE(review): 'origin' is fed to the model as a plain number; it looks like
# a category code (the preview shows the value 1) -- confirm whether it
# should be one-hot encoded instead.
features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin']
X = data[features]
y = data['mpg']
print("Final dataset shape:", X.shape, y.shape, "\n")

# Step 6: Split into training & test set
# 20% of the rows are held out; the fixed random_state keeps the split, and
# therefore the reported metrics, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Step 7: Train Linear Regression model
lrr = LinearRegression()
lrr.fit(x_train, y_train)

# Step 8: Predictions on the held-out rows only
y_pred = lrr.predict(x_test)

# Step 9: Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression MSE:", mse)
print("Linear Regression R2:", r2)


First 5 rows of data:
     mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   
3  16.0          8         304.0        150    3433          12.0          70   
4  17.0          8         302.0        140    3449          10.5          70   

   origin                   car name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino   

Final dataset shape: (392, 7) (392,) 

Linear Regression MSE: 11.669166827122238
Linear Regression R2: 0.7912565395128667
