# Assignment 15: Multiple Linear Regression

## Dataset: Toyota Corolla Price Prediction

**Topics Covered:**
- Linear Regression
- R-squared, Adjusted R-squared
- Feature Selection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Load data
# The CSV is looked up relative to the notebook's working directory.
DATA_PATH = 'ToyotaCorolla - MLR.csv'
df = pd.read_csv(DATA_PATH)
print("Dataset loaded! Shape:", df.shape)
df.head()

In [None]:
# Check data info
# Show each column's dtype, then the total count of missing cells in the frame.
column_types = df.dtypes
missing_total = df.isnull().sum().sum()
print(column_types)
print("\nMissing values:", missing_total)

In [None]:
# Select numerical columns for regression
# Only numeric dtypes are kept; any non-numeric column is left out of the model.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
print("Numerical columns:", numerical_cols)

# Use 'Price' as the target when it is present; otherwise fall back to the
# last numeric column and treat every column before it as a feature.
has_price = 'Price' in numerical_cols
target = 'Price' if has_price else numerical_cols[-1]
feature_cols = (
    [name for name in numerical_cols if name != target]
    if has_price
    else numerical_cols[:-1]
)

In [None]:
# Prepare data
# Drop rows with a missing value in ANY feature or in the target. Filtering on
# the features alone would leave NaN targets in y, which LinearRegression.fit
# rejects with a ValueError.
model_data = df[feature_cols + [target]].dropna()
X = model_data[feature_cols]
y = model_data[target]

# Split data: hold out 20% of the rows for testing. The fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training:", len(X_train), "Testing:", len(X_test))

In [None]:
# Train Linear Regression on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = lr.predict(X_test)

# Evaluate on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the target's units
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Adjusted R-squared penalises R-squared for the number of predictors p:
#   adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
# The formula is undefined when n - p - 1 <= 0, so report NaN in that case
# instead of dividing by zero or a negative count.
n_obs, n_predictors = X_test.shape
residual_dof = n_obs - n_predictors - 1
adj_r2 = 1 - (1 - r2) * (n_obs - 1) / residual_dof if residual_dof > 0 else np.nan

print("=== Model Evaluation ===")
print("MSE:", round(mse, 2))
print("RMSE:", round(rmse, 2))
print("MAE:", round(mae, 2))
print("R-squared:", round(r2, 4))
print("Adjusted R-squared:", round(adj_r2, 4))

In [None]:
# Coefficients
# Pair every feature name with its fitted coefficient, then rank the rows by
# absolute coefficient size (largest first).
print("=== Coefficients ===")
coef_table = pd.DataFrame({'Feature': feature_cols, 'Coefficient': lr.coef_})
coef_df = coef_table.sort_values(by='Coefficient', key=abs, ascending=False)
print(coef_df)

In [None]:
# Actual vs Predicted
# Each point is one test-set row; the dashed red diagonal marks where the
# predicted value equals the actual value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices')
fig.savefig('actual_vs_predicted.png')
plt.show()

In [None]:
# Residuals
# Residual = actual minus predicted, computed on the test split.
residuals = y_test - y_pred

# Histogram of the residuals, saved to disk and then displayed.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(residuals, bins=30, color='steelblue', edgecolor='black')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Residuals')
fig.savefig('residuals.png')
plt.show()

## Summary

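- Built a multiple linear regression model to predict car prices from the dataset's numeric columns.
- R-squared on the held-out 20% test split measures the share of price variance the model explains; see the evaluation cell for the value.
- The coefficients show the direction and size of each feature's effect. Because the features are not standardized, coefficient magnitudes depend on each feature's units and are not directly comparable.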