# Exercise 1.0

In [None]:
# Import libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the mpg dataset

# Load seaborn's "mpg" example dataset; the last expression previews the first rows
df = sns.load_dataset("mpg")
df.head()

In [None]:
# a) Start by doing some initial EDA such as info(),describe() and figure out what you want to do with the missing values.


# Basic structure: dataset dimensions as (rows, columns)

df.shape

In [None]:
# Basic structure: list the column names
df.columns

In [None]:
# Basic structure: column dtypes and non-null counts

df.info()

In [None]:
# Summary statistics; by default describe() only reports the numeric columns
df.describe()

In [None]:
# Inspect missing values: count the NaNs in each column
# (nothing is modified here; rows with NaNs are dropped later via df.dropna())

df.isna().sum()

In [None]:
# b) Use describe only on those columns that are relevant to get statistical information from.

# Columns for which summary statistics are meaningful
num_cols = ["mpg", "cylinders", "displacement", "horsepower",
            "weight", "acceleration", "model_year"]

# Restrict describe() to exactly those columns
df.loc[:, num_cols].describe()

In [None]:
# c) Make some plots on some of the columns that you find interesting.

# I.e. plots for interesting columns

# c.1) Histogram of the target variable MPG

fig, ax = plt.subplots()
ax.hist(df["mpg"], bins=30)
ax.set(xlabel="MPG", ylabel="Count", title="Distribution of MPG")
plt.show()



In [None]:
# c.2) Scatter plot: MPG against vehicle weight

fig, ax = plt.subplots()
ax.scatter(df["weight"], df["mpg"], alpha=0.7)
ax.set(xlabel="Weight", ylabel="MPG", title="MPG vs Weight")
plt.show()

In [None]:
# c.3) Scatter plot: MPG against horsepower

fig, ax = plt.subplots()
ax.scatter(df["horsepower"], df["mpg"], alpha=0.7)
ax.set(xlabel="Horsepower", ylabel="MPG", title="MPG vs Horsepower")
plt.show()


In [None]:
# c.4) Scatter plot: MPG against model year

fig, ax = plt.subplots()
ax.scatter(df["model_year"], df["mpg"], alpha=0.7)
ax.set(xlabel="Model Year", ylabel="MPG", title="MPG over Model Year")
plt.show()


In [None]:
# c.5) Box plot: MPG grouped by number of cylinders

fig, ax = plt.subplots()
sns.boxplot(x="cylinders", y="mpg", data=df, ax=ax)
ax.set_title("MPG by Nr of cylinders")
plt.show()

In [None]:
# d) Check if there are any columns you might want to drop

# d.1) Review all columns

df.columns


In [None]:
# d.2) Decide what to drop

# Candidates to drop: "name" and possibly "origin" (both are removed from the cleaned copy in Exercise 1.1)

# df = df.drop(columns=["name"])


Summary of steps (numbered)

- Load mpg dataset

- Use info() to understand structure

- Use describe() for overview

- Inspect missing values (horsepower); the affected rows are dropped later in Exercise 1.1

- Select relevant numeric columns

- Plot mpg distribution

- Plot mpg vs important features

- Identify columns to drop (name, origin); the actual drop happens in Exercise 1.1

# Exercise 1.1

In [None]:
# a) We want to predict the "mpg", split up X and y, and perform train|test split using scikit-learn. Choose test_size of 0.2 and random_state 42. Control the shapes of each X_train, X_test, y_train, y_test.

In [None]:
# Drop rows with missing values
# (dropna() removes every row containing any NaN, not only those missing horsepower;
#  .copy() makes df_clean independent of df)

df_clean = df.dropna().copy()
df_clean.shape

In [None]:
# Drop the "name" column, which is not used as a feature.
# errors="ignore" keeps this cell idempotent: re-running it after "name" has
# already been removed no longer raises a KeyError.

df_clean = df_clean.drop(columns=["name"], errors="ignore")
df_clean.columns

In [None]:
# Drop the "origin" column as well, so it is not part of the feature matrix.
# errors="ignore" keeps this cell idempotent: re-running it after "origin" has
# already been removed no longer raises a KeyError.
df_clean = df_clean.drop(columns=["origin"], errors="ignore")
df_clean.columns

In [None]:
# a) Split X and y and do train test split


# 1) Define target variable y: the "mpg" column we want to predict

y = df_clean["mpg"]


In [None]:
# 2) Define feature matrix X: every column of df_clean except the target "mpg"

X = df_clean.drop(columns=["mpg"])


In [None]:
# Show the dimensions of X and y before splitting
X.shape, y.shape

In [None]:
# 3) Import train | test split

from sklearn.model_selection import train_test_split


In [None]:
# 4) Split into train and test sets: 20 % held out for testing, fixed seed for reproducibility

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# 5) Check shapes of the four parts produced by the split

for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)


In [None]:
# b) Create a function for training a regression model, predicting and computing the metrics MAE, MSE, RMSE. It should take in parameters of X_train, X_test, y_train, y_test, model. Now create a linear regression model using scikit-learns LinearRegression() (OLS normal equation with SVD) and call your function to get metrics.

# I.e. a function for training, predicting, and computing metrics

# 6) Import metrics

from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np


In [None]:
# 7) Create evaluation function


def train_evaluate_model(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training data and score its test-set predictions.

    Parameters
    ----------
    X_train, y_train
        Features and targets passed to ``model.fit``.
    X_test, y_test
        Features passed to ``model.predict`` and the true targets the
        predictions are compared against.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called
        on the object that was passed in.

    Returns
    -------
    tuple
        ``(mae, mse, rmse)``, where ``rmse`` is the square root of ``mse``.
    """
    # Train on the training split only
    model.fit(X_train, y_train)

    # Predict on the held-out split
    predictions = model.predict(X_test)

    # RMSE is derived from MSE, so compute MSE once and reuse it
    mean_sq_err = mean_squared_error(y_test, predictions)
    return (
        mean_absolute_error(y_test, predictions),
        mean_sq_err,
        np.sqrt(mean_sq_err),
    )


In [None]:
# 8) Import Linear Regression model

from sklearn.linear_model import LinearRegression


In [None]:
# 9) Create Linear Regression model (constructed with no arguments, i.e. default settings)

lin_reg = LinearRegression()


In [None]:
# 10) Train the linear model and collect its test-set metrics

mae, mse, rmse = train_evaluate_model(X_train, X_test, y_train, y_test, lin_reg)


In [None]:
# 11) Print each metric on its own labelled line

for metric_name, metric_value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
    print(f"{metric_name}:", metric_value)


Summary of steps (numbered 1–11 in the code comments above)

- Define target variable y

- Define feature matrix X

- Import train test split

- Split data into train and test sets

- Check shapes of splits

- Import regression metrics

- Create training and evaluation function

- Import LinearRegression

- Create model instance

- Train model and compute metrics

- Print MAE, MSE, RMSE