# **Multiple Linear Regression with scikit-learn**
Student Performance Dataset

# Import required libraries

We import:

pandas, numpy for data handling

sklearn for the regression model and evaluation (one-hot encoding is done with pandas)

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [3]:
# Build a reproducible synthetic student dataset and save it as a CSV file.
# pandas (pd) and numpy (np) are imported once in the first cell of the
# notebook, so they are not re-imported here.

np.random.seed(42)  # fixed seed: the same rows are generated on every run
n = 200   # number of students

# np.random.randint(low, high, n) excludes `high` (e.g. ages are 15..21).
# The random draws happen in the order the columns are listed, so reordering
# the entries below would change the generated values.
df = pd.DataFrame({
    "age": np.random.randint(15, 22, n),
    "address": np.random.choice(["U", "R"], n),
    "famsize": np.random.choice(["LE3", "GT3"], n),
    "reason": np.random.choice(["home", "reputation", "course", "other"], n),
    "studytime": np.random.randint(1, 5, n),
    "failures": np.random.randint(0, 4, n),
    "schoolsup": np.random.choice(["yes", "no"], n),
    "famsup": np.random.choice(["yes", "no"], n),
    "paid": np.random.choice(["yes", "no"], n),
    "activities": np.random.choice(["yes", "no"], n),
    "higher": np.random.choice(["yes", "no"], n),
    "internet": np.random.choice(["yes", "no"], n),
    "romantic": np.random.choice(["yes", "no"], n),
    "freetime": np.random.randint(1, 6, n),
    "goout": np.random.randint(1, 6, n),
    "health": np.random.randint(1, 6, n),
    "absences": np.random.randint(0, 30, n),
    "G1": np.random.randint(30, 100, n),
    "G2": np.random.randint(30, 100, n)
})

# Target variable: 40% of G1 plus 60% of G2 plus integer noise in -5..4
# (the upper bound 5 is exclusive), clipped to the 0-100 range.
df["G3"] = (0.4 * df["G1"] + 0.6 * df["G2"] + np.random.randint(-5, 5, n)).clip(0, 100)

# Save CSV; index=False keeps the row index out of the file.
df.to_csv("multiple_linear_data.csv", index=False)

print("Dataset created and saved as multiple_linear_data.csv")
df.head()


Dataset created and saved as multiple_linear_data.csv


Unnamed: 0,age,address,famsize,reason,studytime,failures,schoolsup,famsup,paid,activities,higher,internet,romantic,freetime,goout,health,absences,G1,G2,G3
0,21,R,GT3,reputation,2,2,no,yes,yes,no,no,yes,yes,2,1,4,15,34,49,42.0
1,18,U,LE3,course,3,0,no,no,yes,no,no,yes,yes,3,5,2,28,32,55,48.8
2,19,U,GT3,course,4,0,yes,yes,no,yes,no,no,yes,1,5,4,0,94,97,98.8
3,21,R,LE3,home,1,2,yes,yes,yes,no,yes,no,no,4,1,5,9,33,44,37.6
4,17,R,GT3,other,1,3,no,yes,yes,yes,yes,yes,no,2,3,5,17,93,43,60.0


# Load the dataset
Load the CSV file and display the first 5 rows to understand the structure.

Make sure the file exists at: multiple_linear_data.csv (in the notebook's working directory, where the previous cell saved it)



In [4]:
# Read the CSV back from the working directory into `df`, then preview the
# first five rows (5 is also the default for head()).
df = pd.read_csv(filepath_or_buffer="multiple_linear_data.csv")
df.head(n=5)


Unnamed: 0,age,address,famsize,reason,studytime,failures,schoolsup,famsup,paid,activities,higher,internet,romantic,freetime,goout,health,absences,G1,G2,G3
0,21,R,GT3,reputation,2,2,no,yes,yes,no,no,yes,yes,2,1,4,15,34,49,42.0
1,18,U,LE3,course,3,0,no,no,yes,no,no,yes,yes,3,5,2,28,32,55,48.8
2,19,U,GT3,course,4,0,yes,yes,no,yes,no,no,yes,1,5,4,0,94,97,98.8
3,21,R,LE3,home,1,2,yes,yes,yes,no,yes,no,no,4,1,5,9,33,44,37.6
4,17,R,GT3,other,1,3,no,yes,yes,yes,yes,yes,no,2,3,5,17,93,43,60.0


# Check basic dataset information
This helps identify:

Data types

Categorical vs numerical columns

Missing values


In [5]:
# Print a summary of `df`: each column's name, non-null count and dtype.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         200 non-null    int64  
 1   address     200 non-null    object 
 2   famsize     200 non-null    object 
 3   reason      200 non-null    object 
 4   studytime   200 non-null    int64  
 5   failures    200 non-null    int64  
 6   schoolsup   200 non-null    object 
 7   famsup      200 non-null    object 
 8   paid        200 non-null    object 
 9   activities  200 non-null    object 
 10  higher      200 non-null    object 
 11  internet    200 non-null    object 
 12  romantic    200 non-null    object 
 13  freetime    200 non-null    int64  
 14  goout       200 non-null    int64  
 15  health      200 non-null    int64  
 16  absences    200 non-null    int64  
 17  G1          200 non-null    int64  
 18  G2          200 non-null    int64  
 19  G3          200 non-null    f

# Separate categorical and numerical columns
We identify which columns need encoding.

In [6]:
# String-valued columns that are one-hot encoded before modelling.
_background_cols = ["address", "famsize", "reason"]
_yes_no_cols = [
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "higher",
    "internet",
    "romantic",
]
categorical_cols = _background_cols + _yes_no_cols

# Integer-valued columns that can be used as features without encoding.
_habit_cols = [
    "age",
    "studytime",
    "failures",
    "freetime",
    "goout",
    "health",
    "absences",
]
_earlier_grade_cols = ["G1", "G2"]
numerical_cols = _habit_cols + _earlier_grade_cols


# One-Hot Encode categorical features
Machine learning models need numeric input.
We convert categorical columns using One-Hot Encoding.

In [7]:
# Replace each categorical column with indicator columns. drop_first=True
# omits the first level of every category, so a k-level column becomes k-1
# indicator columns.
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

df_encoded.head(n=5)


Unnamed: 0,age,studytime,failures,freetime,goout,health,absences,G1,G2,G3,...,reason_home,reason_other,reason_reputation,schoolsup_yes,famsup_yes,paid_yes,activities_yes,higher_yes,internet_yes,romantic_yes
0,21,2,2,2,1,4,15,34,49,42.0,...,False,False,True,False,True,True,False,False,True,True
1,18,3,0,3,5,2,28,32,55,48.8,...,False,False,False,False,False,True,False,False,True,True
2,19,4,0,1,5,4,0,94,97,98.8,...,False,False,False,True,True,False,True,False,False,True
3,21,1,2,4,1,5,9,33,44,37.6,...,True,False,False,True,True,True,False,True,False,False
4,17,1,3,2,3,5,17,93,43,60.0,...,False,True,False,False,True,True,True,True,True,False


# Define X (features) and y (target)
X → all columns except G3

y → target variable G3

In [8]:
# y is the final grade (G3); X is every other column of the encoded frame.
y = df_encoded["G3"]
X = df_encoded.drop(columns=["G3"])

# Report both shapes as a quick sanity check on the split.
for label, part in (("X shape:", X), ("y shape:", y)):
    print(label, part.shape)


X shape: (200, 21)
y shape: (200,)


# Train Multiple Linear Regression Model
We initialize and train a linear regression model using all features.

In [9]:
# Fit ordinary least squares on all features; fit() returns the estimator
# itself, so construction and training can be chained.
model = LinearRegression().fit(X, y)

print("Model training completed")


Model training completed


# Predict on training data
The lab requires computing MSE on the same dataset.

In [10]:
# Predict G3 for the same rows the model was fitted on (training-set predictions).
y_pred = model.predict(X)


# Calculate Mean Squared Error (MSE)
MSE measures how far predictions are from actual values.

In [11]:
# Mean of the squared differences between the actual grades and the
# training-set predictions.
mse = mean_squared_error(y_true=y, y_pred=y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 7.540307832110909


# Predict on new custom data
We define a NumPy array manually and predict the final grade.

 The array must match the number and order of X columns.

In [12]:
# Create dummy input with the correct number of features.
# An all-zero row is only a placeholder: every feature contributes nothing,
# so the prediction equals the model's intercept. Replace the zeros with real
# values, in the order of X's columns, to score an actual student.
new_data = np.zeros((1, X.shape[1]))

# The model was fitted on a DataFrame, so attach X's column names before
# predicting. Passing the bare array makes scikit-learn warn that the input
# has no valid feature names, and gives no protection against a wrong
# column order.
new_sample = pd.DataFrame(new_data, columns=X.columns)

predicted_grade = model.predict(new_sample)
print("Predicted grade:", predicted_grade[0])


Predicted grade: -3.0494599015913835




# Model coefficients (optional but useful)
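Lists the 10 features with the largest (most positive) coefficients. Each coefficient is the change in the predicted final grade for a one-unit increase in that feature with the others held fixed; because the features are not scaled, the magnitudes are not directly comparable across features.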

In [13]:
# Pair every feature name with its fitted weight.
coefficients = pd.DataFrame(
    {"Feature": X.columns, "Coefficient": model.coef_}
)

# Order from the most positive weight downwards and show the first ten rows.
ranked = coefficients.sort_values("Coefficient", ascending=False)
ranked.head(10)


Unnamed: 0,Feature,Coefficient
20,romantic_yes,0.801448
14,schoolsup_yes,0.662203
8,G2,0.620746
7,G1,0.394476
12,reason_other,0.367314
16,paid_yes,0.359612
5,health,0.296321
15,famsup_yes,0.248308
1,studytime,0.21101
3,freetime,0.178816


# Ridge Regression (Optional)
Ridge helps reduce overfitting by penalizing large coefficients.

In [14]:
# Ridge regression: least squares with an L2 penalty on the coefficients;
# alpha=1.0 sets the penalty strength. Ridge is imported in the notebook's
# import cell at the top rather than here.
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)

# Scored on the training data so the value is comparable to the MSE of the
# plain linear model computed earlier.
ridge_pred = ridge.predict(X)
ridge_mse = mean_squared_error(y, ridge_pred)

print("Ridge Regression MSE:", ridge_mse)


Ridge Regression MSE: 7.540661822625733


# Lasso Regression (Optional)
Lasso can push some coefficients to zero (feature selection).

In [15]:
# Lasso regression: least squares with an L1 penalty on the coefficients;
# alpha=0.01 sets the penalty strength. Lasso is imported in the notebook's
# import cell at the top rather than here.
lasso = Lasso(alpha=0.01)
lasso.fit(X, y)

# Scored on the training data so the value is comparable to the MSE of the
# plain linear model computed earlier.
lasso_pred = lasso.predict(X)
lasso_mse = mean_squared_error(y, lasso_pred)

print("Lasso Regression MSE:", lasso_mse)


Lasso Regression MSE: 7.546587213390096


The Multiple Linear Regression model was successfully trained after converting
categorical features into numerical form using one-hot encoding.

The Mean Squared Error indicates the average squared difference between predicted
and actual final grades. Regularized models such as Ridge and Lasso help control
overfitting and can improve generalization.

This experiment demonstrates how linear models can be extended to handle multiple
features, including categorical data, as found in real-world datasets (the data
used here is synthetically generated).
