In [11]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# -------------------------
# 1. Load the dataset
# -------------------------
df = pd.read_csv("data/stud.csv")

In [6]:

# -------------------------
# 2. Encode categorical columns
# -------------------------
categorical_cols = ["gender", "race_ethnicity", "parental_level_of_education", "lunch", "test_preparation_course"]

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

In [9]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,0,4,5,1,1,61,51,48
1,0,0,5,1,1,40,91,87
2,1,2,0,1,0,38,48,74
3,0,0,5,1,0,81,43,100
4,0,2,4,1,0,33,94,31


In [7]:
# -------------------------
# 3. Split features and target
# -------------------------
X = df[categorical_cols]  # Features: categorical columns
y = df[["math_score", "reading_score", "writing_score"]]  # Targets

In [10]:
X

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,0,4,5,1,1
1,0,0,5,1,1
2,1,2,0,1,0
3,0,0,5,1,0
4,0,2,4,1,0
...,...,...,...,...,...
95,1,1,5,1,1
96,0,3,0,0,0
97,1,0,4,0,0
98,0,2,3,1,1


In [11]:

# -------------------------
# 4. Train-test split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# -------------------------
# 5. Feature scaling (optional for tree-based models)
# -------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# -------------------------
# 6. Model training
# Using RandomForestRegressor for multi-output regression
# -------------------------
from sklearn.multioutput import MultiOutputRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
multi_rf = MultiOutputRegressor(rf)
multi_rf.fit(X_train_scaled, y_train)

0,1,2
,estimator,RandomForestR...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# -------------------------
# 7. Predictions
# -------------------------
y_pred = multi_rf.predict(X_test_scaled)

In [15]:
y_pred

array([[72.84      , 48.7       , 63.81      ],
       [67.90533333, 49.68995238, 65.28166667],
       [67.90533333, 49.68995238, 65.28166667],
       [65.37166667, 53.81      , 56.59      ],
       [70.53      , 69.72      , 70.66      ],
       [79.48      , 55.31      , 51.73      ],
       [66.3       , 42.98      , 63.5       ],
       [69.6       , 53.28      , 67.3       ],
       [56.54416667, 67.1775    , 64.84833333],
       [51.00266667, 70.74916667, 76.58733333],
       [67.47      , 70.59333333, 85.16583333],
       [64.46321429, 69.076     , 71.16      ],
       [75.985     , 69.26666667, 85.405     ],
       [54.41      , 64.8875    , 66.53583333],
       [71.55190476, 71.40711905, 78.19333333],
       [54.374     , 55.75      , 57.1325    ],
       [61.61490476, 69.89678571, 51.75160714],
       [81.96      , 85.36      , 36.33      ],
       [63.875     , 68.3525    , 56.89333333],
       [55.64066667, 72.37128571, 72.15157143]])

In [16]:
# -------------------------
# 8. Model evaluation
# -------------------------
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R2 Score:", r2)


Mean Squared Error: 630.5863753850646
R2 Score: -0.3076544289872985


In [20]:
# Encode categorical columns with separate LabelEncoders
categorical_cols = ["gender", "race_ethnicity", "parental_level_of_education", "lunch", "test_preparation_course"]
encoders = {}  # store encoders for later use
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [26]:
new_student = np.array([[
    encoders['gender'].transform(['female'])[0],
    encoders['race_ethnicity'].transform(['group B'])[0],
    encoders['parental_level_of_education'].transform(["bachelor's degree"])[0],
    encoders['lunch'].transform(['standard'])[0],
    encoders['test_preparation_course'].transform(['completed'])[0]
]])

new_student_scaled = scaler.transform(new_student)
predicted_scores = multi_rf.predict(new_student_scaled)
print("Predicted scores (math, reading, writing):", predicted_scores)

Predicted scores (math, reading, writing): [[72.84 48.7  63.81]]




In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

In [9]:
df = pd.read_csv("data/stud.csv")

In [14]:

# Define columns
numeric_cols = ["math_score", "reading_score", "writing_score"]
categorical_cols = ["gender", "race_ethnicity", "parental_level_of_education", "lunch", "test_preparation_course"]

# Example: apply log transformation to math_score
log_transformer = FunctionTransformer(np.log1p)  # log(1 + x) to avoid log(0)

# ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ("num_scaler", StandardScaler(), numeric_cols),          # scale numeric columns
    ("cat_encoder", OneHotEncoder(drop='first'), categorical_cols),  # one-hot encode categorical columns
    ("log_math", log_transformer, ["math_score"])            # log transform math_score
])

# Split features and target
X = df[categorical_cols + numeric_cols]
y = df[numeric_cols]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the transformer
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print("Original shape:", X_train.shape)
print("Transformed shape:", X_train_transformed.shape)

# Example: train RandomForest on transformed data
rf = RandomForestRegressor(n_estimators=100, random_state=42)
multi_rf = MultiOutputRegressor(rf)
multi_rf.fit(X_train_transformed, y_train)
y_pred = multi_rf.predict(X_test_transformed)

print("Prediction example:\n", y_pred[:5])

Original shape: (80, 8)
Transformed shape: (80, 16)
Prediction example:
 [[54.06 38.43 32.5 ]
 [79.06 34.28 60.75]
 [94.63 57.48 94.51]
 [40.01 97.86 66.22]
 [86.16 37.94 94.13]]
