In [1]:
#Exercise1 
import numpy as np
from sklearn.model_selection import KFold

X = np.array(np.arange(1, 21).reshape(10, -1))
y = np.array(np.arange(1, 11))
kf = KFold(n_splits=5)

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    print(f"Fold: {fold}")
    print(f"TRAIN: {train_index} TEST: {test_index}\n")
    



Fold: 1
TRAIN: [2 3 4 5 6 7 8 9] TEST: [0 1]

Fold: 2
TRAIN: [0 1 4 5 6 7 8 9] TEST: [2 3]

Fold: 3
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5]

Fold: 4
TRAIN: [0 1 2 3 4 5 8 9] TEST: [6 7]

Fold: 5
TRAIN: [0 1 2 3 4 5 6 7] TEST: [8 9]



In [2]:
#Exercise2
# imports
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# data
housing = fetch_california_housing()
X, y = housing['data'], housing['target']

# split data train test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=43)

# pipeline
pipeline = [('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('lr', LinearRegression())]
pipe = Pipeline(pipeline)

from sklearn.model_selection import cross_validate

# Cross-validate with 10 folds
cv_results = cross_validate(pipe, X_train, y_train, cv=10, scoring='r2')

scores = cv_results['test_score']
print("Scores on validation sets:")
print(scores)

mean_score = scores.mean()
print("\nMean of scores on validation sets:")
print(mean_score)

std_score = scores.std()
print("\nStandard deviation of scores on validation sets:")
print(std_score)

Scores on validation sets:
[0.62433594 0.61648956 0.62486602 0.59891024 0.59284295 0.61307055
 0.54630341 0.60742976 0.60014575 0.59574508]

Mean of scores on validation sets:
0.6020139252674299

Standard deviation of scores on validation sets:
0.02149838227734666


In [3]:
#Exercise 3
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# data
housing = fetch_california_housing()
X, y = housing['data'], housing['target']
# split data train test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    random_state=43)

param_grid = {
    'max_depth': [1, 5, 10, 15, 20],  # Tree depth: deeper trees can overfit
    'n_estimators': [10, 50, 100]     # Number of trees: more trees reduce variance but increase time
}

rf = RandomForestRegressor(random_state=43)  

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Negative MSE (since GridSearch maximizes)
    n_jobs=-1  # Use all CPUs
)

grid_search.fit(X_train, y_train)



0,1,2
,estimator,RandomForestR...ndom_state=43)
,param_grid,"{'max_depth': [1, 5, ...], 'n_estimators': [10, 50, ...]}"
,scoring,'neg_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [4]:
best_estimator = grid_search.best_estimator_
print("Best parameters:")
print(grid_search.best_params_)

print("\nBest score on validation sets (neg MSE):")
print(grid_search.best_score_)

print("\nCV results:")
print(grid_search.cv_results_)

Best parameters:
{'max_depth': 20, 'n_estimators': 100}

Best score on validation sets (neg MSE):
-0.25679220412884723

CV results:
{'mean_fit_time': array([ 0.80924358,  3.32500343,  5.66396303,  2.41164465, 12.70176373,
       27.05458579,  6.21271873, 25.66022062, 51.17474174,  7.60654063,
       35.87718658, 69.52356701,  8.71700459, 37.51236758, 64.21522956]), 'std_fit_time': array([0.14895978, 0.1456139 , 0.23968351, 0.07435274, 0.23395386,
       1.51186278, 0.1238558 , 0.62798879, 1.71104067, 0.32536592,
       1.0762712 , 0.65946019, 0.5222331 , 2.82046688, 1.41885853]), 'mean_score_time': array([0.0147428 , 0.04693117, 0.07122054, 0.01793227, 0.08863463,
       0.1357305 , 0.03567209, 0.12128782, 0.25620799, 0.04138432,
       0.20360188, 0.32593508, 0.0632875 , 0.208358  , 0.36402884]), 'std_score_time': array([0.0031203 , 0.01931807, 0.01268991, 0.0030363 , 0.03105919,
       0.00753217, 0.00653143, 0.01730825, 0.0663813 , 0.00285675,
       0.0571774 , 0.0274543 , 0.012800

In [5]:
test_score = best_estimator.score(X_test, y_test)
print("\nScore on test set (R²):")
print(test_score)


Score on test set (R²):
0.8115764846940576


In [None]:
#Exercise4
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import validation_curve, learning_curve

X, y = make_classification(n_samples=100000,
                        n_features=30,
                        n_informative=10,
                        flip_y=0.2,
                        random_state=42)

param_range = np.arange(1, 21, 2)  

train_scores, valid_scores = validation_curve(
    RandomForestClassifier(random_state=42),
    X, y,
    param_name="max_depth",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Calculate means and stds
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
valid_mean = np.mean(valid_scores, axis=1)
valid_std = np.std(valid_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.plot(param_range, train_mean, label="Training score", color="blue")
plt.plot(param_range, valid_mean, label="Validation score", color="orange")
plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1, color="blue")
plt.fill_between(param_range, valid_mean - valid_std, valid_mean + valid_std, alpha=0.1, color="orange")
plt.title("Validation curve : max_depth")
plt.xlabel("max_depth")
plt.ylabel("Score : ROC AUC")
plt.legend(loc="best")

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring='accuracy'
    )
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    validation_scores_std = np.std(validation_scores, axis=1)
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std,
                     validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

plot_learning_curve(RandomForestClassifier(max_depth=12, random_state=42), 
                    "Learning Curve (Random Forest)", X, y, cv=10, n_jobs=-1)

