In [None]:
!pip install google-ml-edu==0.1.3 \
  keras~=3.8.0 \
  matplotlib~=3.10.0 \
  numpy~=2.0.0 \
  pandas~=2.2.0 \
  tensorflow~=2.18.0

In [None]:
import numpy as np
import pandas as pd
import keras
import ml_edu.experiment
import ml_edu.results
import plotly.express as px
import matplotlib.pyplot as plt

**Dataset**: http://roycekimmons.com/tools/generated_data/exams

In [None]:
df = pd.read_csv('StudentsPerformance.csv').drop(['gender', 'race/ethnicity', 'lunch', 'parental level of education', 'test preparation course'], axis=1)
df = df[df['math score'] >= 50]

In [None]:
df.head()

Unnamed: 0,math score,reading score,writing score
0,72,72,74
1,69,90,88
2,90,95,93
4,76,78,75
5,71,83,78


In [None]:
px.scatter_matrix(df, dimensions=['reading score', 'math score', 'writing score'])

In [None]:
def create_model(
    settings: ml_edu.experiment.ExperimentSettings,
    metrics: list[keras.metrics.Metric],
) -> keras.Model:
  inputs =  {name: keras.Input(shape=(1,), name=name) for name in settings.input_features}
  concatenated_inputs = keras.layers.Concatenate()(list(inputs.values()))
  outputs = keras.layers.Dense(units=1)(concatenated_inputs)
  model = keras.Model(inputs=inputs, outputs=outputs)

  model.compile(optimizer=keras.optimizers.RMSprop(learning_rate = settings.learning_rate),
                loss="mean_squared_error",
                metrics=metrics)
  return model

def train_model(
    experiment_name: str,
    model: keras.Model,
    dataset: pd.DataFrame,
    label_name: str,
    settings: ml_edu.experiment.ExperimentSettings,
) -> ml_edu.experiment.Experiment:
  features = {name: dataset[name].values for name in settings.input_features}
  label = dataset[label_name].values
  history = model.fit(x=features,
                      y=label,
                      batch_size=settings.batch_size,
                      epochs=settings.number_epochs)

  return ml_edu.experiment.Experiment(
      name=experiment_name,
      settings=settings,
      model=model,
      epochs=history.epoch,
      metrics_history=pd.DataFrame(history.history),
  )

In [44]:
def build_batch(df, batch_size):
  batch = df.sample(batch_size).copy()
  batch.set_index(np.arange(batch_size), inplace=True)
  return batch

In [None]:
settings_1 = ml_edu.experiment.ExperimentSettings(
    learning_rate = 0.001,
    number_epochs = 20,
    batch_size = 50,
    input_features = ['reading score']
)

metrics = [keras.metrics.RootMeanSquaredError(name='rmse')]
model_1 = create_model(settings_1, metrics)
experiment_1 = train_model('one_feature', model_1, df, 'writing score', settings_1)


In [45]:
def predict_writing(df, model, features, label, batch_size=30):
  batch = build_batch(df, batch_size)
  predicted_values = model.predict_on_batch(x={name: batch[name].values for name in features})

  data = {"PREDICTED_SCORE": [], "OBSERVED_SCORE": [], "L1_LOSS": [], features[0]: []}
  for i in range(batch_size):
    predicted =  predicted_values[i][0]
    observed = batch.at[i, label]
    data['PREDICTED_SCORE'].append(predicted)
    data['OBSERVED_SCORE'].append(observed)
    data["L1_LOSS"].append(abs(observed - predicted))
    data[features[0]].append(batch.at[i, features[0]])

  output_df = pd.DataFrame(data)
  return output_df

def show_predictions(output):
    print(output)
    return

output = predict_writing(df, experiment_1.model, experiment_1.settings.input_features, 'reading score')
show_predictions(output)



    PREDICTED_SCORE  OBSERVED_SCORE     L1_LOSS  reading score
0        -46.907280              47   93.907280             47
1        -60.989120              61  121.989120             61
2        -78.088493              78  156.088493             78
3        -70.041725              70  140.041725             70
4        -95.187866              95  190.187866             95
5        -72.053421              72  144.053421             72
6        -64.006653              64  128.006653             64
7        -75.070953              75  150.070953             75
8        -58.977425              59  117.977425             59
9        -74.065109              74  148.065109             74
10       -91.164482              91  182.164482             91
11       -82.111877              82  164.111877             82
12       -63.000809              63  126.000809             63
13       -84.123566              84  168.123566             84
14       -78.088493              78  156.088493        