In [10]:
import  random

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import joblib

# Predictors: the wide survey file. low_memory=False makes pandas parse the
# file in one pass instead of in chunks, avoiding mixed-dtype columns.
train = pd.read_csv(
    "training_data/PreFer_train_data.csv",
    low_memory=False,
)
# Outcome table: joined to the predictors on `nomem_encr` further down.
outcome = pd.read_csv("training_data/PreFer_train_outcome.csv")


In [3]:
# Preview of the raw predictor table (pandas truncates rows and columns in the display).
train

Unnamed: 0,nomem_encr,outcome_available,cf08a_m,cf09b_m,cf10c_m,cf11d_m,cf12e_m,cf13f_m,cf14g_m,cf15h_m,...,woonvorm_2011,woonvorm_2012,woonvorm_2013,woonvorm_2014,woonvorm_2015,woonvorm_2016,woonvorm_2017,woonvorm_2018,woonvorm_2019,woonvorm_2020
0,712619,0,200803.0,200904.0,,,201203.0,201303.0,,,...,4.0,4.0,4.0,4.0,4.0,4.0,,,,
1,706448,0,200803.0,200903.0,201003.0,201103.0,201203.0,201303.0,201403.0,201509.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
2,729145,0,200803.0,200903.0,201003.0,201103.0,201203.0,,201403.0,201509.0,...,3.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0
3,729424,0,200803.0,200903.0,,,,,,,...,,,,,,,,,,
4,715619,1,200803.0,200903.0,201003.0,201103.0,201203.0,201303.0,201403.0,201509.0,...,4.0,4.0,4.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6413,708138,0,,,,,,,,,...,,,,,,,,,2.0,2.0
6414,724696,0,,,,,,,,,...,,,,,,,,,,1.0
6415,733061,0,,,,,,,,,...,,,,,,,,,,5.0
6416,708096,0,,,,,,,,,...,,,,,,,,,,3.0


In [30]:
from collections import Counter
# Tally of birth-year values in the raw predictors.
# NOTE(review): this is not the last expression of its cell, so the result is
# computed but never displayed.
Counter(train['birthyear_bg'])

def clean_df(df, background_df=None):
    """
    Preprocess the input dataframe to feed the model.

    The input frame is left untouched: the result is built on a fresh
    DataFrame, so calling this function repeatedly on the same raw data is
    safe and does not add columns to the caller's frame.

    Parameters:
    df (pd.DataFrame): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv).
        Must contain the columns "nomem_encr" and "birthyear_bg".
    background_df (pd.DataFrame): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv).
        Currently unused.

    Returns:
    pd.DataFrame: A new dataframe with the columns "nomem_encr" and "age", indexed like `df`.

    Raises:
    KeyError: If "nomem_encr" or "birthyear_bg" is missing from `df`.
    """

    ## This script contains a bare minimum working example
    # Age relative to 2024, computed as a standalone Series so that `df` is not modified
    age = 2024 - df["birthyear_bg"]

    # Imputing missing values in age with the mean of the observed ages in this frame
    age = age.fillna(age.mean())

    # Start from the ID variable required for predictions, copied so the result
    # does not share data with (or write back into) the caller's frame
    cleaned = df[["nomem_encr"]].copy()

    # Newly created variable used by the model
    cleaned["age"] = age

    return cleaned

# Build the modelling frame (ID + age) from the raw predictors.
ftrain = clean_df(train)

In [11]:

def train_save_model(cleaned_df, outcome_df, model_path=None):
    """
    Trains a logistic regression on the cleaned dataframe and returns the fitted model.

    The model is additionally written to disk when `model_path` is given, so the
    function can serve both the in-notebook workflow (keep the model in memory)
    and the file-based workflow (persist it for a later prediction step).

    Parameters:
    cleaned_df (pd.DataFrame): The cleaned data from clean_df function to be used for training the model.
        Must contain the columns "nomem_encr" and "age".
    outcome_df (pd.DataFrame): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv).
        Must contain the columns "nomem_encr" and "new_child".
    model_path (str or path-like, optional): Where to save the fitted model with joblib.
        Defaults to None, in which case nothing is written to disk.

    Returns:
    LogisticRegression: The model fitted on the rows whose outcome is available.
    """

    ## This script contains a bare minimum working example
    random.seed(1) # not useful here because logistic regression deterministic

    # Combine cleaned_df and outcome_df (inner join on the respondent ID)
    model_df = pd.merge(cleaned_df, outcome_df, on="nomem_encr")

    # Keep only the cases for whom the outcome is available
    model_df = model_df.dropna(subset=["new_child"])

    # Logistic regression model with scikit-learn's default settings
    model = LogisticRegression()

    # Fit the model on the single predictor "age"
    model.fit(model_df[["age"]], model_df["new_child"])

    # Save the model only when the caller asked for it
    if model_path is not None:
        joblib.dump(model, model_path)

    return model

In [13]:
# Fit on the cleaned training data; `m` is the fitted model object returned by train_save_model.
m = train_save_model(ftrain, outcome)

In [15]:
def predict_outcomes(df, background_df, model_path):
    """Generate predictions using a fitted model and the input dataframe.

    The predict_outcomes function accepts a Pandas DataFrame as an argument
    and returns a new DataFrame with two columns: nomem_encr and
    prediction. The nomem_encr column in the new DataFrame replicates the
    corresponding column from the input DataFrame. The prediction
    column contains predictions for each corresponding nomem_encr. Each
    prediction is represented as a binary value: '0' indicates that the
    individual did not have a child during 2021-2023, while '1' implies that
    they did.

    Parameters:
    df (pd.DataFrame): The input dataframe for which predictions are to be made.
    background_df (pd.DataFrame): The background dataframe for which predictions are to be made.
    model_path (str, path-like, or fitted estimator): Either the path to a model
        file saved with joblib (the output of training), or an already fitted
        estimator exposing a ``predict`` method, which is then used as-is.

    Returns:
    pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions.
    """

    ## This script contains a bare minimum working example
    if "nomem_encr" not in df.columns:
        print("The identifier variable 'nomem_encr' should be in the dataset")

    # Resolve the model: an object with `predict` is treated as the fitted
    # estimator itself; anything else is treated as a path to a saved model.
    if hasattr(model_path, "predict"):
        model = model_path
    else:
        # joblib files are pickle-based and can execute code on load:
        # only load model files from a trusted source.
        model = joblib.load(model_path)

    # Preprocess the fake / holdout data
    cleaned = clean_df(df, background_df)

    # Exclude the variable nomem_encr, which is an identifier and not a model feature
    features = cleaned.loc[:, cleaned.columns != "nomem_encr"]

    # Generate predictions from model, should be 0 (no child) or 1 (had child)
    predictions = model.predict(features)

    # Return only the identifier and its prediction
    return pd.DataFrame(
        {"nomem_encr": cleaned["nomem_encr"], "prediction": predictions}
    )

In [28]:
from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate

# In-sample predictions: the model is scored on the same data it was fitted on.
pred = predict_outcomes(train, background_df=None, model_path=m)

# Attach the observed outcome by respondent ID, then keep only complete rows
# (respondents without an available outcome drop out here).
res = (
    pred
    .join(outcome.set_index('nomem_encr'), on='nomem_encr')
    .dropna()
)

In [29]:
# scikit-learn's signature is f1_score(y_true, y_pred): pass the observed
# outcome as the truth and the model output as the prediction. Keyword
# arguments make the roles explicit and guard against swapping them.
f1_score(y_true=res['new_child'], y_pred=res['prediction'])

0.0

In [31]:
# Distribution of predicted labels. In the recorded run every prediction is 0,
# which is why the F1 score computed above is 0.0.
Counter(pred['prediction'])

Counter({0.0: 6418})

In [33]:
# Scratch cell, independent of the model code above: it uses none of the
# notebook's data or functions.
# `ollama` is not part of the standard library; the recorded run of this cell
# failed with ModuleNotFoundError because the package was not installed.
import ollama

# Send a single user message to the 'llama2' model and print the reply text.
response = ollama.chat(model='llama2', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
print(response['message']['content'])

ModuleNotFoundError: No module named 'ollama'