In [3]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.linear_model import LogisticRegression


In [4]:
## Loading the data
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

# Load ground truth if available (for local evaluation)
import os
if os.path.exists('reference.csv'):
    y_true = pd.read_csv('reference.csv')['Target_AUC']
else:
    y_true = None

In [5]:
for c in [col for col in train_df.columns if col.endswith('date')]:
    if c in train_df.columns:
        train_df[c] = pd.to_datetime(train_df[c], errors='coerce')

In [6]:
target_col = 'adopted_within_07_days'
train_df[target_col] = train_df[target_col].astype(int)

In [7]:
for c in [col for col in test_df.columns if col.endswith('date')]:
    if c in test_df.columns:
        test_df[c] = pd.to_datetime(test_df[c], errors='coerce')

In [8]:
split_summary = pd.DataFrame({
    "set": ["train"],
    "rows": [len(train_df)],
    "positives": [train_df[target_col].sum()],
})
split_summary["pos_rate"] = split_summary["positives"] / split_summary["rows"]

split_summary

Unnamed: 0,set,rows,positives,pos_rate
0,train,16000,2504,0.1565


In [9]:
## Basic model with some selected features
base_features = [
    "gender",
    "registration",
    "age",
    "trainer",
    "belong_to_cooperative",
    "county",
    "subcounty",
    "ward",
]

In [10]:
feature_cols = base_features

X_train = train_df[feature_cols]
y_train = train_df[target_col]

X_test = test_df[feature_cols]

## Make the prediction


In [11]:
preprocess = ColumnTransformer(
    transformers=[
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore"))
        ]), feature_cols)
    ]
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=3000, class_weight="balanced"))
])

model.fit(X_train, y_train)
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

In [12]:
y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [13]:
## Evaluating some metrics
# ----------------------------
#  Metrics: PR AUC + ROC AUC + Recall@K
# ----------------------------
def recall_at_k(y_true: pd.Series, y_scores: np.ndarray, k_frac: float) -> float:
    k = int(np.ceil(len(y_true) * k_frac))
    order = np.argsort(-y_scores)  # descending
    topk = order[:k]
    return float(y_true.iloc[topk].sum() / y_true.sum()) if y_true.sum() > 0 else np.nan

In [14]:
if y_true is not None:
    pr_auc = average_precision_score(y_true, y_pred_prob)
    roc_auc = roc_auc_score(y_true, y_pred_prob)
    recall_5 = recall_at_k(y_true, y_pred_prob, 0.05)
    recall_10 = recall_at_k(y_true, y_pred_prob, 0.10)
    recall_20 = recall_at_k(y_true, y_pred_prob, 0.20)

    # Naive baselines on this test set
    prevalence_test = y_true.mean()
    naive_pr_auc = prevalence_test

    results = pd.DataFrame({
        "metric": [
            "PR AUC (Average Precision)",
            "ROC AUC",
            "Recall@5%",
            "Recall@10%",
            "Recall@20%",
            "Naive PR AUC (prevalence)",
        ],
        "value": [pr_auc, roc_auc, recall_5, recall_10, recall_20, naive_pr_auc]
    })

    print("\nSplit summary:")
    display(split_summary)
    print("\nResults:")
    display(results)
else:
    print("Skipping evaluation: reference.csv not found (y_true unavailable).")

Skipping evaluation: reference.csv not found (y_true unavailable).


In [15]:
y_pred_prob

array([0.06928471, 0.61634289, 0.81484165, ..., 0.65888423, 0.6136258 ,
       0.04899097])

In [16]:
y_pred

array([0, 1, 1, ..., 1, 1, 0])

In [17]:
## Make submission file
ss = pd.read_csv('SampleSubmission.csv')
ss['Target_LogLoss'] = y_pred_prob
ss['Target_AUC'] = y_pred_prob
ss.to_csv('BenchmarkSub.csv', index=False)

In [18]:
ss.head()

Unnamed: 0,ID,Target_07_AUC,Target_07_LogLoss,Target_90_AUC,Target_90_LogLoss,Target_120_AUC,Target_120_LogLoss,Target_LogLoss,Target_AUC
0,ID_6AA1EM,0,0,0,0,0,0,0.069285,0.069285
1,ID_2DV3A1,0,0,0,0,0,0,0.616343,0.616343
2,ID_KZY5B8,0,0,0,0,0,0,0.814842,0.814842
3,ID_T8WZT2,0,0,0,0,0,0,0.079084,0.079084
4,ID_3CX56O,0,0,0,0,0,0,0.814842,0.814842


In [19]:
ss

Unnamed: 0,ID,Target_07_AUC,Target_07_LogLoss,Target_90_AUC,Target_90_LogLoss,Target_120_AUC,Target_120_LogLoss,Target_LogLoss,Target_AUC
0,ID_6AA1EM,0,0,0,0,0,0,0.069285,0.069285
1,ID_2DV3A1,0,0,0,0,0,0,0.616343,0.616343
2,ID_KZY5B8,0,0,0,0,0,0,0.814842,0.814842
3,ID_T8WZT2,0,0,0,0,0,0,0.079084,0.079084
4,ID_3CX56O,0,0,0,0,0,0,0.814842,0.814842
...,...,...,...,...,...,...,...,...,...
5995,ID_FEOS39,0,0,0,0,0,0,0.616343,0.616343
5996,ID_93MFB9,0,0,0,0,0,0,0.036297,0.036297
5997,ID_MD2XHG,0,0,0,0,0,0,0.658884,0.658884
5998,ID_8MXTCP,0,0,0,0,0,0,0.613626,0.613626


In [20]:
from sklearn.metrics import log_loss

In [21]:
## Compute the log-loss
if y_true is not None:
    loss = log_loss(y_true, y_pred_prob)
    print("LogLoss:", loss)
else:
    print("Skipping log-loss: reference.csv not found (y_true unavailable).")

Skipping log-loss: reference.csv not found (y_true unavailable).


In [22]:
train_df.columns

Index(['ID', 'farmer_id', 'gender', 'registration', 'age', 'trainer',
       'group_name', 'belong_to_cooperative', 'county', 'subcounty', 'ward',
       'topics', 'has_topic_trained_on', 'training_date',
       'adopted_within_07_days', 'adopted_within_90_days',
       'adopted_within_120_days'],
      dtype='str')

In [23]:
# ============ DATA EXPLORATION ============
print("=== Train shape:", train_df.shape)
print("=== Test shape:", test_df.shape)
print("\n=== Dtypes ===")
print(train_df.dtypes)
print("\n=== Null counts ===")
print(train_df.isnull().sum())
print("\n=== Target distributions ===")
for t in ['adopted_within_07_days', 'adopted_within_90_days', 'adopted_within_120_days']:
    print(f"  {t}: {train_df[t].mean():.4f}")
print("\n=== Unique values per column ===")
for c in train_df.columns:
    print(f"  {c}: {train_df[c].nunique()}")
print("\n=== Sample topics ===")
print(train_df['topics'].value_counts().head(15))
print("\n=== training_date range ===")
print(f"  Train: {train_df['training_date'].min()} to {train_df['training_date'].max()}")
print(f"  Test:  {test_df['training_date'].min()} to {test_df['training_date'].max()}")
print("\n=== gender ===")
print(train_df['gender'].value_counts())
print("\n=== registration ===")
print(train_df['registration'].value_counts())
print("\n=== age ===")
print(train_df['age'].value_counts())
print("\n=== has_topic_trained_on ===")
print(train_df['has_topic_trained_on'].value_counts())
print("\n=== belong_to_cooperative ===")
print(train_df['belong_to_cooperative'].value_counts())
print("\n=== farmer_id duplicates ===")
dup_farmers = train_df['farmer_id'].value_counts()
print(f"  Total unique farmers: {dup_farmers.shape[0]}")
print(f"  Farmers appearing >1 time: {(dup_farmers > 1).sum()}")
print(f"  Max appearances: {dup_farmers.max()}")

=== Train shape: (16000, 17)
=== Test shape: (6000, 14)

=== Dtypes ===
ID                                    str
farmer_id                             str
gender                                str
registration                          str
age                                   str
trainer                               str
group_name                            str
belong_to_cooperative               int64
county                                str
subcounty                             str
ward                                  str
topics                                str
has_topic_trained_on                int64
training_date              datetime64[us]
adopted_within_07_days              int32
adopted_within_90_days              int64
adopted_within_120_days             int64
dtype: object

=== Null counts ===
ID                         0
farmer_id                  0
gender                     0
registration               0
age                        0
trainer                    0
group