In [86]:
import numpy as np
import pandas as pd
import sklearn as sk
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [100]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import Dataset, DataLoader

# Charger les données
data = pd.read_csv("/kaggle/input/full-set-complete-v2-csv/full_set_complete_v2.csv").drop(columns=["Index"])
data = data.sort_values(by=["patient_id", "age"])  # Trier par patient_id et âge


data['patient_id'] = data['patient_id'].astype(int)
# Ajouter l'index de mesure pour chaque patient
data['index_measure'] = data.groupby('patient_id').cumcount() + 1
print(data.shape)

fixed_features = ['cohort', 'sexM', 'gene', 'age_at_diagnosis']
temp_features = ['age', 'ledd', 'time_since_intake_on', 'time_since_intake_off', 'on', 'off', 'on_off_ratio', 
                 'off_cumavg', 'on_cumavg', 'off_lag1', 'on_lag1', 'off_ewma', 'index_measure']
features = fixed_features + temp_features

# Normalisation des variables temporelles
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])
data.head(2)

(79275, 18)


Unnamed: 0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off,on_off_ratio,off_cumavg,on_cumavg,off_lag1,on_lag1,off_ewma,index_measure
61660,0,-0.349128,0.812747,0.612279,-0.196429,-0.36638,-0.455251,1.989774,0.153672,-0.215453,0.371074,-0.279301,0.684973,-0.007987,-0.474057,-1.142693,0.433334,-1.289041
61661,0,-0.349128,0.812747,0.612279,-0.196429,-0.296457,-0.203599,1.090589,0.178718,-0.790243,-0.080819,-0.341219,0.428951,-0.361674,0.43253,-0.170967,0.119591,-0.985394


In [101]:
Y_train= pd.read_csv('/kaggle/input/datasets-row/y_train_lXj6X5y.csv',index_col=0)
X_train= pd.read_csv('/kaggle/input/datasets-row/X_train_6ZIKlTY.csv', index_col=0)
X_test= pd.read_csv('/kaggle/input/datasets-row/X_test_oiZ2ukx.csv',index_col=0)
X_test.index = X_test.index + X_train.shape[0]  # Décale les index de X_test

print(Y_train.shape)
X_train_filled = data.loc[X_train.index]
Y_train = Y_train
print(X_train_filled.shape)
X_train_filled = pd.concat([X_train_filled, Y_train], axis = 1)
X_train_filled.head(3)

(55603, 1)
(55603, 18)


Unnamed: 0_level_0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off,on_off_ratio,off_cumavg,on_cumavg,off_lag1,on_lag1,off_ewma,index_measure,target
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,3332,-0.349128,-1.230396,-0.579843,-0.758273,-0.899541,0.07709,-0.033392,0.503441,-1.173437,0.520783,-0.470384,0.85461,-1.186945,0.035112,-0.559846,0.589245,-1.289041,34.7
1,3332,-0.349128,-1.230396,-0.579843,-0.758273,-0.820878,0.362618,-0.033392,2.604987,-0.694445,0.859732,-0.408921,1.046644,-0.892206,0.583761,-1.143164,0.824572,-0.985394,38.1
2,3332,-0.349128,-1.230396,-0.579843,-0.758273,-0.742215,0.609431,-0.820179,0.576582,-1.269235,0.586436,-0.490273,1.00743,-1.029751,0.926159,-0.657065,0.713269,-0.681747,41.6


In [102]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

class PatientDataset(Dataset):
    def __init__(self, df, fixed_features, temp_features, target_col):
        self.df = df
        self.fixed_features = fixed_features
        self.temp_features = temp_features
        self.target_col = target_col

        # Liste des (patient_id, index) pour chaque ligne du dataset
        self.index_mapping = [(pid, i) for pid, group in df.groupby("patient_id") for i in range(len(group))]

    def __len__(self):
        return len(self.index_mapping)

    def __getitem__(self, idx):
        patient_id, t = self.index_mapping[idx]
        patient_data = self.df[self.df["patient_id"] == patient_id]

        # Extraire les variables fixes (constantes pour un patient)
        x_fixed = torch.tensor(patient_data[self.fixed_features].iloc[0].values, dtype=torch.float32)

        # Extraire les variables temporelles jusqu'à l'instant t (t premières lignes)
        x_temp = torch.tensor(patient_data[self.temp_features].iloc[:t].values, dtype=torch.float32)

        # Si target_col est défini, on récupère la target
        if self.target_col is not None:
            y_target = torch.tensor(patient_data[self.target_col].iloc[t], dtype=torch.float32)
            return x_fixed, x_temp, y_target
        else:
            return x_fixed, x_temp

def collate_fn(batch):
    x_fixed_batch, x_temp_batch, y_target_batch = zip(*batch)

    x_fixed_batch = torch.stack(x_fixed_batch)  # (batch_size, num_fixed_features)
    y_target_batch = torch.tensor(y_target_batch, dtype=torch.float32)  # (batch_size,)

    x_temp_batch = pad_sequence(x_temp_batch, batch_first=True, padding_value=0)  # (batch_size, max_seq_len, num_temp_features)

    # Création du mask (True si padding)
    mask = (x_temp_batch == 0).all(dim=2)  # (batch_size, max_seq_len)

    return x_fixed_batch, x_temp_batch, y_target_batch, mask

dataset = PatientDataset(df=X_train_filled, fixed_features=fixed_features, temp_features=temp_features, target_col="target")
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Vérification
#next(iter(dataloader))  # Test pour voir si ça fonctionne


In [103]:
import torch
import torch.nn as nn

class TimeSeriesPredictor(nn.Module):
    def __init__(self, num_temp_features, num_fixed_features, encoding_dim=16, num_heads=4, hidden_dim=64):
        super(TimeSeriesPredictor, self).__init__()

        # Embedding des variables fixes
        self.encoder_fixed = nn.Sequential(
            nn.Linear(num_fixed_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim)
        )

        # Embedding temporel via Transformer
        self.temporal_embedding = nn.Linear(num_temp_features, hidden_dim)
        self.transformer_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True)
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=2)
        self.encoder_fc = nn.Linear(hidden_dim, encoding_dim)

        # FiLM pour injecter les variables fixes
        self.film_scale = nn.Linear(encoding_dim, hidden_dim)  # Gamma
        self.film_shift = nn.Linear(encoding_dim, hidden_dim)  # Beta

        # Prédiction de la target
        self.target_pred_layer_1 = nn.Sequential(
            nn.Linear(2 * encoding_dim, hidden_dim),  # Concaténer fixed et temp encodés
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)  # Changer la sortie pour qu'elle soit de taille hidden_dim (64)
        )

        
        self.target_pred_layer_2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),  # Concaténer fixed et temp encodés
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)  # Sortie = 1 seule valeur
        )

    def forward(self, x_fixed, x_temp, mask=None):
        # **Encoder les variables fixes**
        encoded_fixed = self.encoder_fixed(x_fixed)  # (batch, encoding_dim)

        # **Encoder les variables temporelles**
        temp_embedded = self.temporal_embedding(x_temp)  # (batch, seq_len, hidden_dim)
        
        # **Appliquer le Transformer avec un mask (pour ignorer le padding)**
        if mask is not None:
            temp_encoded = self.transformer(temp_embedded, src_key_padding_mask=mask)  # (batch, seq_len, hidden_dim)
        else:
            temp_encoded = self.transformer(temp_embedded)

        # **Récupérer la dernière représentation non-pad**
        seq_lengths = (~mask).sum(dim=1) - 1  # Index du dernier vrai token
        encoded_temp = torch.stack([temp_encoded[i, seq_lengths[i], :] for i in range(temp_encoded.size(0))])  # (batch, hidden_dim)
        encoded_temp = self.encoder_fc(encoded_temp)  # (batch, encoding_dim)

        # **Générer FiLM parameters (scale et shift)**
        gamma = self.film_scale(encoded_fixed).unsqueeze(1)  # (batch, 1, hidden_dim)
        beta = self.film_shift(encoded_fixed).unsqueeze(1)   # (batch, 1, hidden_dim)

        # **Modifier les représentations temporelles avec FiLM**
        temp_encoded = gamma * temp_encoded + beta  # (batch, seq_len, hidden_dim)

        # **Concaténer encoded_fixed et encoded_temp pour la prédiction**
        final_representation = torch.cat((encoded_fixed, encoded_temp), dim=1)

        # **Prédiction de la target**
        target_pred = self.target_pred_layer_1(final_representation).squeeze(1)  # (batch,)
        target_pred = self.target_pred_layer_2(target_pred).squeeze(1)  # (batch,)

        return target_pred


In [104]:
import torch.optim as optim

# Définir les paramètres
num_temp_features = len(temp_features)
num_fixed_features = len(fixed_features)

# Initialiser le modèle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TimeSeriesPredictor(num_temp_features, num_fixed_features).to(device)
model.load_state_dict(torch.load("best_model_3.pth"))

# Définir la perte et l'optimiseur
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Boucle d'entraînement
num_epochs = 20
best_loss = float("inf")

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for x_fixed, x_temp, y_target, mask in dataloader:  # On récupère aussi le mask !
        # Envoyer sur GPU si dispo
        x_fixed, x_temp, y_target, mask = x_fixed.to(device), x_temp.to(device), y_target.to(device), mask.to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Forward (ajout du mask !)
        y_pred = model(x_fixed, x_temp, mask)

        # Calcul de la perte
        loss = criterion(y_pred, y_target)

        # Backpropagation
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Sauvegarde du meilleur modèle
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), "best_model_3.pth")
        print("✅ Meilleur modèle sauvegardé !")


  model.load_state_dict(torch.load("best_model_3.pth"))


Epoch [1/20], Loss: 42.6289
✅ Meilleur modèle sauvegardé !
Epoch [2/20], Loss: 42.5096
✅ Meilleur modèle sauvegardé !
Epoch [3/20], Loss: 42.1597
✅ Meilleur modèle sauvegardé !
Epoch [4/20], Loss: 41.8717
✅ Meilleur modèle sauvegardé !
Epoch [5/20], Loss: 41.7009
✅ Meilleur modèle sauvegardé !
Epoch [6/20], Loss: 41.4844
✅ Meilleur modèle sauvegardé !
Epoch [7/20], Loss: 41.4654
✅ Meilleur modèle sauvegardé !
Epoch [8/20], Loss: 41.2146
✅ Meilleur modèle sauvegardé !


KeyboardInterrupt: 

## Inférence sur le testset

In [105]:
# Charger le modèle entraîné
model.load_state_dict(torch.load("best_model_3.pth"))
model.eval()  # Mode évaluation

# Charger les données
data_for_target = pd.read_csv("/kaggle/input/full-set-complete-v2-csv/full_set_complete_v2.csv").drop(columns=["Index"])
data_for_target = data_for_target.sort_values(by=["patient_id", "age"])  # Trier par patient_id et âge


data_for_target['patient_id'] = data_for_target['patient_id'].astype(int)
# Ajouter l'index de mesure pour chaque patient
data_for_target['index_measure'] = data_for_target.groupby('patient_id').cumcount() + 1

fixed_features = ['cohort', 'sexM', 'gene', 'age_at_diagnosis']
temp_features = ['age', 'ledd', 'time_since_intake_on', 'time_since_intake_off', 'on', 'off', 'on_off_ratio', 
                 'off_cumavg', 'on_cumavg', 'off_lag1', 'on_lag1', 'off_ewma', 'index_measure']
features = fixed_features + temp_features

# Normalisation des variables temporelles
scaler = StandardScaler()
data[features] = scaler.fit_transform(data[features])

data_for_target.head(3)

  model.load_state_dict(torch.load("best_model_3.pth"))


Unnamed: 0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off,on_off_ratio,off_cumavg,on_cumavg,off_lag1,on_lag1,off_ewma,index_measure
61660,0,0,1,2,54.5,58.2,497.0,3.7,14.310491,17.0,36.244167,0.456447,36.244167,17.0,22.0,7.004841,36.244167,1
61661,0,0,1,2,54.5,59.0,549.0,2.9,14.344101,11.0,29.071857,0.365791,32.658012,14.0,36.244167,17.0,31.462627,2
61662,0,0,1,2,54.5,59.6,581.0,1.1,14.5,29.0,43.0,0.659091,36.105341,19.0,29.071857,11.0,39.154209,3


In [106]:
class PatientPredictionDataset(Dataset):
    def __init__(self, df, fixed_features, temp_features):
        self.df = df
        self.fixed_features = fixed_features
        self.temp_features = temp_features

        # Liste des (patient_id, index) pour chaque ligne du dataset
        self.index_mapping = [(pid, i) for pid, group in df.groupby("patient_id") for i in range(len(group))]

    def __len__(self):
        return len(self.index_mapping)

    def __getitem__(self, idx):
        patient_id, t = self.index_mapping[idx]
        patient_data = self.df[self.df["patient_id"] == patient_id]

        # Extraire les variables fixes (constantes pour un patient)
        x_fixed = torch.tensor(patient_data[self.fixed_features].iloc[0].values, dtype=torch.float32)

        # Extraire les variables temporelles jusqu'à l'instant t (t premières lignes)
        x_temp = torch.tensor(patient_data[self.temp_features].iloc[:t].values, dtype=torch.float32)

        return x_fixed, x_temp  # Pas de y_target ici, car tu fais des prédictions

def collate_fn(batch):
    x_fixed_batch, x_temp_batch = zip(*batch)

    x_fixed_batch = torch.stack(x_fixed_batch)  # (batch_size, num_fixed_features)

    x_temp_batch = pad_sequence(x_temp_batch, batch_first=True, padding_value=0)  # (batch_size, max_seq_len, num_temp_features)

    # Création du mask (True si padding)
    mask = (x_temp_batch == 0).all(dim=2)  # (batch_size, max_seq_len)

    return x_fixed_batch, x_temp_batch, mask



In [107]:
# Créer un DataLoader pour data_for_target
prediction_dataset = PatientPredictionDataset(df=data_for_target, fixed_features=fixed_features, temp_features=temp_features)
prediction_dataloader = DataLoader(prediction_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
#next(iter(prediction_dataloader))

In [108]:
# Effectuer les prédictions
model.eval()
predictions = []

with torch.no_grad():
    for x_fixed, x_temp, mask in prediction_dataloader:

        x_fixed, x_temp, mask = x_fixed.to(device), x_temp.to(device), mask.to(device)
        
        # Passer les données dans le modèle
        y_pred = model(x_fixed, x_temp, mask)

        # Ajouter les prédictions à la liste
        predictions.append(y_pred)

# Convertir la liste des prédictions en un seul tensor
predictions_tensor = torch.cat(predictions, dim=0)

In [109]:
print(len(data_for_target), len(predictions_tensor.cpu().numpy()))

79275 79275


In [110]:
data_for_target.head(3)

Unnamed: 0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off,on_off_ratio,off_cumavg,on_cumavg,off_lag1,on_lag1,off_ewma,index_measure
61660,0,0,1,2,54.5,58.2,497.0,3.7,14.310491,17.0,36.244167,0.456447,36.244167,17.0,22.0,7.004841,36.244167,1
61661,0,0,1,2,54.5,59.0,549.0,2.9,14.344101,11.0,29.071857,0.365791,32.658012,14.0,36.244167,17.0,31.462627,2
61662,0,0,1,2,54.5,59.6,581.0,1.1,14.5,29.0,43.0,0.659091,36.105341,19.0,29.071857,11.0,39.154209,3


In [111]:
# Ajouter les prédictions au DataFrame
data_for_target["estimated_target"] = predictions_tensor.cpu().numpy()

In [112]:
data_for_target = data_for_target.sort_index()
data_for_target.head(3)

Unnamed: 0,patient_id,cohort,sexM,gene,age_at_diagnosis,age,ledd,time_since_intake_on,time_since_intake_off,on,off,on_off_ratio,off_cumavg,on_cumavg,off_lag1,on_lag1,off_ewma,index_measure,estimated_target
0,3332,0,0,1,48.5,52.1,607.0,1.9,14.779859,7.0,38.620296,0.176677,38.620296,7.0,30.0,13.0,38.620296,1,117.991959
1,3332,0,0,1,48.5,53.0,666.0,1.9,17.6,12.0,44.0,0.266667,41.310148,9.5,38.620296,7.0,42.206765,2,143.792313
2,3332,0,0,1,48.5,53.9,717.0,1.2,14.87801,6.0,39.662327,0.147557,40.760874,8.333333,44.0,12.0,40.510473,3,143.66864


In [113]:
data_for_target.to_csv('data_with_estimated_target.csv')