<a href="https://colab.research.google.com/github/SAHIL9581/w2w/blob/main/W2W_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import os
import textwrap

# --- 1. SETUP THE TEMPORARY ENVIRONMENT AND WORKSPACE ---
PROJECT_PATH = "/content/W2W_Pipeline_Local"
print(f"--> Creating a temporary project workspace at: {PROJECT_PATH}")

os.makedirs(f"{PROJECT_PATH}/src", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/data/raw_las_files", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/artifacts", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/trained_models/autoencoder", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/trained_models/boundary_detector", exist_ok=True)

# Change the current working directory to the project path
%cd {PROJECT_PATH}
print(f"--> Successfully changed directory to: {os.getcwd()}")


# --- 2. INSTALL ALL REQUIRED LIBRARIES ---
print("\n--> Installing all necessary Python libraries (this may take a few minutes)...")
# Ensure latest versions and no conflicts
!pip install -U "ray[train,tune]>=2.9.0" mlflow torch torchvision torchaudio lasio scikit-learn pandas tqdm matplotlib joblib pyyaml pyngrok -q
print("✅ Library installation complete.")

--> Creating a temporary project workspace at: /content/W2W_Pipeline_Local
/content/W2W_Pipeline_Local
--> Successfully changed directory to: /content/W2W_Pipeline_Local

--> Installing all necessary Python libraries (this may take a few minutes)...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m821.2/821.2 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.7/897.7 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [21]:
import textwrap
import os

print("--> Creating all project source files...")

# --- config.yaml ---
# Default pipeline configuration: stage toggles, file paths, MLflow experiment name,
# pre-training search space, fine-tuning hyper-parameters and inference settings.
config_yaml_text = textwrap.dedent("""
    run_data_preparation: true
    run_pretraining: true
    run_finetuning: true
    run_inference: true
    paths:
      raw_las_folder: "data/raw_las_files/"
      processed_csv_path: "data/train.csv"
      label_encoder_path: "artifacts/label_encoder.json"
      std_scaler_path: "artifacts/StandardScaler.bin"
      pretrained_encoder_path: "trained_models/autoencoder/best_autoencoder.pt"
      final_model_path: "trained_models/boundary_detector/final_model.pt"
    mlflow:
      experiment_name: "W2W_Matcher_Pipeline"
    pretraining:
      epochs: 25
      num_samples: 10
      search_space:
        in_channels: 13
        optimizer: ["RMSprop", "AdamW", "Adam"]
        lr: [0.001, 0.0001]
        act_name: ["prelu", "relu"]
        batch_size: [16, 32]
    finetuning:
      learning_rate: 0.0001
      batch_size: 16
      epochs: 100
      model_params: {patch_height: 700, in_channels: 13, act_name: "prelu", project_in_features: 2048, hidden_dim: 256, num_queries: 100, num_heads: 8, dropout: 0.1, expansion_factor: 4, num_transformers: 6, output_size: 3}
      matcher_costs: {set_cost_class: 1, set_cost_bbox: 5}
      loss_weights: {loss_matching: 1.0, loss_unmatching: 0.5, loss_height_constraint: 0.5}
    inference:
      reference_well: "WELL_NAME_A"
      well_of_interest: "WELL_NAME_B"
      correlation_threshold: 0.7
    """)
with open("config.yaml", "w") as config_file:
    config_file.write(config_yaml_text)

# --- ALL OTHER SCRIPTS ---
# The package directory has to exist before any module file is written into it.
os.makedirs("src", exist_ok=True)

# A one-line __init__.py is enough to make src/ importable as a package.
with open("src/__init__.py", "w") as init_file:
    init_file.write("# Makes this a package\n")
# --- src/prepare_data.py ---
# Writes the data-preparation stage (run_data_preparation). The generated function:
#   * walks paths.raw_las_folder (descending into a single wrapping sub-folder if that is all it contains)
#     and collects every file ending in '.las';
#   * reads each file with lasio, tags its rows with WELL (header value or file name) and GROUP
#     (first parameter whose mnemonic contains 'GROUP', else 'UNKNOWN'); unreadable files are reported and skipped;
#   * concatenates all wells, renames DEPT -> DEPTH_MD and saves a ';'-separated CSV;
#   * saves a GROUP -> integer label encoder as JSON and a StandardScaler fitted on all columns
#     except WELL/GROUP/DEPTH_MD (NaNs replaced by 0).
with open("src/prepare_data.py", "w") as f: f.write("import pandas as pd,numpy as np,lasio,os,json\nfrom sklearn.preprocessing import StandardScaler\nfrom joblib import dump\ndef run_data_preparation(config):\n    print(\"--- LAUNCHING PIPELINE 0: DATA PREPARATION ---\")\n    paths = config['paths']\n    search_folder = paths['raw_las_folder']\n    items_in_folder = os.listdir(search_folder)\n    # Check if there's a single sub-folder directly inside raw_las_files\n    if len(items_in_folder) == 1 and os.path.isdir(os.path.join(search_folder, items_in_folder[0])):\n        print(f\"--> Found single sub-folder '{items_in_folder[0]}'. Adjusting search path.\")\n        search_folder = os.path.join(search_folder, items_in_folder[0])\n    all_wells_df, las_files_found = [], []\n    print(f\"--> Searching for .las files in '{search_folder}'...\")\n    for root, dirs, files in os.walk(search_folder):\n        for file in files:\n            if file.lower().endswith('.las'): las_files_found.append(os.path.join(root, file))\n    if not las_files_found: raise FileNotFoundError(f\"No .las files found in '{search_folder}'. Check your ZIP or folder structure.\")\n    print(f\"--> Found {len(las_files_found)} .las files. 
Reading now...\")\n    for filepath in las_files_found:\n        try:\n            las = lasio.read(filepath); df = las.df().reset_index()\n            df['WELL'] = las.well.WELL.value if las.well.WELL.value else os.path.splitext(os.path.basename(filepath))[0]; df['GROUP'] = 'UNKNOWN'\n            for param in las.params:\n                if 'GROUP' in param.mnemonic: df['GROUP'] = param.value; break\n            all_wells_df.append(df)\n        except Exception as e: print(f\"    - Could not read {filepath}: {e}\")\n    master_df = pd.concat(all_wells_df, ignore_index=True)\n    if 'DEPT' in master_df.columns: master_df.rename(columns={'DEPT':'DEPTH_MD'}, inplace=True)\n    master_df.to_csv(paths['processed_csv_path'], index=False, sep=';'); print(f\"--> Saved combined data to '{paths['processed_csv_path']}'\")\n    label_encoder={str(g):i for i, g in enumerate(master_df['GROUP'].unique())}\n    with open(paths['label_encoder_path'], 'w') as f: json.dump(label_encoder, f, indent=4)\n    print(f\"--> Saved label encoder to '{paths['label_encoder_path']}'\")\n    numeric_df = master_df.drop(columns=['WELL', 'GROUP', 'DEPTH_MD'], errors='ignore')\n    scaler = StandardScaler(); scaler.fit(numeric_df.fillna(0)); dump(scaler, paths['std_scaler_path'])\n    print(f\"--> Saved StandardScaler to '{paths['std_scaler_path']}'\"); print(\"✅ Data Preparation complete.\")\n")
# --- src/utils.py ---
# collate_fn stacks the images of a batch into one tensor and keeps the targets as a plain list.
with open("src/utils.py", "w") as utils_file:
    utils_file.write("import torch\ndef collate_fn(batch):images,t=zip(*batch);return torch.stack(images),list(t)\n")
# --- src/dataset_pretrain.py ---
# AutoencoderDataset loads the processed CSV, drops WELL/GROUP/DEPTH_MD, fills NaNs with 0,
# applies the saved scaler and yields each row twice (input, reconstruction target).
with open("src/dataset_pretrain.py", "w") as dataset_file:
    dataset_file.write("import numpy as np,torch,pandas as pd\nfrom torch.utils import data\nfrom joblib import load\nclass AutoencoderDataset(data.Dataset):\n    def __init__(self,c):\n        p=c['paths'];df=pd.read_csv(p['processed_csv_path'],delimiter=';')\n        cols_drop=['WELL','GROUP','DEPTH_MD']\n        df.drop(columns=cols_drop,inplace=True,errors='ignore');df.fillna(0,inplace=True)\n        scaler=load(p['std_scaler_path']);self.data=scaler.transform(df).astype(np.float32)\n    def __len__(self):return len(self.data)\n    def __getitem__(self,i):s=self.data[i];return torch.from_numpy(s),torch.from_numpy(s)\n")
# --- src/dataset_finetune.py ---
# Writes BoundaryDataset, the fine-tuning dataset. The generated class:
#   * picks ONE well at random from the processed CSV (np.random seeded with the given seed);
#   * maps GROUP through the saved label encoder, scales the remaining columns with the saved scaler;
#   * cuts the well into non-overlapping patches of `patch_height` rows (a short trailing patch is dropped);
#   * for each patch records every run of equal labels as {'Group', 'Top', 'Height'};
#   * __getitem__ returns the patch with a leading axis of size 1 and a target dict whose 'loc_info' rows are
#     (Top, Height) divided by patch_height and whose 'labels' are all 1.
# NOTE(review): `seed if seed else ...` treats seed=0 like "no seed" and draws a random one — confirm intended.
with open("src/dataset_finetune.py", "w") as f: f.write("import json,numpy as np,torch,pandas as pd\nfrom torch.utils import data\nfrom joblib import load\nclass BoundaryDataset(data.Dataset):\n    def __init__(self,c,seed=None):\n        self.p=c['finetuning']['model_params'];self.d=c['paths'];self.s=seed if seed else np.random.randint(2**32-1)\n        x,y=self.get_Xy();self.x=x;self.gt=y\n    def load_df(self,p,d=';'):return pd.read_csv(p,delimiter=d)\n    def get_rand_well(self,d,s):np.random.seed(s);names=list(d.WELL.unique());idx=np.random.randint(0,len(names));return d[d['WELL']==names[idx]].copy()\n    def get_gt_b(self,y_l):\n        gts=[];\n        for n,y in enumerate(y_l):\n            gt,c={},0;k=[i+1 for i in range(len(y)-1) if not y[i]==y[i+1]];k.insert(0,0);gp=[y[idx] for idx in k];top=k.copy();k.append(len(y));h=[e1-e2 for(e1,e2) in zip(k[1:],k[:-1])]\n            for t,h_val,g in zip(top,h,gp):gt[c]={'Group':int(g),'Top':int(t),'Height':int(h_val)};c+=1\n            gts.append(gt)\n        return gts\n    def get_Xy(self):\n        d=self.load_df(self.d['processed_csv_path']);w=self.get_rand_well(d,self.s)\n        with open(self.d['label_encoder_path']) as f:le=json.load(f)\n        w.loc[:,'GROUP']=w['GROUP'].astype(str).map(le).bfill().ffill()\n        lbl=w['GROUP'].copy()\n        cols_drop=['WELL','GROUP','DEPTH_MD']\n        w_numeric=w.drop(columns=cols_drop,errors='ignore');w_numeric.fillna(0,inplace=True)\n        scaler=load(self.d['std_scaler_path']);s_d=scaler.transform(w_numeric)\n        ph=self.p['patch_height'];idx=list(range(0,s_d.shape[0],ph))\n        x=np.asarray([s_d[i:i+ph] for i in idx if s_d[i:i+ph].shape[0]==ph]).astype(np.float32)\n        y=np.asarray([lbl.values[i:i+ph] for i in idx if lbl.values[i:i+ph].shape[0]==ph])\n        return x,self.get_gt_b(y)\n    def __len__(self):return len(self.x)\n    def __getitem__(self,idx):\n        img=np.expand_dims(self.x[idx],0);data=self.gt[idx];lbl,top,h=[],[],[]\n        for 
i in data:top.append(data[i]['Top']/self.p['patch_height']);h.append(data[i]['Height']/self.p['patch_height']);lbl.append(1)\n        tgt={};tgt['labels']=torch.tensor(lbl,dtype=torch.long);t,h_v=torch.tensor(top,dtype=torch.float32).view(-1,1),torch.tensor(h,dtype=torch.float32).view(-1,1)\n        tgt['loc_info']=torch.hstack((t,h_v));return torch.from_numpy(img),tgt\n")
# --- src/model.py ---
# Writes the model definitions used by both training stages:
#   * get_activation: 'prelu' -> nn.PReLU, 'relu' -> nn.ReLU, anything else -> nn.GELU.
#   * UNet: convolutional autoencoder (start, encoder blocks e1-e3, mid, transposed-conv decoder with skip
#     concatenations, 1x1 output conv back to in_channels).
#   * UNetEncoder: only the start/e1/e2/e3/mid layers, with the same attribute names as in UNet.
#   * W2WTransformerModel: UNetEncoder -> Project (flatten + Linear) -> learned queries -> a stack of
#     nn.TransformerEncoderLayer -> Linear / activation / LayerNorm head with `output_size` values per query.
#   * load_pretrained_encoder_weights: torch.load()s a state dict, keeps entries whose key contains
#     'e1', 'e2', 'e3', 'mid' or 'start', strips 'module.' and loads them under the 'encoder.' prefix.
# NOTE(review): Transformer.__init__ ignores its `e` and `a` arguments and Transformer.forward ignores `c`,
#   so the projected encoder features are only used to size the query batch — confirm this is intended.
# NOTE(review): UNetEncoder.forward calls x.permute(0,2,1) right after a single unsqueeze, which requires a
#   3-D tensor at that point — confirm the input rank against what BoundaryDataset yields.
with open("src/model.py", "w") as f: f.write("import torch,torch.nn as nn\ndef get_activation(name): return nn.PReLU() if name=='prelu' else nn.ReLU() if name=='relu' else nn.GELU()\nclass Project(nn.Module):\n    def __init__(self,i,o): super().__init__(); self.l=nn.Linear(i,o)\n    def forward(self,x): return self.l(x.flatten(1))\nclass Query(nn.Module):\n    def __init__(self,s,d): super().__init__(); self.q=nn.Parameter(torch.randn(1,s,d))\n    def forward(self,x): return self.q.repeat(x.shape[0],1,1)\nclass Transformer(nn.Module):\n    def __init__(self,i,n,d,e,a): super().__init__(); self.t=nn.TransformerEncoderLayer(d_model=i,nhead=n,dropout=d,batch_first=True)\n    def forward(self,q,c): return self.t(q)\nclass Block(nn.Module):\n    def __init__(self,i,o,s=2,k=3,a='prelu'):\n        super().__init__(); p=k//2; self.act=get_activation(a); self.b=nn.Sequential(nn.Conv2d(i,o,k,s,p),nn.BatchNorm2d(o),self.act,nn.Conv2d(o,o,k,1,p),nn.BatchNorm2d(o),self.act)\n    def forward(self,x): return self.b(x)\nclass UNet(nn.Module):\n    def __init__(self,in_channels=13,activation='prelu'):\n        super().__init__();self.act=get_activation(activation);self.start=nn.Sequential(nn.Conv2d(in_channels,32,3,1,1),nn.BatchNorm2d(32),self.act);self.e1=Block(32,64,2,a=activation);self.e2=Block(64,128,2,a=activation);self.e3=Block(128,256,2,a=activation);self.mid=nn.Sequential(nn.Conv2d(256,512,2),nn.BatchNorm2d(512),self.act);self.uc3=nn.ConvTranspose2d(512,256,2,2);self.d3=Block(512,256,1,a=activation);self.uc2=nn.ConvTranspose2d(256,128,2,2);self.d2=Block(256,128,1,a=activation);self.uc1=nn.ConvTranspose2d(128,64,2,2);self.d1=Block(128,64,1,a=activation);self.out=nn.Conv2d(64,in_channels,1)\n    def forward(self,x):\n        
x=x.unsqueeze(-1).unsqueeze(-1);x1=self.e1(self.start(x));x2=self.e2(x1);x3=self.e3(x2);m=self.mid(x3);u3=self.d3(torch.cat((self.uc3(m,output_size=x3.size()),x3),1));u2=self.d2(torch.cat((self.uc2(u3,output_size=x2.size()),x2),1));u1=self.d1(torch.cat((self.uc1(u2,output_size=x1.size()),x1),1));return self.out(u1).squeeze(-1).squeeze(-1)\nclass UNetEncoder(nn.Module):\n    def __init__(self,in_channels=13,activation='prelu'):\n        super().__init__();self.act=get_activation(activation);self.start=nn.Sequential(nn.Conv2d(in_channels,32,3,1,1),nn.BatchNorm2d(32),self.act);self.e1=Block(32,64,2,a=activation);self.e2=Block(64,128,2,a=activation);self.e3=Block(128,256,2,a=activation);self.mid=nn.Sequential(nn.Conv2d(256,512,2),nn.BatchNorm2d(512),self.act)\n    def forward(self,x):x=x.unsqueeze(-1);x=x.permute(0,2,1);x=x.unsqueeze(-1);x1=self.e1(self.start(x));x2=self.e2(x1);x3=self.e3(x2);m=self.mid(x3);return m\nclass W2WTransformerModel(nn.Module):\n    def __init__(self,c):\n        super().__init__();p=c['finetuning']['model_params'];self.encoder=UNetEncoder(p['in_channels'],p['act_name']);self.project=Project(p['project_in_features'],p['hidden_dim']);self.query=Query(p['num_queries'],p['hidden_dim']);self.transformers=nn.ModuleList([Transformer(p['hidden_dim'],p['num_heads'],p['dropout'],p['expansion_factor'],p['act_name'])for _ in range(p['num_transformers'])]);self.finalize=nn.Sequential(nn.Linear(p['hidden_dim'],p['output_size']),get_activation(p['act_name']),nn.LayerNorm(p['output_size']))\n    def forward(self,img):\n        seq=self.project(self.encoder(img));q=self.query(seq)\n        for t in self.transformers:q=t(q,seq)\n        return self.finalize(q)\ndef load_pretrained_encoder_weights(model,path):\n    pre_dict=torch.load(path);model_dict=model.state_dict()\n    enc_dict={k.replace('module.',''):v for k,v in pre_dict.items()if any(x in k for x in ['e1','e2','e3','mid','start'])}\n    enc_dict={'encoder.'+k:v for k,v in 
enc_dict.items()};model_dict.update(enc_dict)\n    model.load_state_dict(model_dict, strict=False);print(f\"✅ Loaded {len(enc_dict)} pre-trained layers from {path}\");return model\n")
# --- src/matcher.py ---
# HungarianMatcher assigns predicted queries to ground-truth segments by solving a linear sum assignment
# on cost = c_bbox * L1(pred loc, target loc) + c_cls * (-sigmoid(pred score)).
# FIX: the class cost is kept as a column (-op[:,:1], shape (N, 1)) so that it broadcasts across the
# target axis of the (N, T) location-cost matrix. The previous -op[:,0] had shape (N,) and was broadcast
# along the target axis instead, which fails (or silently mis-shapes the matrix) whenever T != N.
with open("src/matcher.py", "w") as f:
    f.write("import torch;from scipy.optimize import linear_sum_assignment;from torch import nn\nclass HungarianMatcher(nn.Module):\n    def __init__(self,c_cls=1,c_bbox=1):super().__init__();self.c_cls=c_cls;self.c_bbox=c_bbox\n    @torch.no_grad()\n    def forward(self,o,t):\n        l,i=o[:,:,:1],o[:,:,1:];bs,nq=l.shape[:2];op,ob=l.flatten(0,1).sigmoid(),i.flatten(0,1)\n        ti,tb=torch.cat([v[\"labels\"]for v in t]).to(op.device),torch.cat([v[\"loc_info\"]for v in t]).to(ob.device)\n        cc=-op[:,:1];cb=torch.cdist(ob,tb,p=1);C=(self.c_bbox*cb+self.c_cls*cc).view(bs,nq,-1).cpu()\n        s=[len(v[\"loc_info\"])for v in t];idx=[linear_sum_assignment(c[i])for i,c in enumerate(C.split(s,-1))]\n        return [(torch.as_tensor(i,dtype=torch.int64),torch.as_tensor(j,dtype=torch.int64))for i,j in idx]\ndef build_matcher(c):p=c['finetuning']['matcher_costs'];return HungarianMatcher(p['set_cost_class'],p['set_cost_bbox'])\n")
# --- src/loss.py ---
# SetCriterion runs the Hungarian matcher and returns a dict with three losses, keyed
# 'loss_matching', 'loss_unmatching' and 'loss_height_constraint' (the keys of finetuning.loss_weights).
# FIX: l_names must hold the names of the loss *methods* (loss_match / loss_unmatch / loss_height), because
# get_loss resolves them with getattr(self, name). It previously held the result keys, for which no
# attribute exists, so the first forward() call raised AttributeError. The returned dict keys are unchanged.
with open("src/loss.py", "w") as f:
    f.write("import torch,torch.nn as nn;from torch.nn import functional as F;from src.matcher import build_matcher\nclass SetCriterion(nn.Module):\n    def __init__(self,c):super().__init__();self.m=build_matcher(c);self.l_names=[\"loss_match\",\"loss_unmatch\",\"loss_height\"];self.nq=c['finetuning']['model_params']['num_queries'];self.w=c['finetuning']['loss_weights']\n    def loss_match(self,o,t,idx):i=self._get_src_p_idx(idx);sb=o[i];tb=torch.cat([t[\"loc_info\"][j]for t,(_,j)in zip(t,idx)],0);tb_c=torch.hstack([torch.ones_like(tb[:,:1]),tb]);return{'loss_matching':F.l1_loss(sb,tb_c)}\n    def loss_unmatch(self,o,t,idx):un_idx=[];[un_idx.append(torch.where(torch.ones(self.nq,dtype=torch.bool))[0]) for i,(s,_) in enumerate(idx)];un_preds=torch.cat([out[ui,0]for out,ui in zip(o,un_idx)]);return{'loss_unmatching':un_preds.mean()}\n    def loss_height(self,o,t,idx):lhc=sum([abs(ht[i].sum()-1)for ht,(i,_)in zip(o[:,:,2],idx)])/o.shape[0];return{'loss_height_constraint':lhc}\n    def _get_src_p_idx(self,i):b=torch.cat([torch.full_like(s,k)for k,(s,_)in enumerate(i)]);s=torch.cat([s for(s,_)in i]);return b,s\n    def get_loss(self,ln,o,t,i):return getattr(self,ln)(o,t,i)\n    def forward(self,o,t):i=self.m(o,t);losses={};[losses.update(self.get_loss(ln,o,t,i)) for ln in self.l_names];return losses\n")
# --- pretrain_autoencoder.py ---
# Ray Train worker loop that trains the UNet autoencoder with an MSE reconstruction loss and reports
# the epoch loss plus a checkpoint after every epoch.
# FIXES compared with the previous version:
#   * ray.train.torch has no `prepare`; the model and the loader are wrapped with prepare_model() and
#     prepare_data_loader() respectively.
#   * the model is only wrapped (and therefore only has `.module`) when more than one worker is used,
#     so the wrapper is removed defensively.
#   * the checkpoint stores a state dict (TorchCheckpoint.from_state_dict) because
#     load_pretrained_encoder_weights() in src/model.py iterates over the loaded object with .items().
with open("pretrain_autoencoder.py", "w") as f: f.write(textwrap.dedent("""
    import torch
    from torch.utils.data import DataLoader
    from torch.nn import MSELoss
    from ray import train
    from ray.train.torch import TorchCheckpoint, prepare_data_loader, prepare_model
    from src.dataset_pretrain import AutoencoderDataset
    from src.model import UNet

    def train_loop_per_worker(config):
        # `config` carries the sampled hyper-parameters (in_channels, act_name, optimizer, lr, batch_size)
        # together with the static entries (paths, epochs).
        model = UNet(in_channels=config['in_channels'], activation=config['act_name'])
        criterion = MSELoss()
        train_dataset = AutoencoderDataset(config)
        train_dataloader = DataLoader(train_dataset, batch_size=int(config['batch_size']))
        # Move the model / batches to the worker's device (and wrap for DDP when there are several workers).
        model = prepare_model(model)
        train_dataloader = prepare_data_loader(train_dataloader)
        optimizer_class = getattr(torch.optim, config['optimizer'])
        optimizer = optimizer_class(model.parameters(), lr=config['lr'])
        for epoch in range(config['epochs']):
            model.train(); running_loss = 0.0
            for i, (image, _) in enumerate(train_dataloader):
                outputs = model(image); loss = criterion(outputs, image)
                optimizer.zero_grad(); loss.backward(); optimizer.step()
                running_loss += loss.item() * image.size(0)
            epoch_loss = running_loss / len(train_dataset)
            # Only a DDP-wrapped model has `.module`; fall back to the model itself otherwise.
            unwrapped_model = model.module if hasattr(model, "module") else model
            cpu_state_dict = {name: tensor.detach().cpu() for name, tensor in unwrapped_model.state_dict().items()}
            checkpoint = TorchCheckpoint.from_state_dict(cpu_state_dict)
            train.report({"loss": epoch_loss}, checkpoint=checkpoint)
    """))
# --- train_boundary_detector.py ---
# Writes the fine-tuning stage. run_finetuning(config):
#   * builds a shuffled DataLoader over BoundaryDataset(config, seed=42) using collate_fn;
#   * creates a W2WTransformerModel and loads the pre-trained encoder weights from paths.pretrained_encoder_path;
#   * trains with AdamW for finetuning.epochs epochs on the sum of the SetCriterion losses, each multiplied by
#     its entry in finetuning.loss_weights (losses without a weight are ignored);
#   * saves the model's state_dict to paths.final_model_path.
with open("train_boundary_detector.py", "w") as f: f.write(textwrap.dedent("""
    import torch, os
    from torch.utils.data import DataLoader
    from tqdm import tqdm
    from src.dataset_finetune import BoundaryDataset
    from src.model import W2WTransformerModel, load_pretrained_encoder_weights
    from src.loss import SetCriterion
    from src.utils import collate_fn

    def run_finetuning(config):
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"); ft_params=config['finetuning']
        loader=DataLoader(BoundaryDataset(config,seed=42), batch_size=ft_params['batch_size'], shuffle=True, collate_fn=collate_fn)
        model=W2WTransformerModel(config).to(device)
        model=load_pretrained_encoder_weights(model, config['paths']['pretrained_encoder_path'])
        criterion=SetCriterion(config).to(device); optimizer=torch.optim.AdamW(model.parameters(), lr=ft_params['learning_rate']); weight_dict=criterion.w
        for epoch in range(ft_params['epochs']):
            model.train(); total_loss=0; progress_bar=tqdm(loader, desc=f"Epoch {epoch+1}/{ft_params['epochs']}")
            for images, targets in progress_bar:
                images, targets = images.to(device), [{k:v.to(device) for k, v in t.items()} for t in targets]
                outputs=model(images); loss_dict=criterion(outputs, targets); losses=sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
                optimizer.zero_grad(); losses.backward(); optimizer.step(); total_loss += losses.item()
                progress_bar.set_postfix({'loss': f"{losses.item():.4f}"})
            print(f"Epoch {epoch+1} Average Loss: {total_loss / len(loader):.4f}")
        os.makedirs(os.path.dirname(config['paths']['final_model_path']), exist_ok=True)
        torch.save(model.state_dict(), config['paths']['final_model_path'])
        print(f"✅ Final model saved to {config['paths']['final_model_path']}")
    """))
# --- run_inference.py ---
# Writes the (mock) inference stage: the ground-truth GROUP layers of two wells are extracted, a random
# similarity is drawn for every layer pair (high when the groups match, low otherwise) and pairs at or above
# the threshold are drawn as connecting polygons in 'well_correlation_plot.png'.
# FIXES compared with the previous version:
#   * get_true_layers used len(df)-1 as the end sentinel, so the last sample was never part of the final
#     layer and a one-row final layer was dropped entirely; the sentinel is now len(df).
#   * plot_well_correlation checks for empty layer lists BEFORE creating the figure, so the early return
#     no longer leaves an open matplotlib figure behind.
with open("run_inference.py", "w") as f: f.write(textwrap.dedent("""
    import pandas as pd, numpy as np, json, os
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    def plot_well_correlation(well1_name, well2_name, w1_layers, w2_layers, sim_matrix, threshold, output_path):
        # Nothing to draw (and no file is written) when either well has no layers.
        if not w1_layers or not w2_layers: print("Warning: One or both wells have no layers to plot."); return
        fig, ax = plt.subplots(figsize=(10, 12)); cmap = plt.get_cmap('viridis')
        max_depth = max(w1_layers[-1]['bottom'], w2_layers[-1]['bottom']) if w1_layers and w2_layers else 1000 # Default if layers are empty
        ax.set_ylim(max_depth + 50, -50); ax.set_xlim(-0.5, 2.5)
        for i, l in enumerate(w1_layers): ax.add_patch(patches.Rectangle((0, l['top']), 1, l['height'], edgecolor='black', facecolor=cmap(l.get('group_id', 0) / (len(w1_layers) if len(w1_layers)>0 else 1)), alpha=0.6))
        for i, l in enumerate(w2_layers): ax.add_patch(patches.Rectangle((1.5, l['top']), 1, l['height'], edgecolor='black', facecolor=cmap(l.get('group_id', 0) / (len(w2_layers) if len(w2_layers)>0 else 1)), alpha=0.6))
        for i, row in enumerate(sim_matrix):
            for j, sim in enumerate(row):
                if sim >= threshold:
                    p = patches.Polygon([[1, w1_layers[i]['top']], [1, w1_layers[i]['bottom']], [1.5, w2_layers[j]['bottom']], [1.5, w2_layers[j]['top']]], facecolor=cmap(sim), alpha=0.4)
                    ax.add_patch(p)
        ax.set_xticks([0.5, 2]); ax.set_xticklabels([well1_name, well2_name], fontsize=14)
        ax.set_ylabel("Depth", fontsize=12); ax.set_title("Well to Well Correlation", fontsize=16); plt.grid(True, axis='y', linestyle='--'); plt.savefig(output_path); plt.close()
        print(f"--> Correlation plot saved to {output_path}")
    def run_correlation(config):
        inf, p = config['inference'], config['paths']; full_data = pd.read_csv(p['processed_csv_path'], delimiter=';'); ref_df = full_data[full_data['WELL'] == inf['reference_well']]; woi_df = full_data[full_data['WELL'] == inf['well_of_interest']]
        if ref_df.empty or woi_df.empty: print(f"Error: One or both wells not found: {inf['reference_well']}, {inf['well_of_interest']}. Check names in config.yaml or your data."); return
        with open(p['label_encoder_path']) as f: label_encoder = json.load(f)
        def get_true_layers(df, label_map):
            df=df.copy().reset_index(drop=True); df['group_id']=df['GROUP'].astype(str).map(label_map).fillna(-1)
            # Layer starts are the rows where group_id changes (row 0 included); len(df) is the exclusive end
            # sentinel so that the final layer reaches the last row.
            diffs=df['group_id'].diff().ne(0); layer_indices=np.concatenate(([0],df.index[diffs].values,[len(df)]))
            layers=[{'top':df['DEPTH_MD'].iloc[layer_indices[i]],'bottom':df['DEPTH_MD'].iloc[layer_indices[i+1]-1],'height':df['DEPTH_MD'].iloc[layer_indices[i+1]-1]-df['DEPTH_MD'].iloc[layer_indices[i]],'group_id':df['group_id'].iloc[layer_indices[i]]} for i in range(len(layer_indices)-1) if layer_indices[i]<layer_indices[i+1]]
            return layers
        ref_layers=get_true_layers(ref_df,label_encoder); woi_layers=get_true_layers(woi_df,label_encoder)
        sim_matrix=np.zeros((len(ref_layers),len(woi_layers)))
        for i, l1 in enumerate(ref_layers):
            for j, l2 in enumerate(woi_layers):
                if l1.get('group_id')==l2.get('group_id') and l1.get('group_id')!=-1: sim_matrix[i,j]=np.random.uniform(0.8,0.95)
                else: sim_matrix[i,j]=np.random.uniform(0.1,0.4)
        print("--> MOCK INFERENCE: Using ground truth layers for visualization demonstration.")
        plot_well_correlation(inf['reference_well'],inf['well_of_interest'],ref_layers,woi_layers,sim_matrix,inf['correlation_threshold'],'well_correlation_plot.png')
    """))

# --- main.py ---
# Pipeline orchestrator: runs data preparation, Ray Tune pre-training, fine-tuning and inference,
# each stage gated by its run_* flag in the YAML config.
# FIXES compared with the previous version:
#   * the pre-training search space is now handed to Ray Tune as
#     param_space={"train_loop_config": {...}} with list-valued entries wrapped in tune.choice(), so that
#     every trial receives ONE sampled value per hyper-parameter inside the worker's `config`
#     (previously the raw lists sat at the top level of param_space and never reached the training loop);
#   * the inference stage only logs the plot artifact when the plot file actually exists
#     (run_correlation returns without plotting when a well name is not found).
with open("main.py", "w") as f: f.write(textwrap.dedent("""
    import yaml, argparse, os, shutil, torch, mlflow
    from ray import tune
    from ray.train import RunConfig, ScalingConfig
    from ray.train.torch import TorchTrainer
    from ray.air.integrations.mlflow import MLflowLoggerCallback
    from src.prepare_data import run_data_preparation
    from pretrain_autoencoder import train_loop_per_worker
    from train_boundary_detector import run_finetuning
    from run_inference import run_correlation

    def main(config_path):
        with open(config_path,'r') as file: config=yaml.safe_load(file)
        mlflow.set_experiment(config['mlflow']['experiment_name'])

        if config.get('run_data_preparation',False):
            run_data_preparation(config)
            print("\\n--- STAGE 0 COMPLETE ---")

        if config.get('run_pretraining',False):
            if not os.path.exists(config['paths']['processed_csv_path']):
                print("Error: 'processed_csv_path' not found. Run data prep first.")
                return
            print("\\n--- LAUNCHING PIPELINE 1: AUTOENCODER PRE-TRAINING (VIA RAY TUNE) ---")

            # 1. Build the search space: a list in the YAML means "choose one of these per trial",
            #    a scalar (e.g. in_channels) is passed through unchanged.
            search_space = {
                key: tune.choice(value) if isinstance(value, list) else value
                for key, value in config['pretraining']['search_space'].items()
            }

            # 2. Define the static configuration that is the SAME for all trials.
            train_loop_config = {
                "paths": config["paths"],
                "epochs": config["pretraining"]["epochs"]
            }

            # 3. Create the Tuner, which orchestrates the hyperparameter search.
            #    Hyper-parameters for a Trainer must be nested under "train_loop_config".
            tuner = tune.Tuner(
                TorchTrainer(
                    train_loop_per_worker,
                    train_loop_config=train_loop_config,
                    scaling_config=ScalingConfig(use_gpu=torch.cuda.is_available(), num_workers=1),
                ),
                param_space={"train_loop_config": {**train_loop_config, **search_space}},
                tune_config=tune.TuneConfig(
                    num_samples=config['pretraining']['num_samples'],
                    metric="loss",
                    mode="min",
                ),
                run_config=RunConfig(
                    name="Pre-training_Trial",
                    callbacks=[MLflowLoggerCallback(experiment_name=config['mlflow']['experiment_name'], save_artifact=True)],
                ),
            )

            results = tuner.fit()
            best_result = results.get_best_result(metric="loss", mode="min")

            if best_result and best_result.checkpoint:
                source_path = os.path.join(best_result.checkpoint.path, "model.pt")
                destination_path = config['paths']['pretrained_encoder_path']
                os.makedirs(os.path.dirname(destination_path), exist_ok=True)
                shutil.copy(source_path, destination_path)
                print(f"\\n🏆 Best trial found with validation loss: {best_result.metrics['loss']:.4f}")
                print(f"✅ Best pre-trained model saved to {destination_path}")
            else:
                print("⚠️ No best trial found or best trial had no checkpoint. Pre-training may have failed.")
            print("\\n--- STAGE 1 COMPLETE ---")

        if config.get('run_finetuning',False):
            if not os.path.exists(config['paths']['pretrained_encoder_path']):
                print("Error: 'pretrained_encoder_path' not found. Run pre-training first.")
                return
            with mlflow.start_run(run_name="Fine-tuning_Run") as run:
                print(f"\\n--- LAUNCHING PIPELINE 2: FINE-TUNING (MLflow Run ID: {run.info.run_id}) ---")
                mlflow.log_params(config['finetuning'])
                run_finetuning(config)
                mlflow.log_artifact(config['paths']['final_model_path'])
                print("\\n--- STAGE 2 COMPLETE ---")

        if config.get('run_inference',False):
            if not os.path.exists(config['paths']['final_model_path']):
                print("Error: 'final_model_path' not found. Run fine-tuning first.")
                return
            with mlflow.start_run(run_name="Inference_Correlation_Run") as run:
                print(f"\\n--- LAUNCHING PIPELINE 3: WELL-TO-WELL INFERENCE (MLflow Run ID: {run.info.run_id}) ---")
                mlflow.log_params(config['inference'])
                run_correlation(config)
                # run_correlation prints an error and returns without plotting when a well is missing.
                if os.path.exists('well_correlation_plot.png'):
                    mlflow.log_artifact('well_correlation_plot.png')
                print("\\n--- STAGE 3 COMPLETE ---")
        print("\\n✅ All requested pipeline stages finished.")

    if __name__=="__main__":
        parser=argparse.ArgumentParser()
        parser.add_argument('--config', type=str, default='config.yaml')
        args, unknown = parser.parse_known_args()
        main(args.config)
    """))

print("\n✅ All project files created successfully!")

--> Creating all project source files...

✅ All project files created successfully!


In [22]:
# Run this cell after the file-creation cell, then restart the runtime from the Colab menu
# ("Runtime" -> "Restart runtime..."), confirm, and continue with the next cell.
# NOTE(review): a runtime restart presumably resets the working directory as well, in which case the
# workspace `%cd` from the setup cell has to be repeated before the following cells — confirm.
restart_notice_lines = (
    "!!! IMPORTANT: Please RESTART YOUR COLAB RUNTIME NOW !!!",
    "Go to 'Runtime' -> 'Restart runtime...' in the Colab menu.",
    "Once restarted, continue to the next cell.",
)
for notice_line in restart_notice_lines:
    print(notice_line)

!!! IMPORTANT: Please RESTART YOUR COLAB RUNTIME NOW !!!
Go to 'Runtime' -> 'Restart runtime...' in the Colab menu.
Once restarted, continue to the next cell.


In [23]:
from google.colab import files
import os

# Local import: zipfile replaces the shell `unzip` call, so the uploaded file name is never
# interpolated into a shell command and a failed extraction can be detected.
import zipfile

RAW_LAS_DIR = "data/raw_las_files/"

print(">>> ACTION REQUIRED: Please upload the ZIP file containing your .las files.")
uploaded_files = files.upload()

if not uploaded_files:
    print("\n⚠️ Upload was cancelled or failed. Please run this cell again.")
elif len(uploaded_files) > 1:
    print("\n⚠️ Please upload only a single ZIP file. Run this cell again.")
else:
    zip_filename = next(iter(uploaded_files))
    print(f"\n✅ '{zip_filename}' uploaded successfully.")

    print("--> Unzipping into the 'data/raw_las_files' folder...")
    try:
        # extractall overwrites existing files without prompting (same as `unzip -o`).
        with zipfile.ZipFile(zip_filename) as archive:
            archive.extractall(RAW_LAS_DIR)
    except zipfile.BadZipFile:
        print(f"\n⚠️ '{zip_filename}' is not a valid ZIP archive. Please run this cell again with a ZIP file.")
    else:
        print("--> ZIP file has been unzipped successfully.")
        print("\n✅ Data input step is complete. You can now proceed to the next cell.")
    finally:
        # Clean up the uploaded file whether or not the extraction succeeded.
        os.remove(zip_filename)

>>> ACTION REQUIRED: Please upload the ZIP file containing your .las files.


Saving train.zip to train.zip

✅ 'train.zip' uploaded successfully.
--> Unzipping into the 'data/raw_las_files' folder...
--> ZIP file has been unzipped successfully.

✅ Data input step is complete. You can now proceed to the next cell.


In [24]:
#@title ⚙️ Configure Your Pipeline Run
#@markdown ### 1. Select Which Pipelines to Run
#@markdown Check the boxes for all stages you want to execute in this session.
run_data_preparation = True #@param {type:"boolean"}
run_pretraining = True #@param {type:"boolean"}
run_finetuning = True #@param {type:"boolean"}
run_inference = True #@param {type:"boolean"}

#@markdown ---
#@markdown ### 2. Configure Model and Data Parameters
#@markdown **Important:** Set the number of feature columns (curves) from your LAS files.
input_channels = 13 #@param {type:"integer"}

#@markdown ---
#@markdown **For the final Inference stage**, provide the exact names of the wells to compare.
#@markdown (You can find these in 'data/train.csv' after running data preparation).
reference_well = "WELL_NAME_A" #@param {type:"string"}
well_of_interest = "WELL_NAME_B" #@param {type:"string"}

# --- DO NOT EDIT THE CODE BELOW ---
import yaml

print("--> Reading existing config.yaml file...")
with open('config.yaml', 'r') as config_in:
    config = yaml.safe_load(config_in)

print("--> Applying your configuration settings...")

# Stage toggles chosen in the form above.
config.update({
    'run_data_preparation': run_data_preparation,
    'run_pretraining': run_pretraining,
    'run_finetuning': run_finetuning,
    'run_inference': run_inference,
})

# The channel count is needed in two places: the pre-training search space and the fine-tuning model.
search_space_section = config['pretraining']['search_space']
model_params_section = config['finetuning']['model_params']
search_space_section['in_channels'] = input_channels
model_params_section['in_channels'] = input_channels

# Wells compared by the inference stage.
inference_section = config['inference']
inference_section['reference_well'] = reference_well
inference_section['well_of_interest'] = well_of_interest

# sort_keys=False keeps the sections in the order in which they were read.
with open('config.yaml', 'w') as config_out:
    yaml.dump(config, config_out, default_flow_style=False, sort_keys=False)

print("--> Successfully updated config.yaml with your settings.")
print("\n✅ Configuration complete. You are ready to run the main pipeline.")

--> Reading existing config.yaml file...
--> Applying your configuration settings...
--> Successfully updated config.yaml with your settings.

✅ Configuration complete. You are ready to run the main pipeline.


In [25]:
#@title 🚀 Run Pipeline & Launch MLflow UI
#@markdown 1. **(One-time setup)** Go to https://dashboard.ngrok.com/get-started/your-authtoken
#@markdown 2. Copy your authtoken and paste it below.
#@markdown 3. Run this cell to execute the pipeline and view the results.

ngrok_auth_token = "" #@param {type:"string"}

# --- DO NOT EDIT THE CODE BELOW ---
from pyngrok import ngrok
import os
import signal
import subprocess
import time

# SECURITY: never save a real authtoken in the notebook. Paste it into the form field for this session only,
# or provide it through the NGROK_AUTH_TOKEN environment variable. A token that was previously committed
# with this notebook must be considered leaked and should be revoked in the ngrok dashboard.
ngrok_auth_token = (ngrok_auth_token or os.environ.get("NGROK_AUTH_TOKEN", "")).strip()

# Authenticate ngrok
if ngrok_auth_token and "PASTE" not in ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
    print("✅ Ngrok token set successfully.")
else:
    print("⚠️ Ngrok token not set. UI will not launch. Please get a token from ngrok.com")

# Terminate previous ngrok/mlflow processes to ensure a clean start
print("--> Cleaning up previous processes...")
try:
    ngrok.kill()
except Exception:
    pass # No running tunnels to kill

# Find and kill any lingering mlflow processes
# Using subprocess.run for more control and error handling
try:
    # List all processes and filter for 'mlflow ui'
    output = subprocess.run(['ps', '-ax'], capture_output=True, text=True, check=True).stdout
    for line in output.splitlines():
        if 'mlflow ui' in line and 'grep' not in line:
            try:
                pid = int(line.split()[0])
                os.kill(pid, signal.SIGTERM) # Use SIGTERM for graceful shutdown
                print(f"    - Sent SIGTERM to lingering mlflow process (PID: {pid})")
                time.sleep(1) # Give it a moment to terminate
                # Check if process is still alive, if so, send SIGKILL
                if os.path.exists(f"/proc/{pid}"): # Linux-specific check
                    os.kill(pid, signal.SIGKILL)
                    print(f"    - Killed lingering mlflow process (PID: {pid}) with SIGKILL")
            except (ValueError, ProcessLookupError):
                pass # Process might have already died or PID parsing failed
except Exception as e:
    print(f"Warning: Could not clean up old MLflow processes. Error: {e}")


print("\n--- RUNNING THE MAIN PIPELINE SCRIPT ---")
# --- Run the main script ---
!python main.py
print("--- PIPELINE SCRIPT FINISHED ---")


# --- Launch the MLflow UI ---
print("\n--> Launching MLflow UI...")

# Start MLflow UI in the background
# Using subprocess.Popen for more robust background execution
mlflow_process = None
try:
    mlflow_process = subprocess.Popen(["mlflow", "ui", "--backend-store-uri", "mlruns/", "--port", "5000"],
                                      stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid)
    print("MLflow UI process started in background.")
    time.sleep(5) # Give MLflow UI some time to start up

    # Create a public URL to the MLflow UI
    public_url = ngrok.connect(5000)
    print("---------------------------------------------------------------")
    print(f"✅ MLflow UI is running. Click here: {public_url}")
    print("---------------------------------------------------------------")
except Exception as e:
    print(f"🚨 Error launching MLflow UI or Ngrok tunnel: {e}")
    if mlflow_process:
        print("MLflow process output (stdout):", mlflow_process.stdout.read().decode())
        print("MLflow process output (stderr):", mlflow_process.stderr.read().decode())

print("\n--- MLflow UI setup complete. You can now access the UI via the link above. ---")
print("Note: The MLflow UI will remain active as long as this Colab session is running or until you kill the process.")

✅ Ngrok token set successfully.
--> Cleaning up previous processes...
    - Sent SIGTERM to lingering mlflow process (PID: 4883)

--- RUNNING THE MAIN PIPELINE SCRIPT ---
--- LAUNCHING PIPELINE 0: DATA PREPARATION ---
--> Found single sub-folder 'Force_2020_all_wells_train_test_blind_hidden_final'. Adjusting search path.
--> Searching for .las files in 'data/raw_las_files/Force_2020_all_wells_train_test_blind_hidden_final'...
--> Found 118 .las files. Reading now...
--> Saved combined data to 'data/train.csv'
--> Saved label encoder to 'artifacts/label_encoder.json'
--> Saved StandardScaler to 'artifacts/StandardScaler.bin'
✅ Data Preparation complete.

--- STAGE 0 COMPLETE ---

--- LAUNCHING PIPELINE 1: AUTOENCODER PRE-TRAINING (VIA RAY TUNE) ---
2025-07-16 18:40:54,122	INFO worker.py:1917 -- Started a local Ray instance.
2025-07-16 18:40:57,509	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.