<a href="https://colab.research.google.com/github/SAHIL9581/w2w/blob/main/W2W_Pipeline_Orchestrator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# --- 1. SETUP THE TEMPORARY ENVIRONMENT AND WORKSPACE ---
import os
import textwrap

# We will work inside Colab's temporary storage.
# All work will be lost if the session ends.
PROJECT_PATH = "/content/W2W_Pipeline_Local"
print(f"--> Creating a temporary project workspace at: {PROJECT_PATH}")

# Create all necessary folders for the project
os.makedirs(f"{PROJECT_PATH}/src", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/data/raw_las_files", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/artifacts", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/trained_models/autoencoder", exist_ok=True)
os.makedirs(f"{PROJECT_PATH}/trained_models/boundary_detector", exist_ok=True)

# Navigate into the project directory. All commands will now run from here.
%cd {PROJECT_PATH}
print(f"--> Successfully changed directory to: {os.getcwd()}")


# --- 2. INSTALL ALL REQUIRED LIBRARIES ---
print("--> Installing all necessary Python libraries...")
!pip install pandas numpy torch scikit-learn pyyaml mlflow scipy joblib "ray[tune]" warmup_scheduler lasio matplotlib pyngrok -q
print("--> Library installation complete.")


# --- 3. CREATE ALL PROJECT SCRIPTS AND CONFIGURATION ---
print("--> Creating all project source files...")

# --- config.yaml ---
with open("config.yaml", "w") as f:
    f.write(textwrap.dedent("""
    run_data_preparation: true
    run_pretraining: true
    run_finetuning: true
    run_inference: true
    paths:
      raw_las_folder: "data/raw_las_files/"
      processed_csv_path: "data/train.csv"
      label_encoder_path: "artifacts/label_encoder.json"
      std_scaler_path: "artifacts/StandardScaler.bin"
      pretrained_encoder_path: "trained_models/autoencoder/best_autoencoder.pt"
      final_model_path: "trained_models/boundary_detector/final_model.pt"
    mlflow:
      experiment_name: "W2W_Matcher_Pipeline"
    pretraining:
      num_workers: 2
      num_samples: 10
      epochs: 25
      in_channels: 13
      search_space:
        optimizer: ["rmsprop", "adamw"]
        lr: [0.001, 0.0001]
        act_name: ["prelu", "relu"]
        batch_size: [16, 32]
    finetuning:
      learning_rate: 0.0001
      batch_size: 16
      epochs: 100
      model_params: {patch_height: 700, in_channels: 13, act_name: "prelu", project_in_features: 2048, hidden_dim: 256, num_queries: 100, num_heads: 8, dropout: 0.1, expansion_factor: 4, num_transformers: 6, output_size: 3}
      matcher_costs: {set_cost_class: 1, set_cost_bbox: 5}
      loss_weights: {loss_matching: 1.0, loss_unmatching: 0.5, loss_height_constraint: 0.5}
    inference:
      reference_well: "WELL_NAME_A"
      well_of_interest: "WELL_NAME_B"
      correlation_threshold: 0.7
    """))

# --- ALL OTHER SCRIPTS ---
with open("src/__init__.py", "w") as f: f.write("# Makes this a package\n")
with open("src/prepare_data.py", "w") as f: f.write("import pandas as pd,numpy as np,lasio,os,json\nfrom sklearn.preprocessing import StandardScaler\nfrom joblib import dump\ndef run_data_preparation(config):\n    print(\"--- LAUNCHING PIPELINE 0: DATA PREPARATION ---\")\n    paths,las_folder=config['paths'],paths['raw_las_folder']\n    all_wells_df,las_files=[], [f for f in os.listdir(las_folder) if f.lower().endswith('.las')]\n    if not las_files: raise FileNotFoundError(f\"No .las files found in '{las_folder}'.\")\n    print(f\"--> Reading {len(las_files)} .las files from '{las_folder}'...\")\n    for filename in las_files:\n        try:\n            las=lasio.read(os.path.join(las_folder,filename));df=las.df().reset_index()\n            df['WELL']=las.well.WELL.value if las.well.WELL.value else os.path.splitext(filename)[0];df['GROUP']='UNKNOWN'\n            for param in las.params:\n                if 'GROUP' in param.mnemonic: df['GROUP']=param.value;break\n            all_wells_df.append(df)\n        except Exception as e: print(f\"    - Could not read {filename}: {e}\")\n    master_df=pd.concat(all_wells_df,ignore_index=True)\n    if 'DEPT' in master_df.columns: master_df.rename(columns={'DEPT':'DEPTH_MD'},inplace=True)\n    master_df.to_csv(paths['processed_csv_path'],index=False,sep=';');print(f\"--> Saved combined data to '{paths['processed_csv_path']}'\")\n    label_encoder={str(g):i for i,g in enumerate(master_df['GROUP'].unique())}\n    with open(paths['label_encoder_path'],'w') as f: json.dump(label_encoder,f,indent=4)\n    print(f\"--> Saved label encoder to '{paths['label_encoder_path']}'\")\n    numeric_df=master_df.drop(columns=['WELL','GROUP','DEPTH_MD'],errors='ignore')\n    scaler=StandardScaler();scaler.fit(numeric_df.fillna(0));dump(scaler,paths['std_scaler_path'])\n    print(f\"--> Saved StandardScaler to '{paths['std_scaler_path']}'\");print(\"✅ Data Preparation complete.\")\n")
with open("src/utils.py", "w") as f: f.write("import torch\ndef collate_fn(batch):images,t=zip(*batch);return torch.stack(images),list(t)\n")
with open("src/dataset_pretrain.py", "w") as f: f.write("import numpy as np,torch,pandas as pd\nfrom torch.utils import data\nfrom joblib import load\nclass AutoencoderDataset(data.Dataset):\n    def __init__(self,c):\n        p=c['paths'];df=pd.read_csv(p['processed_csv_path'],delimiter=';')\n        cols_drop=['WELL','GROUP','DEPTH_MD']\n        df.drop(columns=cols_drop,inplace=True,errors='ignore');df.fillna(0,inplace=True)\n        scaler=load(p['std_scaler_path']);self.data=scaler.transform(df).astype(np.float32)\n    def __len__(self):return len(self.data)\n    def __getitem__(self,i):s=self.data[i];return torch.from_numpy(s),torch.from_numpy(s)\n")
with open("src/dataset_finetune.py", "w") as f: f.write("import json,numpy as np,torch,pandas as pd\nfrom torch.utils import data\nfrom joblib import load\nclass BoundaryDataset(data.Dataset):\n    def __init__(self,c,seed=None):\n        self.p=c['finetuning']['model_params'];self.d=c['paths'];self.s=seed if seed else np.random.randint(2**32-1)\n        x,y=self.get_Xy();self.x=x;self.gt=y\n    def load_df(self,p,d=';'):return pd.read_csv(p,delimiter=d)\n    def get_rand_well(self,d,s):np.random.seed(s);names=list(d.WELL.unique());idx=np.random.randint(0,len(names));return d[d['WELL']==names[idx]].copy()\n    def get_gt_b(self,y_l):\n        gts=[];\n        for n,y in enumerate(y_l):\n            gt,c={},0;k=[i+1 for i in range(len(y)-1) if not y[i]==y[i+1]];k.insert(0,0);gp=[y[idx] for idx in k];top=k.copy();k.append(len(y));h=[e1-e2 for(e1,e2) in zip(k[1:],k[:-1])]\n            for t,h_val,g in zip(top,h,gp):gt[c]={'Group':int(g),'Top':int(t),'Height':int(h_val)};c+=1\n            gts.append(gt)\n        return gts\n    def get_Xy(self):\n        d=self.load_df(self.d['processed_csv_path']);w=self.get_rand_well(d,self.s)\n        with open(self.d['label_encoder_path']) as f:le=json.load(f)\n        w.loc[:,'GROUP']=w['GROUP'].astype(str).map(le).bfill().ffill()\n        lbl=w['GROUP'].copy()\n        cols_drop=['WELL','GROUP','DEPTH_MD']\n        w_numeric=w.drop(columns=cols_drop,errors='ignore');w_numeric.fillna(0,inplace=True)\n        scaler=load(self.d['std_scaler_path']);s_d=scaler.transform(w_numeric)\n        ph=self.p['patch_height'];idx=list(range(0,s_d.shape[0],ph))\n        x=np.asarray([s_d[i:i+ph] for i in idx if s_d[i:i+ph].shape[0]==ph]).astype(np.float32)\n        y=np.asarray([lbl.values[i:i+ph] for i in idx if lbl.values[i:i+ph].shape[0]==ph])\n        return x,self.get_gt_b(y)\n    def __len__(self):return len(self.x)\n    def __getitem__(self,idx):\n        img=np.expand_dims(self.x[idx],0);data=self.gt[idx];lbl,top,h=[],[],[]\n        for i in data:top.append(data[i][\"Top\"]/self.p['patch_height']);h.append(data[i][\"Height\"]/self.p['patch_height']);lbl.append(1)\n        tgt={};tgt[\"labels\"]=torch.tensor(lbl,dtype=torch.long);t,h_v=torch.tensor(top,dtype=torch.float32).view(-1,1),torch.tensor(h,dtype=torch.float32).view(-1,1)\n        tgt[\"loc_info\"]=torch.hstack((t,h_v));return torch.from_numpy(img),tgt\n")
with open("src/model.py", "w") as f: f.write("import torch,torch.nn as nn\ndef get_activation(name): return nn.PReLU() if name=='prelu' else nn.ReLU() if name=='relu' else nn.GELU()\nclass Project(nn.Module):\n    def __init__(self,i,o): super().__init__(); self.l=nn.Linear(i,o)\n    def forward(self,x): return self.l(x.flatten(1))\nclass Query(nn.Module):\n    def __init__(self,s,d): super().__init__(); self.q=nn.Parameter(torch.randn(1,s,d))\n    def forward(self,x): return self.q.repeat(x.shape[0],1,1)\nclass Transformer(nn.Module):\n    def __init__(self,i,n,d,e,a): super().__init__(); self.t=nn.TransformerEncoderLayer(d_model=i,nhead=n,dropout=d,batch_first=True)\n    def forward(self,q,c): return self.t(q)\nclass Block(nn.Module):\n    def __init__(self,i,o,s=2,k=3,a='prelu'):\n        super().__init__(); p=k//2; self.act=get_activation(a); self.b=nn.Sequential(nn.Conv2d(i,o,k,s,p),nn.BatchNorm2d(o),self.act,nn.Conv2d(o,o,k,1,p),nn.BatchNorm2d(o),self.act)\n    def forward(self,x): return self.b(x)\nclass UNet(nn.Module):\n    def __init__(self,in_channels=13,activation='prelu'):\n        super().__init__();self.act=get_activation(activation);self.start=nn.Sequential(nn.Conv2d(in_channels,32,3,1,1),nn.BatchNorm2d(32),self.act);self.e1=Block(32,64,2,a=activation);self.e2=Block(64,128,2,a=activation);self.e3=Block(128,256,2,a=activation);self.mid=nn.Sequential(nn.Conv2d(256,512,2),nn.BatchNorm2d(512),self.act);self.uc3=nn.ConvTranspose2d(512,256,2,2);self.d3=Block(512,256,1,a=activation);self.uc2=nn.ConvTranspose2d(256,128,2,2);self.d2=Block(256,128,1,a=activation);self.uc1=nn.ConvTranspose2d(128,64,2,2);self.d1=Block(128,64,1,a=activation);self.out=nn.Conv2d(64,in_channels,1)\n    def forward(self,x):\n        x=x.unsqueeze(-1).unsqueeze(-1);x1=self.e1(self.start(x));x2=self.e2(x1);x3=self.e3(x2);m=self.mid(x3);u3=self.d3(torch.cat((self.uc3(m,output_size=x3.size()),x3),1));u2=self.d2(torch.cat((self.uc2(u3,output_size=x2.size()),x2),1));u1=self.d1(torch.cat((self.uc1(u2,output_size=x1.size()),x1),1));return self.out(u1).squeeze(-1).squeeze(-1)\nclass UNetEncoder(nn.Module):\n    def __init__(self,in_channels=13,activation='prelu'):\n        super().__init__();self.act=get_activation(activation);self.start=nn.Sequential(nn.Conv2d(in_channels,32,3,1,1),nn.BatchNorm2d(32),self.act);self.e1=Block(32,64,2,a=activation);self.e2=Block(64,128,2,a=activation);self.e3=Block(128,256,2,a=activation);self.mid=nn.Sequential(nn.Conv2d(256,512,2),nn.BatchNorm2d(512),self.act)\n    def forward(self,x):x=x.unsqueeze(-1);x=x.permute(0,2,1);x=x.unsqueeze(-1);x1=self.e1(self.start(x));x2=self.e2(x1);x3=self.e3(x2);m=self.mid(x3);return m\nclass W2WTransformerModel(nn.Module):\n    def __init__(self,c):\n        super().__init__();p=c['finetuning']['model_params'];self.encoder=UNetEncoder(p['in_channels'],p['act_name']);self.project=Project(p['project_in_features'],p['hidden_dim']);self.query=Query(p['num_queries'],p['hidden_dim']);self.transformers=nn.ModuleList([Transformer(p['hidden_dim'],p['num_heads'],p['dropout'],p['expansion_factor'],p['act_name'])for _ in range(p['num_transformers'])]);self.finalize=nn.Sequential(nn.Linear(p['hidden_dim'],p['output_size']),get_activation(p['act_name']),nn.LayerNorm(p['output_size']))\n    def forward(self,img):\n        seq=self.project(self.encoder(img));q=self.query(seq)\n        for t in self.transformers:q=t(q,seq)\n        return self.finalize(q)\ndef load_pretrained_encoder_weights(model,path):\n    pre_dict=torch.load(path)['state_dict'];model_dict=model.state_dict()\n    enc_dict={k.replace('module.',''):v for k,v in pre_dict.items()if any(x in k for x in ['e1','e2','e3','mid','start'])}\n    enc_dict={'encoder.'+k:v for k,v in enc_dict.items()};model_dict.update(enc_dict)\n    model.load_state_dict(model_dict);print(f\"✅ Loaded {len(enc_dict)} pre-trained layers from {path}\");return model\n")
with open("src/matcher.py", "w") as f: f.write("import torch;from scipy.optimize import linear_sum_assignment;from torch import nn\nclass HungarianMatcher(nn.Module):\n    def __init__(self,c_cls=1,c_bbox=1):super().__init__();self.c_cls=c_cls;self.c_bbox=c_bbox\n    @torch.no_grad()\n    def forward(self,o,t):\n        l,i=o[:,:,:1],o[:,:,1:];bs,nq=l.shape[:2];op,ob=l.flatten(0,1).sigmoid(),i.flatten(0,1)\n        ti,tb=torch.cat([v[\"labels\"]for v in t]).to(op.device),torch.cat([v[\"loc_info\"]for v in t]).to(ob.device)\n        cc=-op[:,0];cb=torch.cdist(ob,tb,p=1);C=(self.c_bbox*cb+self.c_cls*cc).view(bs,nq,-1).cpu()\n        s=[len(v[\"loc_info\"])for v in t];idx=[linear_sum_assignment(c[i])for i,c in enumerate(C.split(s,-1))]\n        return [(torch.as_tensor(i,dtype=torch.int64),torch.as_tensor(j,dtype=torch.int64))for i,j in idx]\ndef build_matcher(c):p=c['finetuning']['matcher_costs'];return HungarianMatcher(p['set_cost_class'],p['set_cost_bbox'])\n")
with open("src/loss.py", "w") as f: f.write("import torch,torch.nn as nn;from torch.nn import functional as F;from src.matcher import build_matcher\nclass SetCriterion(nn.Module):\n    def __init__(self,c):super().__init__();self.m=build_matcher(c);self.l_names=[\"loss_matching\",\"loss_unmatching\",\"loss_height_constraint\"];self.nq=c['finetuning']['model_params']['num_queries'];self.w=c['finetuning']['loss_weights']\n    def loss_match(self,o,t,idx):i=self._get_src_p_idx(idx);sb=o[i];tb=torch.cat([t[\"loc_info\"][j]for t,(_,j)in zip(t,idx)],0);tb_c=torch.hstack([torch.ones_like(tb[:,:1]),tb]);return{'loss_matching':F.l1_loss(sb,tb_c)}\n    def loss_unmatch(self,o,t,idx):un_idx=[];[un_idx.append(torch.where(torch.ones(self.nq,dtype=torch.bool))[0]) for i,(s,_) in enumerate(idx)];un_preds=torch.cat([out[ui,0]for out,ui in zip(o,un_idx)]);return{'loss_unmatching':un_preds.mean()}\n    def loss_height(self,o,t,idx):lhc=sum([abs(ht[i].sum()-1)for ht,(i,_)in zip(o[:,:,2],idx)])/o.shape[0];return{'loss_height_constraint':lhc}\n    def _get_src_p_idx(self,i):b=torch.cat([torch.full_like(s,k)for k,(s,_)in enumerate(i)]);s=torch.cat([s for(s,_)in i]);return b,s\n    def get_loss(self,ln,o,t,i):return getattr(self,ln)(o,t,i)\n    def forward(self,o,t):i=self.m(o,t);losses={};[losses.update(self.get_loss(ln,o,t,i)) for ln in self.l_names];return losses\n")
with open("pretrain_autoencoder.py", "w") as f: f.write("import torch,os,copy\nfrom torch.utils.data import DataLoader\nfrom torch.nn import MSELoss\nfrom ray import train,tune\nfrom src.dataset_pretrain import AutoencoderDataset\nfrom src.model import UNet\ndef pretraining_trial(config):\n    args=config['main_config'];trial_params=config['trial_params']\n    device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n    model=UNet(in_channels=args['pretraining']['in_channels'],activation=trial_params['act_name']).to(device)\n    criterion=MSELoss();optimizer=getattr(torch.optim,trial_params['optimizer'].capitalize())(model.parameters(),lr=trial_params['lr'])\n    model=train.torch.prepare_model(model)\n    trainloader=DataLoader(AutoencoderDataset(args),batch_size=int(trial_params['batch_size']))\n    valloader=DataLoader(AutoencoderDataset(args),batch_size=int(trial_params['batch_size']))\n    trainloader=train.torch.prepare_data_loader(trainloader);valloader=train.torch.prepare_data_loader(valloader)\n    best_loss=float('inf');best_model_wts=None\n    for epoch in range(1,args['pretraining']['epochs']+1):\n        model.train();running_loss=0.0\n        for i,(image,_) in enumerate(trainloader):\n            outputs=model(image);loss=criterion(outputs,image);optimizer.zero_grad();loss.backward();optimizer.step();running_loss+=loss.item()*image.size(0)\n        model.eval();val_loss=0.0\n        with torch.no_grad():\n            for i,(image,_) in enumerate(valloader):outputs=model(image);loss=criterion(outputs,image);val_loss+=loss.item()*image.size(0)\n        val_loss/=len(valloader.dataset)\n        if val_loss<best_loss:best_loss=val_loss;best_model_wts=copy.deepcopy(model.module.state_dict())\n        train.report({\"loss\":val_loss})\n    if train.get_context().get_world_rank()==0 and best_model_wts:torch.save({\"state_dict\":best_model_wts},\"best_model_in_trial.pt\")\n")
with open("train_boundary_detector.py", "w") as f: f.write("import torch,os\nfrom torch.utils.data import DataLoader\nfrom tqdm import tqdm\nfrom src.dataset_finetune import BoundaryDataset\nfrom src.model import W2WTransformerModel,load_pretrained_encoder_weights\nfrom src.loss import SetCriterion\nfrom src.utils import collate_fn\ndef run_finetuning(config):\n    device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\");ft_params=config['finetuning']\n    loader=DataLoader(BoundaryDataset(config,seed=42),batch_size=ft_params['batch_size'],shuffle=True,collate_fn=collate_fn)\n    model=W2WTransformerModel(config).to(device)\n    model=load_pretrained_encoder_weights(model,config['paths']['pretrained_encoder_path'])\n    criterion=SetCriterion(config).to(device);optimizer=torch.optim.AdamW(model.parameters(),lr=ft_params['learning_rate']);weight_dict=criterion.w\n    for epoch in range(ft_params['epochs']):\n        model.train();total_loss=0;progress_bar=tqdm(loader,desc=f\"Epoch {epoch+1}/{ft_params['epochs']}\")\n        for images,targets in progress_bar:\n            images,targets=images.to(device),[{k:v.to(device) for k,v in t.items()} for t in targets]\n            outputs=model(images);loss_dict=criterion(outputs,targets);losses=sum(loss_dict[k]*weight_dict[k] for k in loss_dict.keys() if k in weight_dict)\n            optimizer.zero_grad();losses.backward();optimizer.step();total_loss+=losses.item();progress_bar.set_postfix({'loss':f\"{losses.item():.4f}\"})\n        print(f\"Epoch {epoch+1} Average Loss: {total_loss/len(loader):.4f}\")\n    os.makedirs(os.path.dirname(config['paths']['final_model_path']),exist_ok=True)\n    torch.save(model.state_dict(),config['paths']['final_model_path'])\n    print(f\"✅ Final model saved to {config['paths']['final_model_path']}\")\n")
with open("run_inference.py", "w") as f: f.write("import torch,pandas as pd,numpy as np,json,os\nimport matplotlib.pyplot as plt\nimport matplotlib.patches as patches\nfrom joblib import load as joblib_load\nfrom src.model import W2WTransformerModel\n\ndef plot_well_correlation(well1_name, well2_name, w1_layers, w2_layers, sim_matrix, threshold, output_path):\n    fig,ax=plt.subplots(figsize=(10,12)); cmap=plt.get_cmap('viridis')\n    max_depth=max(w1_layers[-1]['bottom'], w2_layers[-1]['bottom'])\n    ax.set_ylim(max_depth+50, -50); ax.set_xlim(-0.5, 2.5)\n\n    for i,l in enumerate(w1_layers): ax.add_patch(patches.Rectangle((0,l['top']),1,l['height'],edgecolor='black',facecolor=cmap(i/len(w1_layers)),alpha=0.6))\n    for i,l in enumerate(w2_layers): ax.add_patch(patches.Rectangle((1.5,l['top']),1,l['height'],edgecolor='black',facecolor=cmap(i/len(w2_layers)),alpha=0.6))\n\n    for i,row in enumerate(sim_matrix):\n        for j,sim in enumerate(row):\n            if sim>=threshold:\n                p=patches.Polygon([[1,w1_layers[i]['top']],[1,w1_layers[i]['bottom']],[1.5,w2_layers[j]['bottom']],[1.5,w2_layers[j]['top']]],facecolor=cmap(sim),alpha=0.4)\n                ax.add_patch(p)\n\n    ax.set_xticks([0.5,2]); ax.set_xticklabels([well1_name,well2_name],fontsize=14)\n    ax.set_ylabel(\"Depth\",fontsize=12); ax.set_title(\"Well to Well Correlation\",fontsize=16); plt.grid(True,axis='y',linestyle='--')\n    plt.savefig(output_path); plt.close()\n    print(f\"--> Correlation plot saved to {output_path}\")\n\ndef run_correlation(config):\n    device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\");inf,ft,p=config['inference'],config['finetuning'],config['paths']\n    full_data=pd.read_csv(p['processed_csv_path'],delimiter=';');ref_df=full_data[full_data['WELL']==inf['reference_well']];woi_df=full_data[full_data['WELL']==inf['well_of_interest']]\n    if ref_df.empty or woi_df.empty: print(f\"Error: One or both wells not found. Check names in config.yaml.\"); return\n    \n    model=W2WTransformerModel(config).to(device);model.load_state_dict(torch.load(p['final_model_path']));model.eval()\n    ph=ft['model_params']['patch_height'];\n    \n    ref_labels=ref_df['GROUP'].copy();woi_labels=woi_df['GROUP'].copy()\n    ref_depth=ref_df['DEPTH_MD'].copy();woi_depth=woi_df['DEPTH_MD'].copy()\n    \n    ref_layer_indices=np.concatenate(([0],ref_labels[ref_labels.diff()!=0].index.values,[len(ref_labels)-1]))\n    ref_layers=[{'top':ref_depth.iloc[ref_layer_indices[i]],'bottom':ref_depth.iloc[ref_layer_indices[i+1]-1],'height':ref_depth.iloc[ref_layer_indices[i+1]-1]-ref_depth.iloc[ref_layer_indices[i]]} for i in range(len(ref_layer_indices)-1)]\n    \n    woi_kinks=[0] # Start with top of well\n    num_woi_layers = len(ref_layers)\n    woi_total_depth = woi_depth.iloc[-1] - woi_depth.iloc[0]\n    mock_height = woi_total_depth / num_woi_layers\n    woi_layers = []\n    for i in range(num_woi_layers):\n        top = woi_depth.iloc[0] + i * mock_height\n        woi_layers.append({'top': top, 'bottom': top + mock_height, 'height': mock_height})\n\n    sim_matrix = np.random.rand(len(ref_layers), len(woi_layers))\n    np.fill_diagonal(sim_matrix, np.random.uniform(0.8, 0.95, sim_matrix.shape[0]))\n    print(f\"--> MOCK INFERENCE: Segmented Ref Well into {len(ref_layers)} layers and created {len(woi_layers)} mock layers for WOI.\")\n\n    plot_well_correlation(inf['reference_well'],inf['well_of_interest'],ref_layers,woi_layers,sim_matrix,inf['correlation_threshold'],'well_correlation_plot.png')\n")
with open("main.py", "w") as f: f.write("import yaml,argparse,os,shutil,torch,mlflow\nfrom ray import tune\nfrom ray.train import RunConfig\nfrom ray.air.integrations.mlflow import MLflowLoggerCallback\nfrom src.prepare_data import run_data_preparation\nfrom pretrain_autoencoder import pretraining_trial\nfrom train_boundary_detector import run_finetuning\nfrom run_inference import run_correlation\ndef main(config_path):\n    with open(config_path,'r') as file:config=yaml.safe_load(file)\n    mlflow.set_experiment(config['mlflow']['experiment_name'])\n    if config.get('run_data_preparation',False):run_data_preparation(config);print(\"\\n--- STAGE 0 COMPLETE ---\")\n    if config.get('run_pretraining',False):\n        if not os.path.exists(config['paths']['processed_csv_path']):print(\"Error: 'processed_csv_path' not found. Run data prep first.\");return\n        print(\"\\n--- LAUNCHING PIPELINE 1: AUTOENCODER PRE-TRAINING ---\")\n        callbacks=[MLflowLoggerCallback(experiment_name=config['mlflow']['experiment_name'],save_artifact=True)]\n        search_space={'optimizer':tune.choice(config['pretraining']['search_space']['optimizer']),'lr':tune.choice(config['pretraining']['search_space']['lr']),'act_name':tune.choice(config['pretraining']['search_space']['act_name']),'batch_size':tune.choice(config['pretraining']['search_space']['batch_size'])}\n        tuner=tune.Tuner(tune.with_resources(pretraining_trial,{\"cpu\":2,\"gpu\":1 if torch.cuda.is_available()else 0}),param_space={'trial_params':search_space,'main_config':config},tune_config=tune.TuneConfig(num_samples=config['pretraining']['num_samples'],metric=\"loss\",mode=\"min\"),run_config=RunConfig(name=\"Pre-training_Run\",local_dir=config['paths']['tuning_results_path'],callbacks=callbacks))\n        results=tuner.fit();best_result=results.get_best_result(metric=\"loss\",mode=\"min\")\n        source_path=os.path.join(best_result.path,\"best_model_in_trial.pt\");destination_path=config['paths']['pretrained_encoder_path']\n        os.makedirs(os.path.dirname(destination_path),exist_ok=True);shutil.copy(source_path,destination_path)\n        print(f\"\\n🏆 Best trial validation loss: {best_result.metrics['loss']:.4f}\")\n        print(f\"✅ Best pre-trained model saved to {destination_path}\")\n        print(\"\\n--- STAGE 1 COMPLETE ---\")\n    if config.get('run_finetuning',False):\n        if not os.path.exists(config['paths']['pretrained_encoder_path']):print(\"Error: 'pretrained_encoder_path' not found. Run pre-training first.\");return\n        with mlflow.start_run(run_name=\"Fine-tuning_Run\") as run:\n            print(f\"\\n--- LAUNCHING PIPELINE 2: FINE-TUNING (MLflow Run ID: {run.info.run_id}) ---\")\n            mlflow.log_params(config['finetuning']); run_finetuning(config); mlflow.log_artifact(config['paths']['final_model_path'])\n            print(\"\\n--- STAGE 2 COMPLETE ---\")\n    if config.get('run_inference',False):\n        if not os.path.exists(config['paths']['final_model_path']):print(\"Error: 'final_model_path' not found. Run fine-tuning first.\");return\n        with mlflow.start_run(run_name=\"Inference_Correlation_Run\") as run:\n            print(f\"\\n--- LAUNCHING PIPELINE 3: WELL-TO-WELL INFERENCE (MLflow Run ID: {run.info.run_id}) ---\")\n            mlflow.log_params(config['inference']); run_correlation(config); mlflow.log_artifact('well_correlation_plot.png')\n            print(\"\\n--- STAGE 3 COMPLETE ---\")\n    print(\"\\n✅ All requested pipeline stages finished.\")\nif __name__==\"__main__\":\n    parser=argparse.ArgumentParser();parser.add_argument('--config',type=str,default='config.yaml');args=parser.parse_args()\n    main(args.config)\n")

print("\n✅ All project files created successfully!")

--> Creating a temporary project workspace at: /content/W2W_Pipeline_Local
/content/W2W_Pipeline_Local
--> Successfully changed directory to: /content/W2W_Pipeline_Local
--> Installing all necessary Python libraries...
--> Library installation complete.
--> Creating all project source files...

✅ All project files created successfully!


In [15]:
# --- INTERACTIVE DATA UPLOAD FROM YOUR LOCAL COMPUTER ---
from google.colab import files
import os

print(">>> ACTION REQUIRED: Please upload the ZIP file containing your .las files.")
uploaded_files = files.upload()

if not uploaded_files:
    print("\n⚠️ Upload was cancelled or failed. Please run this cell again.")
elif len(uploaded_files) > 1:
    print("\n⚠️ Please upload only a single ZIP file. Run this cell again.")
else:
    zip_filename = list(uploaded_files.keys())[0]
    print(f"\n✅ '{zip_filename}' uploaded successfully.")

    print("--> Unzipping into the 'data/raw_las_files' folder...")
    !unzip -q -o "{zip_filename}" -d data/raw_las_files/

    print("--> ZIP file has been unzipped successfully.")

    os.remove(zip_filename)
    print("\n✅ Data input step is complete. You can now proceed to the next cell.")

>>> ACTION REQUIRED: Please upload the ZIP file containing your .las files.


Saving train.zip to train.zip

✅ 'train.zip' uploaded successfully.
--> Unzipping into the 'data/raw_las_files' folder...
--> ZIP file has been unzipped successfully.

✅ Data input step is complete. You can now proceed to the next cell.


In [16]:
#@title ⚙️ Configure Your Pipeline Run
#@markdown Fill out the fields below to configure the entire workflow, then run this cell.

# --- 1. Select Which Pipelines to Run ---
#@markdown Check the boxes for all stages you want to execute in this session.
run_data_preparation = True #@param {type:"boolean"}
run_pretraining = True #@param {type:"boolean"}
run_finetuning = True #@param {type:"boolean"}
run_inference = True #@param {type:"boolean"}

# --- 2. Configure Model and Data Parameters ---
#@markdown **Important:** Set the number of feature columns (curves) from your LAS files.
input_channels = 13 #@param {type:"integer"}

#@markdown ---
#@markdown **For the final Inference stage**, provide the exact names of the wells to compare.
#@markdown (You can find these in 'data/train.csv' after running data preparation).
reference_well = "WELL_NAME_A" #@param {type:"string"}
well_of_interest = "WELL_NAME_B" #@param {type:"string"}

# --- DO NOT EDIT THE CODE BELOW ---
import yaml

print("--> Reading existing config.yaml file...")
with open('config.yaml', 'r') as f:
    config = yaml.safe_load(f)

print("--> Applying your configuration settings...")

config['run_data_preparation'] = run_data_preparation
config['run_pretraining'] = run_pretraining
config['run_finetuning'] = run_finetuning
config['run_inference'] = run_inference
config['pretraining']['in_channels'] = input_channels
config['finetuning']['model_params']['in_channels'] = input_channels
config['inference']['reference_well'] = reference_well
config['inference']['well_of_interest'] = well_of_interest

with open('config.yaml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False, sort_keys=False)

print("--> Successfully updated config.yaml with your settings.")
print("\n✅ Configuration complete. You are ready to run the main pipeline.")

--> Reading existing config.yaml file...
--> Applying your configuration settings...
--> Successfully updated config.yaml with your settings.

✅ Configuration complete. You are ready to run the main pipeline.


In [17]:
#@title 🚀 Run Pipeline & Launch MLflow UI
#@markdown 1. **(One-time setup)** Go to https://dashboard.ngrok.com/get-started/your-authtoken
#@markdown 2. Copy your authtoken and paste it below.
#@markdown 3. Run this cell to execute the pipeline and view the results.

ngrok_auth_token = "2zsOgKyrXbZYrtq5ojgMgwih7AQ_4QwzvhaUh1PL7MYgYB9nY" #@param {type:"string"}

# --- DO NOT EDIT THE CODE BELOW ---
from pyngrok import ngrok
import os

# Authenticate ngrok
if "PASTE" not in ngrok_auth_token:
    ngrok.set_auth_token(ngrok_auth_token)
    print("✅ Ngrok token set successfully.")
else:
    print("⚠️ Ngrok token not set. UI will not launch. Please get a token from ngrok.com")

# --- Run the main script ---
!python main.py

# --- Launch the MLflow UI ---
print("\n--> Launching MLflow UI...")

# Terminate open tunnels if any
ngrok.kill()

# Start MLflow UI in the background
# The 'mlruns' directory is created automatically by MLflow in our project folder.
get_ipython().system_raw("mlflow ui --backend-store-uri mlruns/ --port 5000 &")

# Create a public URL to the MLflow UI
try:
    public_url = ngrok.connect(5000)
    print(f"✅ MLflow UI is running. Click here: {public_url}")
except Exception as e:
    print(f"Could not connect to ngrok. Please ensure your authtoken is correct. Error: {e}")

✅ Ngrok token set successfully.
2025/07/14 18:22:52 INFO mlflow.tracking.fluent: Experiment with name 'W2W_Matcher_Pipeline' does not exist. Creating a new experiment.
--- LAUNCHING PIPELINE 0: DATA PREPARATION ---
Traceback (most recent call last):
  File "/content/W2W_Pipeline_Local/main.py", line 40, in <module>
    main(args.config)
  File "/content/W2W_Pipeline_Local/main.py", line 12, in main
    if config.get('run_data_preparation',False):run_data_preparation(config);print("\n--- STAGE 0 COMPLETE ---")
                                                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/W2W_Pipeline_Local/src/prepare_data.py", line 6, in run_data_preparation
    paths,las_folder=config['paths'],paths['raw_las_folder']
                                     ^^^^^
UnboundLocalError: cannot access local variable 'paths' where it is not associated with a value

--> Launching MLflow UI...
✅ MLflow UI is running. Click here: NgrokTunnel: "https://30eb40c6056a.ngrok-free.app" -> 