In [3]:
import pandas as pd

# Schema of the master cable table: 29 fields, listed one per line and grouped by theme.
columns = [
    # identification / topology
    "Cable_ID",
    "Voltage_Level",
    "Feeder_ID",
    "Sub_Feeder_ID",
    "From_Switch",
    "To_Switch",
    # construction / installation
    "Cable_Type",
    "Cable_Age_Years",
    "Length_m",
    "Installation_Environment",
    # environment
    "Soil_Type",
    "Humidity",
    "Proximity_to_Water",
    # loading
    "Load_History_Avg_Load",
    "Load_History_Peak_Load",
    "Loading_Cycles",
    "Overload_Events",
    # diagnostics
    "IR_Measurement_MOhm",
    "Tan_Delta",
    "Partial_Discharge_Frequency",
    "Partial_Discharge_Intensity",
    "Thermal_History_Excursions",
    # fault / repair history
    "Num_Faults",
    "Fault_Type",
    "Repairs_Count",
    "Joint_History",
    # condition
    "Corrosivity",
    "Water_Ingress",
    "Remarks",
]

# Hand-written sample rows covering several voltage levels; swap in or import real data here.
# Each row lists its values in exactly the order of `columns`.
data = [
    # 33 kV switch-to-switch section
    [
        "33KV-SW1-SW2", "33kV", "", "", "SW1", "SW2",
        "XLPE", 7, 1200, "Underground",
        "Sandy", "Medium", "Far",
        60, 100, 150, 1,
        500, 0.001, 0, 0, 0,
        0, "None", 0, "Original",
        "Low", "No", "Healthy",
    ],
    # 22 kV switch-to-switch section
    [
        "22KV-SW3-SW4", "22kV", "", "", "SW3", "SW4",
        "PILC", 3, 800, "Overhead",
        "Clay", "High", "Near",
        50, 90, 100, 0,
        400, 0.002, 0, 0, 1,
        1, "Earth Fault", 1, "Repaired once",
        "Medium", "No", "Monitor",
    ],
    # 11 kV feeder section (no sub-feeder)
    [
        "FDR1-DT001", "11kV", "FDR1", "", "", "",
        "XLPE", 8, 400, "Underground",
        "Loam", "Medium", "Near",
        40, 80, 120, 2,
        120, 0.003, 1, 3, 1,
        2, "Earth Fault", 2, "Multiple joints",
        "High", "Yes", "Healthy",
    ],
    # 11 kV sub-feeder section
    [
        "FDR2-SUB1-DT005", "11kV", "FDR2", "SUB1", "", "",
        "XLPE", 4, 220, "Underground",
        "Rocky", "Low", "Far",
        30, 50, 90, 0,
        200, 0.001, 0, 0, 0,
        0, "None", 0, "Original",
        "Low", "No", "",
    ],
]

df = pd.DataFrame.from_records(data, columns=columns)

# Persist the table as CSV (the Excel export is kept below, disabled).
# df.to_excel("/media/sagarkumar/New Volume/SAGAR/DATA_GENERATION/master_cable_data_final.xlsx", index=False)
df.to_csv(
    "/media/sagarkumar/New Volume/SAGAR/DATA_GENERATION/master_cable_data_final.csv",
    index=False,
)

print(df)


          Cable_ID Voltage_Level Feeder_ID Sub_Feeder_ID From_Switch  \
0     33KV-SW1-SW2          33kV                                 SW1   
1     22KV-SW3-SW4          22kV                                 SW3   
2       FDR1-DT001          11kV      FDR1                             
3  FDR2-SUB1-DT005          11kV      FDR2          SUB1               

  To_Switch Cable_Type  Cable_Age_Years  Length_m Installation_Environment  \
0       SW2       XLPE                7      1200              Underground   
1       SW4       PILC                3       800                 Overhead   
2                 XLPE                8       400              Underground   
3                 XLPE                4       220              Underground   

   ... Partial_Discharge_Frequency Partial_Discharge_Intensity  \
0  ...                           0                           0   
1  ...                           0                           0   
2  ...                           1                

In [None]:
import pandas as pd
import requests
import json
from tqdm import tqdm
import csv

# Read the input CSV into `df`; the extraction code below takes its text from this frame.
df = pd.read_csv(
    filepath_or_buffer='/media/sagarkumar/New Volume/SAGAR/2-year-data/TXN_NMS_HTLOGSHEET_SUPERFIX.csv'
)

# Prepare for LLM extraction

# Keys that every extraction result is guaranteed to contain (the fields requested in the prompt).
_EXTRACT_FIELDS = (
    'FAULT_TYPE', 'Size', 'Insulation', 'Voltage', 'Type', 'FROM', 'FROM_SWITCH',
    'TO', 'TO_SWITCH', 'DELAYED_REASON', 'FAULT_NATURE',
)


def _parse_llm_json(text):
    """Locate the JSON object inside raw model text and return it as a dict.

    Tolerates a leading ``<think>...</think>`` section (reasoning models may emit
    one before the answer), Markdown code fences, and stray prose around the
    object. Values are left untouched.

    Raises:
        ValueError: if no JSON object is present, the JSON is malformed
            (``json.JSONDecodeError`` is a ``ValueError``), or the decoded
            value is not a JSON object.
    """
    # Keep only what follows the last closing think tag; if there is none,
    # rpartition returns the whole text in the third slot.
    answer = text.rpartition("</think>")[2]

    # Slice from the first "{" to the last "}" instead of stripping fences by
    # string replacement, which would also mangle values that contain "json".
    start = answer.find("{")
    end = answer.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model output")

    parsed = json.loads(answer[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("model output is not a JSON object")
    return parsed


def ollama_extract(remarks):
    """Ask the local Ollama model to extract structured fault fields from free text.

    Args:
        remarks: raw fault-log text; it is interpolated verbatim into the prompt.

    Returns:
        dict containing every key in ``_EXTRACT_FIELDS``. Keys the model did not
        return are filled with ``""``; any extra keys the model returned are
        kept. On any failure (network error, HTTP error, unparseable reply) the
        error is printed and all expected keys map to ``""``, so a batch loop
        keeps going instead of losing the results gathered so far.
    """
    prompt = f"""
Given this raw fault log text (can contain newlines, commas, or quotes):

\"\"\"{remarks}\"\"\"

Extract these fields as JSON (return JSON object only):
- FAULT_TYPE: (Cable damage, Joint failure, Insulation breakdown, Third party damage, Termination failure, Under investigation, Not cable fault, Other)
- Size
- Insulation
- Voltage
- Type
- FROM
- FROM_SWITCH
- TO
- TO_SWITCH
- DELAYED_REASON
- FAULT_NATURE

If not found, use "" (empty string) as value. No explanation, only JSON.
"""

    response = None  # stays None if the request itself fails
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "deepseek-r1:70b",
                "prompt": prompt,
                "stream": False,
            },
            timeout=300
        )
        # Surface HTTP-level failures explicitly rather than as a KeyError below.
        response.raise_for_status()
        full_response = response.json()
        # Ollama returns {'response': 'JSON here', ...}
        parsed = _parse_llm_json(full_response['response'])
    except Exception as e:
        raw_output = response.text if response is not None else ""
        print("Error:", e, "\nRaw model output:", raw_output)
        parsed = {}

    # Start from an all-empty record so every result has the same keys.
    record = {k: "" for k in _EXTRACT_FIELDS}
    record.update(parsed)
    return record

# Only run a sample for demo, or remove .head(10) for all data.
# Missing remarks are filled BEFORE the cast to str: on an object-dtype column
# astype(str) turns NaN into the literal text "nan", after which fillna("") has
# nothing left to replace and "nan" would be sent to the model.
remarks_list = df['FREE_REMARKS'].head(10).fillna("").astype(str)

# One extraction call per remark, in row order; tqdm shows progress.
results = [ollama_extract(r) for r in tqdm(remarks_list, desc="LLM extracting")]

llm_df = pd.DataFrame(results)

# Put the extracted fields next to the source rows they came from. Both frames
# get a fresh 0..n-1 index so the column-wise concat aligns row by row.
output_df = pd.concat(
    [df.head(len(llm_df)).reset_index(drop=True), llm_df.reset_index(drop=True)],
    axis=1,
)

# Save with robust quoting (handles commas, quotes, newlines)
output_df.to_csv(
    "/media/sagarkumar/New Volume/SAGAR/DATA_GENERATION/processed_fault_data_LLM.csv",
    index=False,
    quoting=csv.QUOTE_ALL
)

print("Saved to processed_fault_data_LLM.csv")
print(output_df.head())
