CHECK THE DATA 

In [24]:
import csv

file_path = '/media/sagarkumar/New Volume/SAGAR/Book 2(Sheet1).csv'
linebreak_count = 0
ctrl_count = 0

with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        for j, cell in enumerate(row):
            if '\n' in cell or '\r' in cell:
                print(f'Linebreak found in row {i+1}, column {j+1}: {repr(cell)}')
                linebreak_count += 1
            # Any ASCII control char except tab (9) and newline (10, 13)
            if any(ord(c) < 32 and ord(c) not in (9, 10, 13) for c in cell):
                print(f'Ctrl char found in row {i+1}, column {j+1}: {repr(cell)}')
                ctrl_count += 1
      

print(f'Total linebreak cells found: {linebreak_count}')
print(f'Total control-char cells found: {ctrl_count}')


Total linebreak cells found: 0
Total control-char cells found: 0


In [25]:
with open(file_path, 'r', encoding='utf-8') as f:
    expected = None
    for i, line in enumerate(f):
        cols = line.count(',') + 1
        if i == 0:
            expected = cols
        elif cols != expected:
            print(f"Row {i+1} has {cols} columns (Expected: {expected})")
      

In [26]:
import pandas as pd
import re

# Raw file read, no parse (just lines)
filename = "/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv"
with open(filename, encoding='utf-8', errors='ignore') as f:
    lines = f.readlines()

print("Total lines:", len(lines))

# Line breaks within a field detection (very likely in 'FREE_REMARKS' column)
for i, line in enumerate(lines):
    if '\n' in line or '\r' in line:
        # \n to har line me hoga, but internal bhi ho sakta hai
        if line.count('\n') > 1 or line.count('\r') > 1:
            print(f"Line {i+1}: Multiple linebreaks in a single line")

    # Look for possible tabs, weird ascii
    if re.search(r"[\t\x0b\x0c\x1b]", line):
        print(f"Line {i+1}: Contains tab or control char")

# Check for unclosed quotes
for i, line in enumerate(lines):
    if line.count('"') % 2 != 0:
        print(f"Line {i+1}: Unmatched double quote")

# Column count (by comma)
col_counts = [l.count(',') for l in lines]
mode_col_count = max(set(col_counts), key=col_counts.count)
for idx, c in enumerate(col_counts):
    if c != mode_col_count:
        print(f"Line {idx+1}: {c} columns (Expected: {mode_col_count})")


Total lines: 11058


CLEANING THE DATA

In [16]:
import csv
import re

input_file_path = '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1.csv'
output_file_path = '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv'

def clean_cell(cell):
    # Remove any ASCII control characters except tab (9), LF (10), CR (13)
    cell = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', cell)
    # Replace linebreaks within a cell with space
    cell = cell.replace('\n', ' ').replace('\r', ' ')
    return cell

with open(input_file_path, 'r', encoding='utf-8', errors='replace', newline='') as infile, \
     open(output_file_path, 'w', encoding='utf-8', errors='replace', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    for row in reader:
        new_row = [clean_cell(cell) for cell in row]
        writer.writerow(new_row)

print(f"Cleaned file written to {output_file_path}")


Cleaned file written to /media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv


PROCCESING THE DATA 

In [None]:
import pandas as pd
import re

# 1. Load file (final clean file)
df_original = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_cleaned.csv', low_memory=False)

# 2. Filter ENTRY_TYPE == 1
df_entry1 = df_original[df_original['ENTRY_TYPE'] == 1].copy().reset_index(drop=True)

# 3. UPPERCASE remarks
df_entry1['FREE_REMARKS_UPPER'] = df_entry1['FREE_REMARKS'].astype(str).str.upper()

# 4. Extraction functions
def extract_size(text):
    m = re.search(r'X\s*([\d\.\+\-\s]*SQ\.?\s*MM)', text)
    return m.group(1).replace(" ", "") if m else ""

def extract_insulation(text):
    m = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text)
    return m.group(1) if m else ""

def extract_voltage(text):
    m = re.search(r'(\d{2,3})\s*KV', text)
    return f"{m.group(1)}KV" if m else ""

def extract_type(text):
    m = re.search(r'(HTCF SECTION|LT SECTION|HT SECTION)', text)
    return m.group(1) if m else ""

def extract_section(text):
    m = re.search(r'BETWEEN\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)', text)
    if m:
        return pd.Series([m.group(1).strip(), m.group(2), m.group(3).strip(), m.group(4)])
    return pd.Series(["", "", "", ""])

def extract_delayed_reason(text):
    m = re.search(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', text)
    return m.group(1).strip() if m else ""

def extract_fault_nature(text):
    patterns = [
        'CABLE DAMAGED BY', 'DMS OFFLINE AT', 'FPI MALFUNCTION',
        'FAILED TO OPEN', 'FAILED TO CLOSE', 'WENT OFFLINE',
        'FEEDER TRIPPING', 'TRAFFIC ON', 'SUPPLY RESTORATION DELAYED'
    ]
    found = []
    for pat in patterns:
        for m in re.finditer(r'({0}.*?)(?:\.|,|;|NOTIFICATION NO|$)'.format(re.escape(pat)), text):
            found.append(m.group(1).strip())
    return '; '.join(found) if found else ""

# 5. Extraction (create new DataFrame for output)
out = pd.DataFrame()
out['Size'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_size)
out['Insulation'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_insulation)
out['Voltage'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_voltage)
out['Type'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_type)

section_cols = df_entry1['FREE_REMARKS_UPPER'].apply(extract_section)
section_cols.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, section_cols], axis=1)

out['DELAYED_REASON'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_delayed_reason)
out['FAULT_NATURE'] = df_entry1['FREE_REMARKS_UPPER'].apply(extract_fault_nature)

# Time columns
out['TIME_OUTAGE'] = df_entry1['TIME_OUTAGE'].astype(str)
out['MAIN_REPORTED_TIME'] = pd.to_datetime(df_entry1['MAIN_REPORTED_TIME'], errors='coerce')
out['TIME_RESTORED'] = pd.to_datetime(df_entry1['TIME_RESTORED'], errors='coerce')
out['TIME_DIFFERENCE'] = (out['TIME_RESTORED'] - out['MAIN_REPORTED_TIME']).dt.total_seconds() / 60

# 6. Save
final_cols = [
    'Size', 'Insulation', 'Voltage', 'Type',
    'FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH',
    'DELAYED_REASON', 'FAULT_NATURE',
    'TIME_OUTAGE', 'MAIN_REPORTED_TIME', 'TIME_RESTORED', 'TIME_DIFFERENCE'
]
out[final_cols].to_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_FINAL.csv', index=False)
print("Extracted file written.")
print(out[final_cols].head(10))


ONLY FREE_REMARKS ANALYZE


In [30]:
import pandas as pd
import re

# 1. Load the file
df = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_FREE_REMARKS_ONLY.csv', usecols=['FREE_REMARKS'], low_memory=False)

# 2. UPPERCASE remarks for consistency
df['FREE_REMARKS_UPPER'] = df['FREE_REMARKS'].astype(str).str.upper()

# 3. Extraction functions
def extract_size(text):
    m = re.search(r'X\s*([\d\.\+\-\s]*SQ\.?\s*MM)', text)
    return m.group(1).replace(" ", "") if m else ""

def extract_insulation(text):
    m = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text)
    return m.group(1) if m else ""

def extract_voltage(text):
    m = re.search(r'(\d{2,3})\s*KV', text)
    return f"{m.group(1)}KV" if m else ""

def extract_type(text):
    m = re.search(r'(HTCF SECTION|LT SECTION|HT SECTION)', text)
    return m.group(1) if m else ""

def extract_section(text):
    m = re.search(r'BETWEEN\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)', text)
    if m:
        return pd.Series([m.group(1).strip(), m.group(2), m.group(3).strip(), m.group(4)])
    return pd.Series(["", "", "", ""])

def extract_delayed_reason(text):
    m = re.search(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', text)
    return m.group(1).strip() if m else ""

def extract_fault_nature(text):
    patterns = [
        'CABLE DAMAGED BY', 'DMS OFFLINE AT', 'FPI MALFUNCTION',
        'FAILED TO OPEN', 'FAILED TO CLOSE', 'WENT OFFLINE',
        'FEEDER TRIPPING', 'TRAFFIC ON', 'SUPPLY RESTORATION DELAYED'
    ]
    found = []
    for pat in patterns:
        for m in re.finditer(r'({0}.*?)(?:\.|,|;|NOTIFICATION NO|$)'.format(re.escape(pat)), text):
            found.append(m.group(1).strip())
    return '; '.join(found) if found else ""

# 4. Apply extraction functions
out = pd.DataFrame()
out['Size'] = df['FREE_REMARKS_UPPER'].apply(extract_size)
out['Insulation'] = df['FREE_REMARKS_UPPER'].apply(extract_insulation)
out['Voltage'] = df['FREE_REMARKS_UPPER'].apply(extract_voltage)
out['Type'] = df['FREE_REMARKS_UPPER'].apply(extract_type)

section_cols = df['FREE_REMARKS_UPPER'].apply(extract_section)
section_cols.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, section_cols], axis=1)

out['DELAYED_REASON'] = df['FREE_REMARKS_UPPER'].apply(extract_delayed_reason)
out['FAULT_NATURE'] = df['FREE_REMARKS_UPPER'].apply(extract_fault_nature)

# 5. Save results
out.to_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv', index=False)
print("Extracted file written.")
print(out.head(10))


Extracted file written.
                        Size Insulation Voltage          Type  \
0                                         220KV                 
1                                                               
2                      SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
3            .06+70+120SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
4  0.15+185+225+300+240SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
5                   120SQ.MM       PILC    11KV  HTCF SECTION   
6                   120SQ.MM       PILC    11KV  HTCF SECTION   
7                   120SQ.MM       PILC    11KV  HTCF SECTION   
8                   120SQ.MM       PILC    11KV  HTCF SECTION   
9                   120SQ.MM       PILC    11KV  HTCF SECTION   

                       FROM FROM_SWITCH                      TO TO_SWITCH  \
0                                                                           
1                                                                           
2  VISHWESHWAR NAGAR HETALI  

In [9]:
#!/usr/bin/env python3
# -----------------------------------------------------------
#  Pull structured fields out of HT-cable FREE_REMARKS strings
# -----------------------------------------------------------
import re
import pandas as pd

# ─── Edit these two paths ───────────────────────────────────
SRC  = r'/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/Book1.csv'   # has FREE_REMARKS column
DEST = r'/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv'
# ─────────────────────────────────────────────────────────────

# 1) Load and normalise ----------------------------------------------------------
df_txt = (
    pd.read_csv(SRC, usecols=['FREE_REMARKS'], low_memory=False)
      .fillna('')
      .assign(TXT=lambda d: d['FREE_REMARKS'].str.upper())
)

# 2) Pre-compile the regexes -----------------------------------------------------
RE_SIZE        = re.compile(r'\bX\s*([\d+\-.\s]*SQ\.?\s*MM)')            # 120SQ.MM  0.15+185+240SQ.MM
RE_INSULATION  = re.compile(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)')
RE_VOLTAGE     = re.compile(r'(\d{2,3})\s*KV')
RE_TYPE        = re.compile(r'(HTCF SECTION|LT SECTION|HT SECTION)')
RE_SECTION     = re.compile(
    r'BETWEEN\s+(.+?)\s+S/S\s+SW\.?ITCH:?\s*([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SW\.?ITCH:?\s*([0-9]+)',
    flags=re.I
)
RE_DELAYED     = re.compile(
    r'(?:ISOLATION\s*&?\s*RESTORATION\s+)?DELAYED (?:AS|DUE TO)\s*(.*?)(?:NOTIFICATION NO|$)',
    flags=re.I
)

FAULT_TOKENS = [
    'CABLE DAMAGED BY', 'CABLE DAMAGED',     # covers slightly shorter variant
    'DMS OFFLINE AT', 'WENT OFFLINE',
    'FPI MALFUNCTION', 'FPI FAULTY',
    'FAILED TO OPEN',  'FAILED TO CLOSE',
    'FEEDER TRIPPING', 'TRAFFIC ON',
    'SUPPLY RESTORTION DELAYED', 'SUPPLY RESTORATION DELAYED'
]
# create one big alternation for speed
RE_FAULT = re.compile(
    '(' + '|'.join(re.escape(tok) for tok in FAULT_TOKENS) + r'.*?)(?:\.|,|;|NOTIFICATION NO|$)',
    flags=re.I
)

# 3) Utility ---------------------------------------------------------------------
def _first(regex, text, fmt=lambda m: m.group(1)):
    m = regex.search(text)
    return fmt(m) if m else ''

def _faults(text: str) -> str:
    return '; '.join(m.group(1).strip() for m in RE_FAULT.finditer(text))

# 4) Column extraction -----------------------------------------------------------
out = pd.DataFrame({
    'Size'       : df_txt['TXT'].apply(lambda t: _first(RE_SIZE, t, lambda m: m.group(1).replace(' ', ''))),
    'Insulation' : df_txt['TXT'].apply(lambda t: _first(RE_INSULATION, t)),
    'Voltage'    : df_txt['TXT'].apply(lambda t: _first(RE_VOLTAGE, t, lambda m: f"{m.group(1)}KV")),
    'Type'       : df_txt['TXT'].apply(lambda t: _first(RE_TYPE, t))
})

# --- FROM / TO / SWITCH columns -------------------------------------------------
sec = df_txt['TXT'].str.extract(RE_SECTION)
sec.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, sec], axis=1)

# --- delay / fault --------------------------------------------------------------
out['DELAYED_REASON'] = df_txt['TXT'].apply(lambda t: _first(RE_DELAYED, t).strip())
out['FAULT_NATURE']   = df_txt['TXT'].apply(_faults)

# 5) Final tidy-up & write -------------------------------------------------------
out = out[['Size','Insulation','Voltage','Type',
           'FROM','FROM_SWITCH','TO','TO_SWITCH',
           'DELAYED_REASON','FAULT_NATURE']]

out.to_csv(DEST, index=False)
print(f'✓ Parsed {len(out):,} records  →  {DEST}')
print(out.head(10).to_markdown())


✓ Parsed 24,228 records  →  /media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv
|    | Size                      | Insulation   | Voltage   | Type         | FROM                     |   FROM_SWITCH | TO                     |   TO_SWITCH | DELAYED_REASON                                                                                                                                                                                                                                                                                                                                                                    | FAULT_NATURE   |
|---:|:--------------------------|:-------------|:----------|:-------------|:-------------------------|--------------:|:-----------------------|------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------

ANALYZE THE DESCRIPTION COLUMN 

In [1]:
# Count the number of rows where the DESCRIPTION column contains the word "SWITCH"
import pandas as pd
df = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/DESCRIPTION.csv', low_memory=False)
rows_with_switch = df['DESCRIPTION'].str.contains(r'\bSWITCH\b', case=False, na=False).sum()
print(f'Number of rows containing the word "SWITCH": {rows_with_switch}')

Number of rows containing the word "SWITCH": 21584


In [2]:
len(df)  # Total number of rows in the DataFrame

23939

In [8]:

rows_with_switch = df['DESCRIPTION'].str.contains(r'\b11 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "11kV": {rows_with_switch}')

Number of rows containing the word "11kV": 19598


In [9]:


rows_with_switch_22 = df['DESCRIPTION'].str.contains(r'\b22 KV\b', case=False, na=False).sum()
row_with_33 = df['DESCRIPTION'].str.contains(r'\b33 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "22kV": {rows_with_switch_22}')
print(f'Number of rows containing the word "33kV": {row_with_33}')


Number of rows containing the word "22kV": 627
Number of rows containing the word "33kV": 1194


In [10]:
total = rows_with_switch + row_with_33 + rows_with_switch_22
print(f'Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": {total}')

Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": 21419


In [12]:
# Count the number of rows where the DESCRIPTION column contains the word "SWITCH"
import pandas as pd
df = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_FREE_REMARKS_ONLY_clean.csv', low_memory=False)
rows_with_switch = df['FREE_REMARKS'].str.contains(r'\bSWITCH\b', case=False, na=False).sum()
print(f'Number of rows containing the word "SWITCH": {rows_with_switch}')

Number of rows containing the word "SWITCH": 21749


In [14]:

rows_with_switch = df['FREE_REMARKS'].str.contains(r'\b11 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "11kV": {rows_with_switch}')


rows_with_switch_22 = df['FREE_REMARKS'].str.contains(r'\b22 KV\b', case=False, na=False).sum()
row_with_33 = df['FREE_REMARKS'].str.contains(r'\b33 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "22kV": {rows_with_switch_22}')
print(f'Number of rows containing the word "33kV": {row_with_33}')


Number of rows containing the word "11kV": 19777
Number of rows containing the word "22kV": 680
Number of rows containing the word "33kV": 1294


In [16]:
total = rows_with_switch + row_with_33 + rows_with_switch_22
print(f'Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": {total}')

Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": 21751


MAtching the feeder id with thE cleaned  DATA OF FAULT collmn SWITCH_NO

In [35]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1 Details
file1_path = "/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv"
file1_column_name = "SWITCH_NO"

# File 2 Details
file2_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv"
file2_column_name = "FEEDER_ID"



# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Helper function to find a column name, ignoring case."""
    for col in columns:
        if str(col).lower() == str(target_name).lower():
            return col
    return None

def get_unique_values_from_file(filepath, column_name):
    """
    Reads a file, extracts unique values from a specific column,
    cleans them, and returns them as a set.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read just the header to find the correct column name (case-insensitive)
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip')
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        if not actual_col_name:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Read the full column using the correct name
        df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip')
        
        # --- Data Cleaning ---
        # Convert all values to string, extract digits, and then convert to numbers.
        # This handles mixed data types (e.g., '123' vs 123) and text prefixes (e.g., 'SW-123').
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        s = s.str.extract('(\d+)').iloc[:, 0]
        s = pd.to_numeric(s, errors='coerce')
        
        cleaned_values = set(s.dropna().astype(int))
        
        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Get the unique values from both files
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Use set intersection to find the values that exist in both sets
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)
    
    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)
 

Processing file: /media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv...
  - Found 1268 unique, clean values.
Processing file: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv...
  - Found 945 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv' (Column: SWITCH_NO): 1268
Unique values in '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv' (Column: FEEDER_ID): 945
-------------------------
Number of values that matched: 824
-------------------------


In [32]:
matching_values_list = sorted(matching_values)
print("Matching values (sorted):")
output_file_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/order_id.csv"
matching_values_df = pd.DataFrame(matching_values_list, columns=[file1_column_name])
matching_values_df.to_csv(output_file_path, index=False)
print(f"Matching values saved to: {output_file_path}")

Matching values (sorted):
Matching values saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/order_id.csv


ORDER_ID AND SWITCH NO IN ORDER ID MATCH WITH SWITCH NO IS 89 BUT ORDER_ID MATCH THE FEDER _ID WITH 99

In [61]:
match = unique_values_from_file1.intersection(matching_values_list)
print(f"Number of values that matched: {len(match)}")


print(f"Total matching values: {len(matching_values_list)}")
match_list = sorted(match)

Number of values that matched: 89
Total matching values: 99


In [57]:
# Find values in match that are NOT in matching_values_list
diff_match_vs_matching_values_list = sorted(set(matching_values_list) - set(match))
print("Values in 'match' but not in 'matching_values_list':", diff_match_vs_matching_values_list)
print(f"Count: {len(diff_match_vs_matching_values_list)}")

Values in 'match' but not in 'matching_values_list': [26332, 28192, 29986, 30467, 30668, 32164, 32529, 32530, 34678, 39411]
Count: 10


In [9]:
import re
import pandas as pd

# Test on a small subset first
df = pd.read_csv('HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv', low_memory=False)

def extract_station_switch(remark):
    remark = str(remark)
    # Pattern 1: S/S SWITCH:
    pattern1 = (
        r'BETWEEN\s+(.+?)\s+S/S\s+SWITCH[:\s]*([A-Z0-9]+)\s+TO\s+(.+?)\s+S/S\s+SWITCH[:\s]*([A-Z0-9]+)'
    )
    m1 = re.search(pattern1, remark, re.IGNORECASE)
    if m1:
        # For debug: print matched groups
        print("Pattern1 matched:", m1.groups())
        return pd.Series([m1.group(1).strip(), m1.group(2).strip(), m1.group(3).strip(), m1.group(4).strip()])
    # Pattern 2: REC-STN SWITCH:
    pattern2 = (
        r'BETWEEN\s+(.+?)\s+REC[-\s]?STN\s+SWITCH[:\s]*([A-Z0-9]+)\s+TO\s+(.+?)\s+REC[-\s]?STN\s+SWITCH[:\s]*([A-Z0-9]+)'
    )
    m2 = re.search(pattern2, remark, re.IGNORECASE)
    if m2:
        print("Pattern2 matched:", m2.groups())
        return pd.Series([m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), m2.group(4).strip()])
    # Pattern 3: S/S SW.NO.
    pattern3 = (
        r'BETWEEN\s+(.+?)\s+S/S\s+SW\.NO\.[:\s]*([A-Z0-9]+)\s+TO\s+(.+?)\s+S/S\s+SW\.NO\.[:\s]*([A-Z0-9]+)'
    )
    m3 = re.search(pattern3, remark, re.IGNORECASE)
    if m3:
        print("Pattern3 matched:", m3.groups())
        return pd.Series([m3.group(1).strip(), m3.group(2).strip(), m3.group(3).strip(), m3.group(4).strip()])
    # If nothing matches, return blanks (never the full remark!)
    print("No match:", remark)
    return pd.Series(['', '', '', ''])

# Try on first 10 rows (for debug)
test_df = df.head(10).copy()
test_df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH']] = test_df['FREE_REMARKS'].apply(extract_station_switch)

# See what gets printed for your rows!


Pattern1 matched: ('VISHWESHWAR NAGAR HETALI', '40478', 'SAMANT', '05587')
Pattern1 matched: ('JAWAHAR NAGAR NO.3', '13847', 'SONAVALA ESTATE NO.1', '00714')
No match: 1) 3C X 0.15+185+225+300+240 SQ. MM PILC+XLPE 11 KV HTCF SECTION BETWEEN GOREGAON REC-STN SWITCH:01523 TO SIDDHARTHA NAGAR NO.1 S/S SWITCH:02310. DMS USED FOR ISOLATION AND RESTORATION. ISOLATION & RESTORATION DELAYED DUE TO 1)FAILED TO CLOSE GOREGAON SHOPPING CENTRE S/S DMS SW.NO.04673 FAILED TO OPERATE FROM SCADA. NOTIFICATION NO:- 002001589689
Pattern1 matched: ('AAREY NO.2', '18066', 'AAREY UNIT NO.7 KIOSK', '18008')
Pattern1 matched: ('AAREY SANKRAMAN STUDIO', '19112', 'AAREY UNIT NO.2', '19710')
Pattern1 matched: ('AJIT PARK', '15725', 'TUREL PAKHADI NO.1', '06900')
Pattern1 matched: ('BABREKAR NAGAR NO.1', '18669', 'KANDIVLI HOUSING NO.3', '18671')
Pattern1 matched: ('BHAGATSINGH NAGAR NO.1', '28860', 'BHAGATSINGH NAGAR NO.3', '34127')
Pattern1 matched: ('DINDOSHI VASAHAT CENTRAL', '17141', 'DINDOSHI VASAHAT SOUTH

In [12]:
import pandas as pd
import re

def extract_station_switch_diag(remark):
    remark = str(remark)
    hybrid_pattern = (
        r'BETWEEN\s+(.+?)\s+(?:S/S|REC[-\s]?STN)\s+SWITCH[:\s]*([A-Z0-9]+)\s+TO\s+(.+?)\s+(?:S/S|REC[-\s]?STN)\s+SWITCH[:\s]*([A-Z0-9]+)'
    )
    m = re.search(hybrid_pattern, remark, re.IGNORECASE)
    if m:
        src_ss = re.sub(r'(?:S/S|REC[-\s]?STN)\s*$', '', m.group(1).strip(), flags=re.IGNORECASE)
        from_sw = m.group(2).strip()
        dst_ss = re.sub(r'(?:S/S|REC[-\s]?STN)\s*$', '', m.group(3).strip(), flags=re.IGNORECASE)
        to_sw = m.group(4).strip()
        # print(f"[MATCH] SOURCE_SS: '{src_ss}' | FROM_SWITCH: '{from_sw}' | DESTINATION_SS: '{dst_ss}' | TO_SWITCH: '{to_sw}'")
        return pd.Series([src_ss, from_sw, dst_ss, to_sw])
    else:
        print(f"[NO MATCH] REMARK: {remark}")
        return pd.Series(['', '', '', ''])

# Usage with your DataFrame
df = pd.read_csv('HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv', low_memory=False)
df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH']] = df['FREE_REMARKS'].apply(extract_station_switch_diag)



[NO MATCH] REMARK: 1) 3C X 300 SQ. MM PILC 22 KV HTCF SECTION BETWEEN TATA S VIKHROLI REC-STN SWITCH:T22VIK05_10 TO TILAK NAGAR REC-STN SWITCH:33054. NOTIFICATION NO:- 002001590522
[NO MATCH] REMARK: 1) 3C X 300 SQ. MM XLPE 22 KV HTCF SECTION BETWEEN TATA S VIKHROLI REC-STN SWITCH:T22VIK02_03 TO VIKHROLI REC-STN SWITCH:01171. NOTIFICATION NO:- 002001610983
[NO MATCH] REMARK: 1) 3C X 300+400 SQ. MM PILC+XLPE 22 KV HTCF SECTION BETWEEN TATA S VIKHROLI REC-STN SWITCH:T22VIK07_04 TO TAGORE NGR. REC-STN SWITCH:01103. NOTIFICATION NO:- 002001644852
[NO MATCH] REMARK: 1) 3C X 400 SQ. MM XLPE 22 KV HTCF SECTION BETWEEN TATA S VIKHROLI REC-STN SWITCH:T22VIK06_01 TO VIKHROLI REC-STN SWITCH:01135. CABLE DAMAGED BY MCGM BRIGDE DEPT ON ANDHERI GHATKOPAR LINK RD OPP. GHAT BUS DEPT NEAR GODREJ COMPOUND WALL NEAR LAXMI NGR NALA. NOTIFICATION NO:- 002001646319
[NO MATCH] REMARK: 1) 3C X 400 SQ. MM XLPE 22 KV HTCF SECTION BETWEEN TATA S VIKHROLI REC-STN SWITCH:T22VIK07_04 TO SHIVAJI NAGAR REC-STN SWITCH

In [13]:
import pandas as pd
import re

def extract_station_switch_diag(remark):
    remark = str(remark)
    pattern = re.compile(
        r'BETWEEN\s+(.+?)(?:\s+(?:S/S|REC[\s-]?STN|R/S|S\.S\.))?\s+SWITCH[:\s]*([A-Z0-9_]+)\s+TO\s+(.+?)(?:\s+(?:S/S|REC[\s-]?STN|R/S|S\.S\.))?\s+SWITCH[:\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    m = pattern.search(remark)
    if m:
        src_ss = m.group(1).strip()
        from_sw = m.group(2).strip()
        dst_ss = m.group(3).strip()
        to_sw = m.group(4).strip()
        # print(f"[MATCH] SOURCE_SS: '{src_ss}' | FROM_SWITCH: '{from_sw}' | DESTINATION_SS: '{dst_ss}' | TO_SWITCH: '{to_sw}'")
        return pd.Series([src_ss, from_sw, dst_ss, to_sw])
    else:
        print(f"[NO MATCH] REMARK: {remark}")
        return pd.Series(['', '', '', ''])

# Usage with your DataFrame:
df = pd.read_csv('HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv', low_memory=False)
df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH']] = df['FREE_REMARKS'].apply(extract_station_switch_diag)


[NO MATCH] REMARK: nan
[NO MATCH] REMARK: 1) 3C X 400 SQ. MM XLPE 33 KV HTCF SECTION BETWEEN TO OSHIWARA RECEIVING STATION S/S SWITCH:33294. DMS USED FOR ISOLATION AND RESTORATION. NOTIFICATION NO:- 002002044113
[NO MATCH] REMARK: nan
[NO MATCH] REMARK: nan
[NO MATCH] REMARK: NOTIFICATION NO:- 1
[NO MATCH] REMARK: nan
[NO MATCH] REMARK: TRIPPED DUE TO R-PHASE JUMPER BURNT AT TOWER 76A MONOPOLE.. RESTORATION DELAYED DUE TO 1)FAILED TO CLOSE VIRVANI IND.NO.1 S/S DMS SW.NO.05168 FAILED TO OPERATE FROM SCADA. 2)FAILED TO CLOSE GANDHI NAGAR NO.4 S/S DMS SW.NO.11378 FAILED TO OPERATE FROM SCADA. 3) DMS OFFLINE AT OBEROI MALL S/S. RESTORATION DELAYED DUE TO ONGOING OUTAGE OF 33KV AEML GOREGAON FEEDER 14 ( DINDOSHI 20 MVA-T2+ OMKAR 20 MVA-T2+NIRLON 20 MVA-T3+ NESCO 20MVA-T1).
[NO MATCH] REMARK: NOTIFICATION NO:- 111
[NO MATCH] REMARK: NO FAULT AFTER TESTING. NOTIFICATION NO:- 002001284566
[NO MATCH] REMARK: nan
[NO MATCH] REMARK: 1) 3C X SQ. MM PILC+XLPE 11 KV HTCF SECTION BETWEEN 38664 S/S SW

Pattern with maximum match and analye the free_remark and extract the switch_ no and station


In [31]:
import pandas as pd
import re
import numpy as np

# Load your data
input_file = 'HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv'
output_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed.csv'
no_match_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match.csv'

df = pd.read_csv(input_file, low_memory=False)

# Extraction function
def extract_remark_info(text):
    if pd.isnull(text):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
    # STD_CABLE_SIZE
    cable_match = re.search(r'(\d{1,4}\s*(?:\+|\s*x\s*)?\s*\d{1,4}(?:\+\d{1,4})*?)\s*SQ\.*\s*MM', text.upper())
    cable_size = cable_match.group(1).replace(' ', '') if cable_match else np.nan
    # VOLTAGE
    voltage_match = re.search(r'(\d{2,3}\s*KV)', text.upper())
    voltage = voltage_match.group(1).replace(' ', '') if voltage_match else np.nan
    # CABLE_TYPE extraction
    cable_type_match = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text.upper())
    cable_type = cable_type_match.group(1) if cable_type_match else np.nan
    # Main pattern
    pattern = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+).*?TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match = pattern.search(text)
    if match:
        src, from_sw, dst, to_sw = match.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    # Try alternate pattern
    pattern_alt = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)\s*TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match_alt = pattern_alt.search(text)
    if match_alt:
        src, from_sw, dst, to_sw = match_alt.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    # No match
    return pd.Series([np.nan, np.nan, np.nan, np.nan, voltage, cable_size, cable_type])


# Apply extraction
df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH', 'VOLTAGE', 'STD_CABLE_SIZE', 'CABLE_TYPE']] = (
    df['FREE_REMARKS'].apply(extract_remark_info)
)


# Calculate TIME_DIFFERENCE_HOURS
df['TIME_OUTAGE'] = pd.to_datetime(df['TIME_OUTAGE'], errors='coerce')
df['TIME_RESTORED'] = pd.to_datetime(df['TIME_RESTORED'], errors='coerce')
df['TIME_DIFFERENCE_HOURS'] = (df['TIME_RESTORED'] - df['TIME_OUTAGE']).dt.total_seconds() / 3600

# Columns for output
final_cols = [
    'SWITCH_NO', 'STATION_NAME', 'STD_CABLE_SIZE', 'CABLE_TYPE', 'TIME_OUTAGE',
    'SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH',
    'VOLTAGE', 'TIME_DIFFERENCE_HOURS'
]

out_df = df[final_cols]
out_df.to_csv(output_file, index=False)
print("Done! Output saved to:", output_file)

# --- Extract NO-MATCH CASES ---

# Rows where nothing was matched for station or switch info
no_match_mask = (
    df['SOURCE_SS'].isnull() &
    df['FROM_SWITCH'].isnull() &
    df['DESTINATION_SS'].isnull() &
    df['TO_SWITCH'].isnull()
)
no_match_df = df.loc[no_match_mask, ['FREE_REMARKS', 'VOLTAGE', 'STD_CABLE_SIZE']]
no_match_df.to_csv(no_match_file, index=False)
print("No-match rows saved to:", no_match_file)


Done! Output saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed.csv
No-match rows saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match.csv


Fault nature using the basic nlp 

In [56]:
import pandas as pd
import re
import numpy as np

# Load your data
input_file = 'HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv'
output_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed.csv'
no_match_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match.csv'

df = pd.read_csv(input_file, low_memory=False)

def extract_remark_info(text):
    if pd.isnull(text):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
    cable_match = re.search(r'(\d{1,4}\s*(?:\+|\s*x\s*)?\s*\d{1,4}(?:\+\d{1,4})*?)\s*SQ\.*\s*MM', text.upper())
    cable_size = cable_match.group(1).replace(' ', '') if cable_match else np.nan
    voltage_match = re.search(r'(\d{2,3}\s*KV)', text.upper())
    voltage = voltage_match.group(1).replace(' ', '') if voltage_match else np.nan
    cable_type_match = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text.upper())
    cable_type = cable_type_match.group(1) if cable_type_match else np.nan
    pattern = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+).*?TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match = pattern.search(text)
    if match:
        src, from_sw, dst, to_sw = match.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    pattern_alt = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)\s*TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match_alt = pattern_alt.search(text)
    if match_alt:
        src, from_sw, dst, to_sw = match_alt.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    return pd.Series([np.nan, np.nan, np.nan, np.nan, voltage, cable_size, cable_type])

df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH', 'VOLTAGE', 'STD_CABLE_SIZE', 'CABLE_TYPE']] = (
    df['FREE_REMARKS'].apply(extract_remark_info)
)

df['TIME_OUTAGE'] = pd.to_datetime(df['TIME_OUTAGE'], errors='coerce')
df['TIME_RESTORED'] = pd.to_datetime(df['TIME_RESTORED'], errors='coerce')
df['TIME_DIFFERENCE_HOURS'] = (df['TIME_RESTORED'] - df['TIME_OUTAGE']).dt.total_seconds() / 3600

# --- NEW: Extract all delay points as separate rows ---

def extract_delay_points(text):
    # Find "DELAYED DUE TO" and split points 1), 2), ...
    m = re.search(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', str(text), re.IGNORECASE)
    if m:
        points = re.split(r'\d+\)', m.group(1))
        return [p.strip('.; \n') for p in points if p.strip()]
    return []

def extract_affected_station_switch(point):
    # Try full pattern with station and switch
    m = re.search(r'([A-Z0-9 .\-]+?)\s+S/S(?:.*?)SW\.?NO\.?([0-9]+)', point)
    if m:
        return m.group(1).strip(), m.group(2)
    # Fallback: pattern with only station (no switch)
    m2 = re.search(r'([A-Z0-9 .\-]+?)\s+S/S', point)
    if m2:
        return m2.group(1).strip(), np.nan
    # Fallback: pattern like "DMS OFFLINE AT <STATION>"
    m3 = re.search(r'OFFLINE AT ([A-Z0-9 .\-]+?)\s+S/S', point)
    if m3:
        return m3.group(1).strip(), np.nan
    return np.nan, np.nan


def categorize_reason(point):
    point_l = point.lower()
    cats = []
    if "failed to operate from scada" in point_l or "scada failure" in point_l or "scada offline" in point_l or "dms offline" in point_l:
        cats.append("SCADA FAILURE")
    if "failed to open" in point_l or "failed to close" in point_l or "switch failure" in point_l:
        cats.append("SWITCH FAILURE")
    if "tripping" in point_l or "went offline after tripping" in point_l:
        cats.append("TRIPPING")
    if "fpi malfunction" in point_l or "fpi faulty" in point_l:
        cats.append("FPI FAULT")
    if "cable damaged" in point_l or "cable fault" in point_l:
        cats.append("CABLE FAULT")
    if "access problem" in point_l or "gate opening issue" in point_l:
        cats.append("ACCESS ISSUE")
    if "non dms substation" in point_l:
        cats.append("NON DMS AREA")
    if not cats and "isolation" in point_l:
        cats.append("NORMAL ISOLATION")
    return "+".join(cats) if cats else "UNKNOWN"


all_rows = []

for idx, row in df.iterrows():
    base_info = {
        'SWITCH_NO': row.get('SWITCH_NO'),
        'STATION_NAME': row.get('STATION_NAME'),
        'STD_CABLE_SIZE': row.get('STD_CABLE_SIZE'),
        'CABLE_TYPE': row.get('CABLE_TYPE'),
        'TIME_OUTAGE': row.get('TIME_OUTAGE'),
        'SOURCE_SS': row.get('SOURCE_SS'),
        'FROM_SWITCH': row.get('FROM_SWITCH'),
        'DESTINATION_SS': row.get('DESTINATION_SS'),
        'TO_SWITCH': row.get('TO_SWITCH'),
        'VOLTAGE': row.get('VOLTAGE'),
        'TIME_DIFFERENCE_HOURS': row.get('TIME_DIFFERENCE_HOURS'),
    }
    points = extract_delay_points(row['FREE_REMARKS'])
    if points:
        for p in points:
            affected_station, affected_switch = extract_affected_station_switch(p)
            category = categorize_reason(p)
            this_row = base_info.copy()
            this_row['AFFECTED_STATION'] = affected_station
            this_row['AFFECTED_SWITCH'] = affected_switch
            this_row['REASON_CATEGORY'] = category
            this_row['REASON_TEXT'] = p
            all_rows.append(this_row)
    else:
        # No delay points, keep at least one row
        this_row = base_info.copy()
        this_row['AFFECTED_STATION'] = np.nan
        this_row['AFFECTED_SWITCH'] = np.nan
        this_row['REASON_CATEGORY'] = np.nan
        this_row['REASON_TEXT'] = np.nan
        all_rows.append(this_row)

# Output DataFrame
result_df = pd.DataFrame(all_rows)
result_df.to_csv(output_file, index=False)
print("Done! Output saved to:", output_file)

# --- Extract NO-MATCH CASES ---

no_match_mask = (
    df['SOURCE_SS'].isnull() &
    df['FROM_SWITCH'].isnull() &
    df['DESTINATION_SS'].isnull() &
    df['TO_SWITCH'].isnull()
)
no_match_df = df.loc[no_match_mask, ['FREE_REMARKS', 'VOLTAGE', 'STD_CABLE_SIZE']]
no_match_df.to_csv(no_match_file, index=False)
print("No-match rows saved to:", no_match_file)


Done! Output saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed.csv
No-match rows saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match.csv


In [44]:
# This will show you all rows where at least one field did not get filled (NaN or empty)
missing_pattern_rows = result_df[
    result_df[['AFFECTED_STATION', 'AFFECTED_SWITCH', 'REASON_CATEGORY', 'REASON_TEXT']]
    .isnull().any(axis=1)
    | (result_df['AFFECTED_STATION'].astype(str).str.strip() == "")
    | (result_df['REASON_CATEGORY'].astype(str).str.strip() == "")
    | (result_df['REASON_TEXT'].astype(str).str.strip() == "")
]

print(f"Rows with missing or unmatched pattern: {len(missing_pattern_rows)}")
display(missing_pattern_rows['REASON_TEXT'].head(100))  # See the first 10, for inspection

# Optionally, save them for inspection
# missing_pattern_rows.to_csv("HT_fault_cable_info_missing_pattern.csv", index=False)


Rows with missing or unmatched pattern: 14034


0                                                    NaN
3      KASTURI KUNJ S/S (DMS) WENT OFFLINE AFTER TRIP...
5      AAREY NO.2 S/S (DMS) WENT OFFLINE AFTER GIVING...
6                FPI FAULTY AT AAREY UNIT NO.7 KIOSK S/S
7      NON DMS SUBSTATION ARE IN ARREY AREA. DMS USED...
                             ...                        
155                                                  NaN
156           DMS OFFLINE AT GANESH NAGAR MHADA NO.3 S/S
157                        DMS OFFLINE AT VASTU PARK S/S
159     VASTU PARK S/S (DMS) WENT OFFLINE AFTER TRIPPING
160                      DMS OFFLINE AT VISHAL NAGAR S/S
Name: REASON_TEXT, Length: 100, dtype: object

In [53]:
import pandas as pd
import numpy as np
import re

station_master_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/ss_unique.csv'
switch_master_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/switch.csv'
output_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv'

# Load data
result_df = pd.read_csv(output_file)
station_master = pd.read_csv(station_master_file)
switch_master = pd.read_csv(switch_master_file)

all_station_names = station_master['SOURCE_SS'].dropna().str.upper().str.strip().tolist()
# Assume switch numbers are strings (so leading zeros are preserved)
all_switch_numbers = switch_master['0'].dropna().astype(str).str.strip().tolist()

# Precompile regex patterns
station_patterns = [(name, re.compile(r'\b' + re.escape(name) + r'\b')) for name in all_station_names]
switch_patterns = [(sw, re.compile(r'\b' + re.escape(sw) + r'\b')) for sw in all_switch_numbers]

def match_station_names(text, station_patterns):
    if pd.isnull(text):
        return np.nan
    text_upper = str(text).upper()
    found = [name for name, pattern in station_patterns if pattern.search(text_upper)]
    return '; '.join(found) if found else np.nan

def match_switch_numbers(text, switch_patterns):
    if pd.isnull(text):
        return np.nan
    text_upper = str(text).upper()
    found = [sw for sw, pattern in switch_patterns if pattern.search(text_upper)]
    return '; '.join(found) if found else np.nan

# Apply matching
result_df['AFFECTED_STATION'] = result_df['REASON_TEXT'].apply(lambda x: match_station_names(x, station_patterns))
result_df['AFFECTED_SWITCH'] = result_df['REASON_TEXT'].apply(lambda x: match_switch_numbers(x, switch_patterns))

result_df.to_csv(output_file, index=False)
print("Updated AFFECTED_STATION and AFFECTED_SWITCH with master list matches and saved to:", output_file)


KeyboardInterrupt: 

In [58]:
import pandas as pd
import numpy as np
import re

# File paths
input_file = 'HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv'
output_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv'
no_match_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match2.csv'
station_master_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/ss_unique.csv'
switch_master_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/switch.csv'

# Load master lists
station_master = pd.read_csv(station_master_file)
switch_master = pd.read_csv(switch_master_file)
all_station_names = station_master['SOURCE_SS'].dropna().str.upper().str.strip().tolist()
all_switch_numbers = switch_master['0'].dropna().astype(str).str.strip().tolist()

# Helper fallback matchers
def match_station_names(text, all_station_names):
    if pd.isnull(text): return np.nan
    text_upper = str(text).upper()
    found = [name for name in all_station_names if name in text_upper]
    return '; '.join(found) if found else np.nan

def match_switch_numbers(text, all_switch_numbers):
    if pd.isnull(text):
        return np.nan
    text_upper = str(text).upper()
    found = [sw for sw in all_switch_numbers if sw in text_upper]
    # Convert to int, get max, return as string
    found_digits = [int(sw) for sw in found if sw.isdigit()]
    if found_digits:
        return str(max(found_digits))
    return np.nan


df = pd.read_csv(input_file, low_memory=False)

def extract_remark_info(text):
    if pd.isnull(text):
        return pd.Series([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
    cable_match = re.search(r'(\d{1,4}\s*(?:\+|\s*x\s*)?\s*\d{1,4}(?:\+\d{1,4})*?)\s*SQ\.*\s*MM', text.upper())
    cable_size = cable_match.group(1).replace(' ', '') if cable_match else np.nan
    voltage_match = re.search(r'(\d{2,3}\s*KV)', text.upper())
    voltage = voltage_match.group(1).replace(' ', '') if voltage_match else np.nan
    cable_type_match = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text.upper())
    cable_type = cable_type_match.group(1) if cable_type_match else np.nan
    pattern = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+).*?TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match = pattern.search(text)
    if match:
        src, from_sw, dst, to_sw = match.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    pattern_alt = re.compile(
        r'BETWEEN\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)\s*TO\s+(.*?)\s*(?:S/S)?\s*SWITCH[\:\-\s]*([A-Z0-9_]+)',
        re.IGNORECASE
    )
    match_alt = pattern_alt.search(text)
    if match_alt:
        src, from_sw, dst, to_sw = match_alt.groups()
        return pd.Series([src.strip(), from_sw.strip(), dst.strip(), to_sw.strip(), voltage, cable_size, cable_type])
    return pd.Series([np.nan, np.nan, np.nan, np.nan, voltage, cable_size, cable_type])

df[['SOURCE_SS', 'FROM_SWITCH', 'DESTINATION_SS', 'TO_SWITCH', 'VOLTAGE', 'STD_CABLE_SIZE', 'CABLE_TYPE']] = (
    df['FREE_REMARKS'].apply(extract_remark_info)
)

df['TIME_OUTAGE'] = pd.to_datetime(df['TIME_OUTAGE'], errors='coerce')
df['TIME_RESTORED'] = pd.to_datetime(df['TIME_RESTORED'], errors='coerce')
df['TIME_DIFFERENCE_HOURS'] = (df['TIME_RESTORED'] - df['TIME_OUTAGE']).dt.total_seconds() / 3600

def extract_delay_points(text):
    m = re.search(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', str(text), re.IGNORECASE)
    if m:
        points = re.split(r'\d+\)', m.group(1))
        return [p.strip('.; \n') for p in points if p.strip()]
    return []

def extract_affected_station_switch(point):
    m = re.search(r'([A-Z0-9 .\-]+?)\s+S/S(?:.*?)SW\.?NO\.?([0-9]+)', point)
    if m:
        return m.group(1).strip(), m.group(2)
    m2 = re.search(r'([A-Z0-9 .\-]+?)\s+S/S', point)
    if m2:
        return m2.group(1).strip(), np.nan
    m3 = re.search(r'OFFLINE AT ([A-Z0-9 .\-]+?)\s+S/S', point)
    if m3:
        return m3.group(1).strip(), np.nan
    return np.nan, np.nan

def categorize_reason(point):
    point_l = point.lower()
    cats = []
    if "failed to operate from scada" in point_l or "scada failure" in point_l or "scada offline" in point_l or "dms offline" in point_l:
        cats.append("SCADA FAILURE")
    if "failed to open" in point_l or "failed to close" in point_l or "switch failure" in point_l:
        cats.append("SWITCH FAILURE")
    if "tripping" in point_l or "went offline after tripping" in point_l:
        cats.append("TRIPPING")
    if "fpi malfunction" in point_l or "fpi faulty" in point_l:
        cats.append("FPI FAULT")
    if "cable damaged" in point_l or "cable fault" in point_l:
        cats.append("CABLE FAULT")
    if "access problem" in point_l or "gate opening issue" in point_l:
        cats.append("ACCESS ISSUE")
    if "non dms substation" in point_l:
        cats.append("NON DMS AREA")
    if not cats and "isolation" in point_l:
        cats.append("NORMAL ISOLATION")
    return "+".join(cats) if cats else "UNKNOWN"

all_rows = []

for idx, row in df.iterrows():
    base_info = {
        'SWITCH_NO': row.get('SWITCH_NO'),
        'STATION_NAME': row.get('STATION_NAME'),
        'STD_CABLE_SIZE': row.get('STD_CABLE_SIZE'),
        'CABLE_TYPE': row.get('CABLE_TYPE'),
        'TIME_OUTAGE': row.get('TIME_OUTAGE'),
        'SOURCE_SS': row.get('SOURCE_SS'),
        'FROM_SWITCH': row.get('FROM_SWITCH'),
        'DESTINATION_SS': row.get('DESTINATION_SS'),
        'TO_SWITCH': row.get('TO_SWITCH'),
        'VOLTAGE': row.get('VOLTAGE'),
        'TIME_DIFFERENCE_HOURS': row.get('TIME_DIFFERENCE_HOURS'),
    }
    points = extract_delay_points(row['FREE_REMARKS'])
    if points:
        for p in points:
            affected_station, affected_switch = extract_affected_station_switch(p)
            # Fallback: if not matched, try master list
            if pd.isnull(affected_station) or affected_station == '':
                affected_station = match_station_names(p, all_station_names)
            if pd.isnull(affected_switch) or affected_switch == '':
                affected_switch = match_switch_numbers(p, all_switch_numbers)
            category = categorize_reason(p)
            this_row = base_info.copy()
            this_row['AFFECTED_STATION'] = affected_station
            this_row['AFFECTED_SWITCH'] = affected_switch
            this_row['REASON_CATEGORY'] = category
            this_row['REASON_TEXT'] = p
            all_rows.append(this_row)
    else:
        this_row = base_info.copy()
        this_row['AFFECTED_STATION'] = np.nan
        this_row['AFFECTED_SWITCH'] = np.nan
        this_row['REASON_CATEGORY'] = np.nan
        this_row['REASON_TEXT'] = np.nan
        all_rows.append(this_row)

result_df = pd.DataFrame(all_rows)
result_df.to_csv(output_file, index=False)
print("Done! Output saved to:", output_file)

# --- Extract NO-MATCH CASES ---

no_match_mask = (
    df['SOURCE_SS'].isnull() &
    df['FROM_SWITCH'].isnull() &
    df['DESTINATION_SS'].isnull() &
    df['TO_SWITCH'].isnull()
)
no_match_df = df.loc[no_match_mask, ['FREE_REMARKS', 'VOLTAGE', 'STD_CABLE_SIZE']]
no_match_df.to_csv(no_match_file, index=False)
print("No-match rows saved to:", no_match_file)


Done! Output saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv
No-match rows saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_no_match2.csv
