CHECK THE DATA 

In [24]:
import csv

file_path = '/media/sagarkumar/New Volume/SAGAR/Book 2(Sheet1).csv'
linebreak_count = 0
ctrl_count = 0

# newline='' hands the raw line endings to the csv module, as the csv docs
# require. Without it, universal-newline translation rewrites '\r' and '\r\n'
# to '\n' before csv.reader sees them, so the '\r' test below can never fire
# and the printed repr() does not show the characters really stored in the file.
with open(file_path, 'r', encoding='utf-8', errors='replace', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        for j, cell in enumerate(row):
            # A line break that survives parsing lives inside a quoted field.
            if '\n' in cell or '\r' in cell:
                print(f'Linebreak found in row {i+1}, column {j+1}: {repr(cell)}')
                linebreak_count += 1
            # Any ASCII control char except tab (9), LF (10) and CR (13)
            if any(ord(c) < 32 and ord(c) not in (9, 10, 13) for c in cell):
                print(f'Ctrl char found in row {i+1}, column {j+1}: {repr(cell)}')
                ctrl_count += 1


print(f'Total linebreak cells found: {linebreak_count}')
print(f'Total control-char cells found: {ctrl_count}')


Total linebreak cells found: 0
Total control-char cells found: 0


In [25]:
import csv

# Count the fields of each record with the csv module instead of counting
# commas: a comma inside a quoted field is data, not a delimiter, and
# line.count(',') would report such rows as having too many columns.
# The first record sets the expected width. Row numbers are record numbers, so
# a record spanning several physical lines is counted once.
# NOTE: file_path comes from the cell above.
with open(file_path, 'r', encoding='utf-8', newline='') as f:
    expected = None
    for i, row in enumerate(csv.reader(f)):
        cols = len(row)
        if i == 0:
            expected = cols
        elif cols != expected:
            print(f"Row {i+1} has {cols} columns (Expected: {expected})")
      

In [26]:
import pandas as pd
import re

# Read the file as plain text lines; nothing is parsed as CSV here.
filename = "/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv"
with open(filename, encoding='utf-8', errors='ignore') as raw_file:
    lines = raw_file.readlines()

print("Total lines:", len(lines))

# Pass 1: stray line breaks and tab/control characters (most likely to occur in
# the 'FREE_REMARKS' column).
for line_no, text in enumerate(lines, start=1):
    # Every line carries one '\n' at its end; more than one would mean an
    # internal break.
    # NOTE(review): readlines() already splits at line endings, so a single
    # element is not expected to contain more than one terminator.
    has_extra_breaks = text.count('\n') > 1 or text.count('\r') > 1
    if has_extra_breaks:
        print(f"Line {line_no}: Multiple linebreaks in a single line")

    # Tab, vertical tab, form feed or ESC anywhere in the line.
    if re.search(r"[\t\x0b\x0c\x1b]", text) is not None:
        print(f"Line {line_no}: Contains tab or control char")

# Pass 2: an odd number of double quotes on a line means a quote is left open.
for line_no, text in enumerate(lines, start=1):
    quote_total = text.count('"')
    if quote_total % 2:
        print(f"Line {line_no}: Unmatched double quote")

# Pass 3: compare each line's comma count with the most frequent comma count.
col_counts = [text.count(',') for text in lines]
mode_col_count = max(set(col_counts), key=col_counts.count)
for line_no, comma_total in enumerate(col_counts, start=1):
    if comma_total == mode_col_count:
        continue
    print(f"Line {line_no}: {comma_total} columns (Expected: {mode_col_count})")


Total lines: 11058


CLEANING THE DATA

In [16]:
import csv
import re

# CSV that is read cell by cell for cleaning.
input_file_path = '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1.csv'
# Destination of the cleaned copy; it is opened in 'w' mode, so an existing file is overwritten.
output_file_path = '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv'

def clean_cell(cell):
    """Return ``cell`` with control characters stripped and line breaks flattened.

    ASCII control characters other than tab, LF and CR are deleted first.
    Each remaining LF or CR is then replaced by one space, so a CRLF pair
    becomes two spaces.
    """
    without_ctrl = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', cell)
    return without_ctrl.translate({ord('\n'): ' ', ord('\r'): ' '})

# newline='' on both handles leaves all line-ending handling to the csv module.
with open(input_file_path, 'r', encoding='utf-8', errors='replace', newline='') as infile, \
     open(output_file_path, 'w', encoding='utf-8', errors='replace', newline='') as outfile:
    writer = csv.writer(outfile)
    # Stream the rows: every cell goes through clean_cell, then the row is
    # written straight back out.
    writer.writerows(
        [clean_cell(cell) for cell in row]
        for row in csv.reader(infile)
    )

print(f"Cleaned file written to {output_file_path}")


Cleaned file written to /media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv


PROCESSING THE DATA

In [None]:
import pandas as pd
import re

# 1. Read the final cleaned log sheet.
df_original = pd.read_csv(
    '/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_cleaned.csv',
    low_memory=False,
)

# 2. Keep only the rows with ENTRY_TYPE == 1, renumbered from 0.
is_entry_type_1 = df_original['ENTRY_TYPE'] == 1
df_entry1 = df_original.loc[is_entry_type_1].copy().reset_index(drop=True)

# 3. Upper-cased text copy of the remarks for the regex helpers.
#    astype(str) also turns missing values into text before upper-casing.
df_entry1 = df_entry1.assign(
    FREE_REMARKS_UPPER=df_entry1['FREE_REMARKS'].astype(str).str.upper()
)

# 4. Extraction functions
def extract_size(text):
    """Return the size token that follows an ``X`` (e.g. ``120SQ.MM``), or ``""``.

    The captured run may contain digits, ``.``, ``+``, ``-`` and whitespace,
    followed by ``SQ``/``SQ.`` and ``MM``. Spaces are removed from the result.
    """
    size_match = re.search(r'X\s*([\d\.\+\-\s]*SQ\.?\s*MM)', text)
    if size_match is None:
        return ""
    return size_match.group(1).replace(" ", "")

def extract_insulation(text):
    """Return the first insulation keyword found in ``text``, or ``""``.

    Recognised values are ``PILC+XLPE``, ``XLPE+PILC``, ``PILC`` and ``XLPE``.
    The combined forms are listed first so they win over their parts.
    """
    insulation_match = re.search(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text)
    if insulation_match is None:
        return ""
    return insulation_match.group(1)

def extract_voltage(text):
    """Return the first ``<2-3 digits> KV`` value as ``"<digits>KV"``, or ``""``.

    Whitespace between the number and ``KV`` is dropped in the result.
    """
    voltage_match = re.search(r'(\d{2,3})\s*KV', text)
    if voltage_match is None:
        return ""
    return f"{voltage_match.group(1)}KV"

def extract_type(text):
    """Return the first section-type phrase found in ``text``, or ``""``.

    Recognised phrases: ``HTCF SECTION``, ``LT SECTION`` and ``HT SECTION``.
    """
    type_match = re.search(r'(HTCF SECTION|LT SECTION|HT SECTION)', text)
    if type_match is None:
        return ""
    return type_match.group(1)

def extract_section(text):
    """Split a ``BETWEEN <a> S/S SWITCH:<n> TO <b> S/S SWITCH:<m>`` phrase.

    Returns a 4-element ``pd.Series``: from-name, from-switch, to-name and
    to-switch. The two names are stripped of surrounding whitespace. When the
    phrase is absent, the Series holds four empty strings.
    """
    section_match = re.search(
        r'BETWEEN\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)',
        text,
    )
    if section_match is None:
        return pd.Series(["", "", "", ""])
    from_name, from_switch, to_name, to_switch = section_match.groups()
    return pd.Series([from_name.strip(), from_switch, to_name.strip(), to_switch])

def extract_delayed_reason(text):
    """Return the text that follows ``DELAYED DUE TO``, or ``""`` if absent.

    The capture stops at ``NOTIFICATION NO`` when that appears, otherwise at
    the end of the string. Surrounding whitespace is stripped.
    """
    reason_match = re.search(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', text)
    if reason_match is None:
        return ""
    return reason_match.group(1).strip()

def extract_fault_nature(text):
    """Collect every fault phrase in ``text`` and join the hits with ``'; '``.

    Each known phrase is searched in list order. A hit runs from the phrase
    up to (not including) the next ``.``, ``,``, ``;``, ``NOTIFICATION NO``
    or the end of the string. Returns ``""`` when nothing is found.
    """
    phrases = (
        'CABLE DAMAGED BY', 'DMS OFFLINE AT', 'FPI MALFUNCTION',
        'FAILED TO OPEN', 'FAILED TO CLOSE', 'WENT OFFLINE',
        'FEEDER TRIPPING', 'TRAFFIC ON', 'SUPPLY RESTORATION DELAYED',
    )
    fragments = [
        hit.group(1).strip()
        for phrase in phrases
        for hit in re.finditer(
            r'({0}.*?)(?:\.|,|;|NOTIFICATION NO|$)'.format(re.escape(phrase)), text
        )
    ]
    # Joining an empty list yields "", the "nothing found" result.
    return '; '.join(fragments)

# 5. Build the output frame from the upper-cased remarks.
remarks_upper = df_entry1['FREE_REMARKS_UPPER']

# Single-value fields: one regex helper per output column.
out = pd.DataFrame({
    'Size': remarks_upper.apply(extract_size),
    'Insulation': remarks_upper.apply(extract_insulation),
    'Voltage': remarks_upper.apply(extract_voltage),
    'Type': remarks_upper.apply(extract_type),
})

# extract_section returns four values per row; name them and append them.
section_cols = remarks_upper.apply(extract_section)
section_cols.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, section_cols], axis=1)

out['DELAYED_REASON'] = remarks_upper.apply(extract_delayed_reason)
out['FAULT_NATURE'] = remarks_upper.apply(extract_fault_nature)

# Time columns: TIME_OUTAGE stays text, the other two are parsed
# (unparseable values become NaT).
reported_at = pd.to_datetime(df_entry1['MAIN_REPORTED_TIME'], errors='coerce')
restored_at = pd.to_datetime(df_entry1['TIME_RESTORED'], errors='coerce')
out['TIME_OUTAGE'] = df_entry1['TIME_OUTAGE'].astype(str)
out['MAIN_REPORTED_TIME'] = reported_at
out['TIME_RESTORED'] = restored_at
# Minutes from the reported time to restoration.
out['TIME_DIFFERENCE'] = (restored_at - reported_at).dt.total_seconds() / 60

# 6. Save the selected columns in a fixed order and preview them.
final_cols = [
    'Size', 'Insulation', 'Voltage', 'Type',
    'FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH',
    'DELAYED_REASON', 'FAULT_NATURE',
    'TIME_OUTAGE', 'MAIN_REPORTED_TIME', 'TIME_RESTORED', 'TIME_DIFFERENCE'
]
selected = out[final_cols]
selected.to_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_FINAL.csv', index=False)
print("Extracted file written.")
print(selected.head(10))


ANALYZE ONLY THE FREE_REMARKS COLUMN


In [30]:
import pandas as pd
import re

# 1. Read only the FREE_REMARKS column.
# 2. Add an upper-cased text copy so the regex helpers can match on one case.
df = pd.read_csv(
    '/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_FREE_REMARKS_ONLY.csv',
    usecols=['FREE_REMARKS'],
    low_memory=False,
).assign(
    FREE_REMARKS_UPPER=lambda frame: frame['FREE_REMARKS'].astype(str).str.upper()
)

# 3. Extraction functions
def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, else ``None``."""
    found = re.search(pattern, text)
    return None if found is None else found.group(1)


def extract_size(text):
    """Size token after an ``X`` (e.g. ``120SQ.MM``) with spaces removed, or ``""``."""
    captured = _first_group(r'X\s*([\d\.\+\-\s]*SQ\.?\s*MM)', text)
    return "" if captured is None else captured.replace(" ", "")


def extract_insulation(text):
    """First of ``PILC+XLPE``, ``XLPE+PILC``, ``PILC``, ``XLPE`` in ``text``, or ``""``."""
    captured = _first_group(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)', text)
    return "" if captured is None else captured


def extract_voltage(text):
    """First ``<2-3 digits> KV`` value, returned as ``"<digits>KV"``, or ``""``."""
    captured = _first_group(r'(\d{2,3})\s*KV', text)
    return "" if captured is None else f"{captured}KV"


def extract_type(text):
    """First of ``HTCF SECTION``, ``LT SECTION``, ``HT SECTION`` in ``text``, or ``""``."""
    captured = _first_group(r'(HTCF SECTION|LT SECTION|HT SECTION)', text)
    return "" if captured is None else captured


def extract_section(text):
    """Parse ``BETWEEN <a> S/S SWITCH:<n> TO <b> S/S SWITCH:<m>``.

    Returns a 4-element ``pd.Series`` (from-name, from-switch, to-name,
    to-switch). All four are empty strings when the phrase is not present.
    """
    found = re.search(
        r'BETWEEN\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SWITCH:([0-9]+)',
        text,
    )
    if found is None:
        return pd.Series(["", "", "", ""])
    src_name, src_switch, dst_name, dst_switch = found.groups()
    return pd.Series([src_name.strip(), src_switch, dst_name.strip(), dst_switch])


def extract_delayed_reason(text):
    """Text after ``DELAYED DUE TO`` up to ``NOTIFICATION NO`` or end of string, stripped."""
    captured = _first_group(r'DELAYED DUE TO(.*?)(?:NOTIFICATION NO|$)', text)
    return "" if captured is None else captured.strip()


def extract_fault_nature(text):
    """Join every fault phrase found in ``text`` with ``'; '``.

    Phrases are searched in list order. Each hit runs from the phrase up to
    the next ``.``, ``,``, ``;``, ``NOTIFICATION NO`` or the end of the string.
    """
    known_phrases = [
        'CABLE DAMAGED BY', 'DMS OFFLINE AT', 'FPI MALFUNCTION',
        'FAILED TO OPEN', 'FAILED TO CLOSE', 'WENT OFFLINE',
        'FEEDER TRIPPING', 'TRAFFIC ON', 'SUPPLY RESTORATION DELAYED'
    ]
    hits = []
    for phrase in known_phrases:
        phrase_regex = r'({0}.*?)(?:\.|,|;|NOTIFICATION NO|$)'.format(re.escape(phrase))
        hits.extend(found.group(1).strip() for found in re.finditer(phrase_regex, text))
    return '; '.join(hits)

# 4. Run every extractor over the upper-cased remarks.
remarks_upper = df['FREE_REMARKS_UPPER']

out = pd.DataFrame({
    'Size': remarks_upper.apply(extract_size),
    'Insulation': remarks_upper.apply(extract_insulation),
    'Voltage': remarks_upper.apply(extract_voltage),
    'Type': remarks_upper.apply(extract_type),
})

# extract_section yields four values per row; name them and append them.
section_cols = remarks_upper.apply(extract_section)
section_cols.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, section_cols], axis=1)

out['DELAYED_REASON'] = remarks_upper.apply(extract_delayed_reason)
out['FAULT_NATURE'] = remarks_upper.apply(extract_fault_nature)

# 5. Write the result and preview the first rows.
out.to_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv', index=False)
print("Extracted file written.")
print(out.head(10))


Extracted file written.
                        Size Insulation Voltage          Type  \
0                                         220KV                 
1                                                               
2                      SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
3            .06+70+120SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
4  0.15+185+225+300+240SQ.MM  PILC+XLPE    11KV  HTCF SECTION   
5                   120SQ.MM       PILC    11KV  HTCF SECTION   
6                   120SQ.MM       PILC    11KV  HTCF SECTION   
7                   120SQ.MM       PILC    11KV  HTCF SECTION   
8                   120SQ.MM       PILC    11KV  HTCF SECTION   
9                   120SQ.MM       PILC    11KV  HTCF SECTION   

                       FROM FROM_SWITCH                      TO TO_SWITCH  \
0                                                                           
1                                                                           
2  VISHWESHWAR NAGAR HETALI  

In [9]:
#!/usr/bin/env python3
# -----------------------------------------------------------
#  Pull structured fields out of HT-cable FREE_REMARKS strings
# -----------------------------------------------------------
import re
import pandas as pd

# --- Edit these two paths ----------------------------------------------------
SRC  = r'/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/Book1.csv'   # must contain a FREE_REMARKS column
DEST = r'/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv'
# -----------------------------------------------------------------------------

# 1) Load and normalise ----------------------------------------------------------
# Missing remarks become '' so the .str accessor sees text; TXT is the
# upper-cased copy that every regex below runs against.
remarks_raw = pd.read_csv(SRC, usecols=['FREE_REMARKS'], low_memory=False).fillna('')
df_txt = remarks_raw.assign(TXT=remarks_raw['FREE_REMARKS'].str.upper())

# 2) Pre-compile the regexes -----------------------------------------------------
# Each pattern is compiled once and reused for every row of df_txt['TXT'].

# Size: an 'X' at a word boundary, then digits / '+' / '-' / '.' / whitespace,
# ending in 'SQ' (optional '.') and 'MM'.
# NOTE(review): '\b' needs a non-word character (or start of text) before the
# 'X', so an 'X' glued to a preceding digit or letter (e.g. "3X120SQ.MM") does
# not match here -- confirm that is intended.
RE_SIZE        = re.compile(r'\bX\s*([\d+\-.\s]*SQ\.?\s*MM)')            # 120SQ.MM  0.15+185+240SQ.MM
# Insulation keyword; the combined forms come first so they win over their parts.
RE_INSULATION  = re.compile(r'(PILC\+XLPE|XLPE\+PILC|PILC|XLPE)')
# Two or three digits followed (after optional whitespace) by 'KV'.
RE_VOLTAGE     = re.compile(r'(\d{2,3})\s*KV')
# One of three fixed section-type phrases.
RE_TYPE        = re.compile(r'(HTCF SECTION|LT SECTION|HT SECTION)')
# Four groups: from-name, from-switch number, to-name, to-switch number.
# NOTE(review): 'SW\.?ITCH' accepts only "SWITCH" and "SW.ITCH"; abbreviations
# such as "SW." or "SW NO" are not covered -- confirm against the data.
RE_SECTION     = re.compile(
    r'BETWEEN\s+(.+?)\s+S/S\s+SW\.?ITCH:?\s*([0-9]+)\s+TO\s+(.+?)\s+S/S\s+SW\.?ITCH:?\s*([0-9]+)',
    flags=re.I
)
# Reason text after "DELAYED AS" / "DELAYED DUE TO" (optionally preceded by
# "ISOLATION & RESTORATION"), captured lazily up to "NOTIFICATION NO" or the
# end of the string.
RE_DELAYED     = re.compile(
    r'(?:ISOLATION\s*&?\s*RESTORATION\s+)?DELAYED (?:AS|DUE TO)\s*(.*?)(?:NOTIFICATION NO|$)',
    flags=re.I
)

# Phrases that start a fault description. Near-duplicates (shorter variant,
# misspelling) are listed on purpose so variants in the data are still caught.
FAULT_TOKENS = [
    'CABLE DAMAGED BY', 'CABLE DAMAGED',     # covers slightly shorter variant
    'DMS OFFLINE AT', 'WENT OFFLINE',
    'FPI MALFUNCTION', 'FPI FAULTY',
    'FAILED TO OPEN',  'FAILED TO CLOSE',
    'FEEDER TRIPPING', 'TRAFFIC ON',
    'SUPPLY RESTORTION DELAYED', 'SUPPLY RESTORATION DELAYED'
]
# One big alternation for speed. The tokens sit in their own non-capturing
# group so the lazy '.*?' tail applies to every token. '|' has the lowest
# precedence, so without that group the tail belongs to the last token only,
# and every other token matches only when a terminator follows it directly.
# Group 1 = token plus the text up to the next '.', ',', ';',
# 'NOTIFICATION NO' or the end of the string.
RE_FAULT = re.compile(
    '((?:' + '|'.join(re.escape(tok) for tok in FAULT_TOKENS) + r').*?)(?:\.|,|;|NOTIFICATION NO|$)',
    flags=re.I
)

# 3) Utility ---------------------------------------------------------------------
def _first(regex, text, fmt=lambda m: m.group(1)):
    m = regex.search(text)
    return fmt(m) if m else ''

def _faults(text: str) -> str:
    """Return every RE_FAULT hit in ``text`` (group 1, stripped) joined by ``'; '``.

    The result is ``''`` when RE_FAULT does not match anywhere.
    """
    pieces = [hit.group(1).strip() for hit in RE_FAULT.finditer(text)]
    return '; '.join(pieces)

# 4) Column extraction -----------------------------------------------------------
txt = df_txt['TXT']

# Output column -> per-text extractor, in output order.
extractors = {
    'Size'       : lambda t: _first(RE_SIZE, t, lambda m: m.group(1).replace(' ', '')),
    'Insulation' : lambda t: _first(RE_INSULATION, t),
    'Voltage'    : lambda t: _first(RE_VOLTAGE, t, lambda m: f"{m.group(1)}KV"),
    'Type'       : lambda t: _first(RE_TYPE, t),
}
out = pd.DataFrame({column: txt.apply(extract) for column, extract in extractors.items()})

# --- FROM / TO / SWITCH columns -------------------------------------------------
# str.extract yields one column per capture group of RE_SECTION.
sec = txt.str.extract(RE_SECTION)
sec.columns = ['FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH']
out = pd.concat([out, sec], axis=1)

# --- delay / fault --------------------------------------------------------------
out['DELAYED_REASON'] = txt.apply(lambda t: _first(RE_DELAYED, t).strip())
out['FAULT_NATURE']   = txt.apply(_faults)

# 5) Final tidy-up & write -------------------------------------------------------
column_order = [
    'Size', 'Insulation', 'Voltage', 'Type',
    'FROM', 'FROM_SWITCH', 'TO', 'TO_SWITCH',
    'DELAYED_REASON', 'FAULT_NATURE',
]
out = out[column_order]

out.to_csv(DEST, index=False)
print(f'✓ Parsed {len(out):,} records  →  {DEST}')
print(out.head(10).to_markdown())


✓ Parsed 24,228 records  →  /media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/processed_fault_data_ONLY_FREEREMARKS.csv
|    | Size                      | Insulation   | Voltage   | Type         | FROM                     |   FROM_SWITCH | TO                     |   TO_SWITCH | DELAYED_REASON                                                                                                                                                                                                                                                                                                                                                                    | FAULT_NATURE   |
|---:|:--------------------------|:-------------|:----------|:-------------|:-------------------------|--------------:|:-----------------------|------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------

ANALYZE THE DESCRIPTION COLUMN 

In [1]:
# How many DESCRIPTION values contain the whole word "SWITCH" (any case)?
import pandas as pd

df = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/DESCRIPTION.csv', low_memory=False)
# na=False makes missing descriptions count as "no match".
has_switch = df['DESCRIPTION'].str.contains(r'\bSWITCH\b', case=False, na=False)
rows_with_switch = has_switch.sum()
print(f'Number of rows containing the word "SWITCH": {rows_with_switch}')

Number of rows containing the word "SWITCH": 21584


In [2]:
df.shape[0]  # Total number of rows in the DataFrame

23939

In [8]:

# Rows whose DESCRIPTION contains "11 KV" (with a space, any case).
# NOTE(review): the count is stored under the name rows_with_switch, so from
# here on that name holds the 11 KV count, not the SWITCH count.
mentions_11kv = df['DESCRIPTION'].str.contains(r'\b11 KV\b', case=False, na=False)
rows_with_switch = mentions_11kv.sum()
print(f'Number of rows containing the word "11kV": {rows_with_switch}')

Number of rows containing the word "11kV": 19598


In [9]:


# Same whole-phrase, case-insensitive count for "22 KV" and "33 KV".
description = df['DESCRIPTION']
rows_with_switch_22 = description.str.contains(r'\b22 KV\b', case=False, na=False).sum()
row_with_33 = description.str.contains(r'\b33 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "22kV": {rows_with_switch_22}')
print(f'Number of rows containing the word "33kV": {row_with_33}')


Number of rows containing the word "22kV": 627
Number of rows containing the word "33kV": 1194


In [10]:
# rows_with_switch was reassigned to the "11 KV" count in an earlier cell, so
# this adds the three voltage counts; the SWITCH count is not part of it.
# A row that mentions more than one voltage is counted once per voltage.
total = rows_with_switch + row_with_33 + rows_with_switch_22
print(f'Sum of the "11 KV", "22 KV" and "33 KV" row counts: {total}')

Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": 21419


In [12]:
# How many FREE_REMARKS values contain the whole word "SWITCH" (any case)?
import pandas as pd

df = pd.read_csv('/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/TXN_NMS_HTLOGSHEET_FREE_REMARKS_ONLY_clean.csv', low_memory=False)
# na=False makes missing remarks count as "no match".
has_switch = df['FREE_REMARKS'].str.contains(r'\bSWITCH\b', case=False, na=False)
rows_with_switch = has_switch.sum()
print(f'Number of rows containing the word "SWITCH": {rows_with_switch}')

Number of rows containing the word "SWITCH": 21749


In [14]:

# Whole-phrase, case-insensitive counts of "11 KV", "22 KV" and "33 KV" in FREE_REMARKS.
# NOTE(review): the 11 KV count is stored under the name rows_with_switch, so
# from here on that name holds the 11 KV count, not the SWITCH count.
remarks = df['FREE_REMARKS']

rows_with_switch = remarks.str.contains(r'\b11 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "11kV": {rows_with_switch}')

rows_with_switch_22 = remarks.str.contains(r'\b22 KV\b', case=False, na=False).sum()
row_with_33 = remarks.str.contains(r'\b33 KV\b', case=False, na=False).sum()
print(f'Number of rows containing the word "22kV": {rows_with_switch_22}')
print(f'Number of rows containing the word "33kV": {row_with_33}')


Number of rows containing the word "11kV": 19777
Number of rows containing the word "22kV": 680
Number of rows containing the word "33kV": 1294


In [16]:
# rows_with_switch was reassigned to the "11 KV" count in an earlier cell, so
# this adds the three voltage counts; the SWITCH count is not part of it.
# A row that mentions more than one voltage is counted once per voltage.
total = rows_with_switch + row_with_33 + rows_with_switch_22
print(f'Sum of the "11 KV", "22 KV" and "33 KV" row counts: {total}')

Total number of rows containing "SWITCH", "11kV", "22kV", or "33kV": 21751


Matching the FEEDER_ID with the SWITCH_NO column of the cleaned fault data

In [35]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1 Details
# CSV and column that supply the first set of identifiers.
# The column name is matched against the file header ignoring case.
file1_path = "/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv"
file1_column_name = "SWITCH_NO"

# File 2 Details
# CSV and column that supply the second set; it is intersected with the first.
file2_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv"
file2_column_name = "FEEDER_ID"



# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Return the entry of ``columns`` that equals ``target_name`` ignoring case.

    Both sides are compared as lower-cased strings. The first matching entry
    is returned unchanged; ``None`` is returned when nothing matches.
    """
    wanted = str(target_name).lower()
    for col in columns:
        if str(col).lower() == wanted:
            return col
    return None


def get_unique_values_from_file(filepath, column_name):
    """Return the unique integer identifiers found in one CSV column.

    Args:
        filepath: Path of the CSV file to read.
        column_name: Column to read; matched against the header ignoring case.

    Returns:
        A set of ints. Each non-missing value is converted to text and its
        first run of digits is taken (so ``'SW-123'`` and ``123`` both give
        ``123``). Values without any digit are dropped. An empty set is
        returned when the column or the file is missing, or when any other
        error occurs; a message is printed in each of those cases.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read just the header to find the correct column name (case-insensitive)
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip')
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        # Test for None explicitly: it is the "not found" result of the lookup.
        if actual_col_name is None:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Read the full column using the correct name
        df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip')

        # --- Data Cleaning ---
        # Values as text -> first run of digits -> numbers. This unifies mixed
        # types ('123' vs 123) and strips text prefixes ('SW-123').
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        # Raw string: '\d' in a normal string literal is an invalid escape sequence.
        s = s.str.extract(r'(\d+)', expand=False)
        s = pd.to_numeric(s, errors='coerce')

        cleaned_values = set(s.dropna().astype(int))

        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see an
        # empty set.
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Collect the cleaned identifier sets of both files.
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if unique_values_from_file1 and unique_values_from_file2:
    # Identifiers present in both sets.
    matching_values = unique_values_from_file1 & unique_values_from_file2
    separator = "-" * 25

    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print(separator)
    print(f"Number of values that matched: {len(matching_values)}")
    print(separator)
else:
    # An empty set means the file failed to load or held no usable values.
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
 

Processing file: /media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv...
  - Found 1268 unique, clean values.
Processing file: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv...
  - Found 945 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagarkumar/New Volume/SAGAR/HTLOGSHEET_CLEAN_enetrytype_1_cleaned.csv' (Column: SWITCH_NO): 1268
Unique values in '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT.csv' (Column: FEEDER_ID): 945
-------------------------
Number of values that matched: 824
-------------------------


In [32]:
# Sort the matched identifiers and store them as a one-column CSV.
matching_values_list = sorted(matching_values)
# NOTE(review): only this heading is printed; the values themselves are not.
print("Matching values (sorted):")
output_file_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/order_id.csv"
# The column is named after file 1's column (file1_column_name).
matching_values_df = pd.DataFrame({file1_column_name: matching_values_list})
matching_values_df.to_csv(output_file_path, index=False)
print(f"Matching values saved to: {output_file_path}")

Matching values (sorted):
Matching values saved to: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/order_id.csv


ORDER_ID VS SWITCH_NO: 89 ORDER_ID VALUES MATCH SWITCH_NO, BUT 99 ORDER_ID VALUES MATCH FEEDER_ID

In [61]:
# Values of matching_values_list that also occur in file 1's unique set.
match = unique_values_from_file1 & set(matching_values_list)
print(f"Number of values that matched: {len(match)}")

print(f"Total matching values: {len(matching_values_list)}")
match_list = sorted(match)

Number of values that matched: 89
Total matching values: 99


In [57]:
# Find values in matching_values_list that are NOT in match.
# The set difference below is (matching_values_list - match), so the label
# must name the operands in that order.
diff_match_vs_matching_values_list = sorted(set(matching_values_list) - set(match))
print("Values in 'matching_values_list' but not in 'match':", diff_match_vs_matching_values_list)
print(f"Count: {len(diff_match_vs_matching_values_list)}")

Values in 'match' but not in 'matching_values_list': [26332, 28192, 29986, 30467, 30668, 32164, 32529, 32530, 34678, 39411]
Count: 10


In [None]:
import re
import pandas as pd

# Load the data and make FREE_REMARKS plain text: missing values become ''.
df = pd.read_csv('your_input_file.csv', low_memory=False).assign(
    FREE_REMARKS=lambda frame: frame['FREE_REMARKS'].fillna('').astype(str)
)

# Regex patterns
# RE_BETWEEN (verbose mode): "BETWEEN <src> S/S <switch label> <number> TO
# <dst> S/S <switch label> <number>". The switch label is either "SWITCH" or
# an "SW NO" abbreviation, optionally followed by ':' and/or whitespace.
RE_BETWEEN = re.compile(
    r'''BETWEEN\s+
        (?P<src>.+?)\s+S/S\s+(?:SWITCH|SW\.?\s*NO\.?)\s*[:\s]*(?P<src_sw>\d+)
        \s+TO\s+
        (?P<dst>.+?)\s+S/S\s+(?:SWITCH|SW\.?\s*NO\.?)\s*[:\s]*(?P<dst_sw>\d+)
    ''', flags=re.I|re.X)

# Two or three digits followed (after optional whitespace) by "KV".
RE_VOLTAGE = re.compile(r'(\d{2,3})\s*KV', re.I)

def extract_info(row):
    """Pull section end points and cable voltage out of ``row['FREE_REMARKS']``.

    The remark is upper-cased before matching. Returns a ``pd.Series`` with
    the keys SOURCE_STATION, FROM_SWITCH, DESTINATION_STATION, TO_SWITCH and
    CABLE_VOLTAGE. Any field that cannot be found is an empty string.
    """
    txt = row['FREE_REMARKS'].upper()

    # Start from "nothing found" and fill in what the patterns recover.
    fields = {
        'SOURCE_STATION': '',
        'FROM_SWITCH': '',
        'DESTINATION_STATION': '',
        'TO_SWITCH': '',
    }
    between = RE_BETWEEN.search(txt)
    if between is not None:
        fields['SOURCE_STATION'] = between.group('src').strip()
        fields['FROM_SWITCH'] = between.group('src_sw').strip()
        fields['DESTINATION_STATION'] = between.group('dst').strip()
        fields['TO_SWITCH'] = between.group('dst_sw').strip()

    # Voltage is reported as "<digits> KV".
    kv = RE_VOLTAGE.search(txt)
    fields['CABLE_VOLTAGE'] = '' if kv is None else kv.group(1) + ' KV'
    return pd.Series(fields)

# Row-wise apply: extract_info is called once per row (axis=1).
extracted = df.apply(extract_info, axis=1)

# Put the pass-through columns and the extracted fields side by side.
passthrough_cols = ['SWITCH_NO', 'STATION_NAME', 'TIME_OUTAGE', 'TIME_RESTORED', 'MAIN_REPORTED_TIME']
result = pd.concat([df[passthrough_cols], extracted], axis=1)

# Parse the three time columns; values that cannot be parsed become NaT.
for time_col in ('TIME_OUTAGE', 'TIME_RESTORED', 'MAIN_REPORTED_TIME'):
    result[time_col] = pd.to_datetime(result[time_col], errors='coerce')

# Hours from MAIN_REPORTED_TIME to TIME_RESTORED, rounded to 2 decimals.
elapsed = result['TIME_RESTORED'] - result['MAIN_REPORTED_TIME']
result['OUTAGE_DURATION_HRS'] = (elapsed.dt.total_seconds() / 3600).round(2)

# Save output
result.to_csv('analyzed_HTcable_faults.csv', index=False)
print(result.head(12))
