In [7]:
import pandas as pd

# Harbour Line station/pincode tally.
#
# Reads the student sheet, keeps only rows whose PINCODE belongs to one of the
# Harbour Line stations listed below, counts rows per (STATION, PINCODE), writes
# the counts to a new workbook, and prints a few sanity checks along the way.

# --- Load -------------------------------------------------------------------
# Source workbook; point this at your own copy of the file.
file_path = r"D:\data\harbourline 30-60min TYBSC.xlsx"
df = pd.read_excel(file_path)

# Debug: quick look at what was loaded.
print("Initial data preview:")
print(df.head())

# --- Clean ------------------------------------------------------------------
# PINCODE has to be numeric to be compared with the integer pincodes below;
# values that cannot be parsed become NaN (errors="coerce").
if df["PINCODE"].dtype != "int64":
    df["PINCODE"] = pd.to_numeric(df["PINCODE"], errors="coerce", downcast="integer")

# Remove fully identical rows before counting.
df = df.drop_duplicates()
print(f"Dataset rows after dropping duplicates: {len(df)}")

# --- Station <-> pincode reference data ---------------------------------------
# One entry per pincode; the order here is also the order of the final report.
harbour_station_mapping = [
    {"STATION": station, "PINCODE": pincode}
    for station, pincode in (
        ("Nerul, Seawood Darave", 400706),
        ("Kharghar", 410210),
        ("Panvel, Ulwae, Khandeshwar", 410206),
        ("Koperkhairane", 400709),
        ("Ghansoli", 400701),
        ("Airoli", 400708),
    )
]

# Pincodes of interest, in report order (same sequence as the mapping above).
harbour_line_pincodes = [entry["PINCODE"] for entry in harbour_station_mapping]

# Tabular form of the mapping, used as the right-hand side of the merge.
mapping_df = pd.DataFrame(harbour_station_mapping)

# --- Filter -----------------------------------------------------------------
# Keep only the rows whose pincode is in the Harbour Line list.
filtered_df = df.loc[df["PINCODE"].isin(harbour_line_pincodes)]

# Debug: show what survived the filter.
print("Filtered data preview:")
print(filtered_df.head())
print(f"Rows after filtering by pincodes: {len(filtered_df)}")

# --- Attach station names -----------------------------------------------------
# Left join so every filtered row is kept even if no station matches.
merged_df = filtered_df.merge(mapping_df, on="PINCODE", how="left")

# Debug: rows that did not pick up a station name from the mapping.
unmapped_rows = merged_df.loc[merged_df["STATION"].isna()]
if not unmapped_rows.empty:
    print("Unmapped rows detected:")
    print(unmapped_rows)

# --- Count per station / pincode ----------------------------------------------
# COUNT is the number of rows in each group; DETAILS repeats the station name.
result_df = (
    merged_df.groupby(["STATION", "PINCODE"])
    .agg(
        COUNT=("PINCODE", "size"),
        DETAILS=("STATION", "first"),
    )
    .reset_index()
)

# Arrange the rows in the order of harbour_line_pincodes: rank each pincode by
# its position in that list, sort on the rank, then discard the helper column.
custom_order = dict(zip(harbour_line_pincodes, range(len(harbour_line_pincodes))))
result_df = (
    result_df.assign(ORDER=result_df["PINCODE"].map(custom_order))
    .sort_values(by="ORDER")
    .drop(columns=["ORDER"])
)

# --- Save -------------------------------------------------------------------
output_file = r"D:\data\updated_harbour_station_pincode_counts.xlsx"
result_df.to_excel(output_file, index=False, engine="openpyxl")

# --- Sanity checks ------------------------------------------------------------
# Compare row totals at each stage: deduplicated input, filtered subset, and
# the sum of the per-group counts that were written out.
original_row_count = len(df)
filtered_row_count = len(filtered_df)
output_row_count = result_df["COUNT"].sum()

print(f"Original sheet rows: {original_row_count}")
print(f"Filtered sheet rows: {filtered_row_count}")
print(f"Processed sheet rows: {output_row_count}")

# Did the pincode filter keep every row of the (deduplicated) input?
print(
    "Filtered rows match the original dataset."
    if original_row_count == filtered_row_count
    else "Mismatch between original and filtered rows. Please review the data or the filtering logic."
)

# Do the grouped counts add up to the number of filtered rows?
print(
    "Counts match! Data processed correctly."
    if filtered_row_count == output_row_count
    else "Counts do not match. Please review the data or the mapping."
)

print(f"Filtered and arranged Harbour Line data saved to: {output_file}")


Initial data preview:
                                      INSTITUTE NAME           STUDENT NAME  \
0  Anjuman-I-Islams Institute of Hospitality Mana...  DALVI RUNALI PRASHANT   
1  Anjuman-I-Islams Institute of Hospitality Mana...   KHAN FAHIM MOHD AYUB   
2      Apeejay Institute of Hospitality, Navi Mumbai   SAHI HRITIK INDERJIT   
3      Apeejay Institute of Hospitality, Navi Mumbai   SONAWANE ROHIT NITIN   
4                      B. P. Marine Academy, Belapur     ADITYA DILIP PATIL   

   GENDER                                            ADDRESS  PINCODE  \
0  Female  Sai Srushti Apt Room No 201,Plot No 271 Nerul ...   400706   
1    Male  Plot No B32 Shop No6,7 Ekta CHS Seawood Sector 23   400706   
2    Male             B-3/402  Madhavi CHS Nerul Navi Mumbai   400706   
3    Male  A-11 GANGOTRI ROOM NO 1/17 SECTOR 18  NERUL SE...   400706   
4    Male  ROOM NO:303 OM SADGURU CHS A-10 SEC 24 NERUL W...   400706   

       MOBILE                         EMAIL                     