In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import duckdb

# Optional (left disabled): switch into the project folder before running the notebook.
# os.chdir(r"C:\Users\siddu\Desktop\Decision Science Track\Revision")
# Show the current working directory; relative file names used later
# (e.g. the exported report workbook) resolve against it.
print(os.getcwd())

In [None]:
# Locations of the merged train / test parquet files.
# NOTE(review): absolute, machine-specific paths - confirm they exist where this runs.
train_data_fp, test_data_fp = (
    r"C:\Users\siddu\Desktop\Decision Science Track\Revision\train_data_merged.parquet",
    r"C:\Users\siddu\Desktop\Decision Science Track\Revision\test_data_merged.parquet",
)

# Read each parquet file into its own DataFrame.
train_data_df = pd.read_parquet(path=train_data_fp)
test_data_df = pd.read_parquet(path=test_data_fp)

In [None]:
# Report (rows, columns) for the train frame, then for the test frame.
print(train_data_df.shape, test_data_df.shape, sep="\n")

In [None]:
# Check consistency of datatypes of train and test
from collections import defaultdict

## 1. Map a readable name to each DataFrame under comparison
dataframes_to_check = {
    'train': train_data_df,
    'test': test_data_df
}

## 2. Union of the column names of all datasets (sorted for a stable, readable report)
print("Finding all unique columns across datasets...")
all_columns_set = set()
for df in dataframes_to_check.values():
    all_columns_set.update(df.columns)

cols_to_check = sorted(all_columns_set)
print(f"Found {len(cols_to_check)} unique columns to check.")

## 3. Per column, collect the dtypes seen AND the datasets that lack the column.
# Tracking absence matters: a column that exists in only one dataset trivially has a
# single dtype, so without this it would be reported as "consistent".
column_dtypes = defaultdict(set)
missing_in = defaultdict(list)

print("\nChecking column data types...")
for col in cols_to_check:
    for df_name, df in dataframes_to_check.items():
        if col in df.columns:
            column_dtypes[col].add(str(df[col].dtype))
        else:
            missing_in[col].append(df_name)

## 4. Report the results
print("\n--- Dtype Consistency Report ---")
inconsistent_cols = []   # columns whose dtype differs between datasets
missing_cols = []        # columns that are absent from at least one dataset

for col in cols_to_check:
    dtypes = column_dtypes[col]
    absent_from = missing_in.get(col, [])
    if absent_from:
        missing_cols.append(col)

    if len(dtypes) > 1:
        # More than one dtype was found, this is an inconsistency
        print(f"⚠️ {col:<20} | INCONSISTENT: {dtypes}")
        inconsistent_cols.append(col)
    elif absent_from:
        # Nothing to compare against: flag it instead of calling it consistent
        print(f"ℹ️ {col:<20} | {list(dtypes)[0]} | MISSING from: {absent_from}")
    else:
        # Present everywhere with a single dtype
        print(f"✅ {col:<20} | Consistent: {list(dtypes)[0]}")

print("\n--- Summary ---")
if not inconsistent_cols:
    print("All checked columns are consistent across datasets.")
else:
    print(f"Found {len(inconsistent_cols)} inconsistent columns: {inconsistent_cols}")
if missing_cols:
    print(f"Found {len(missing_cols)} columns not present in every dataset: {missing_cols}")

In [None]:
# Per-column dtypes of the train frame, followed by those of the test frame.
print(train_data_df.dtypes, test_data_df.dtypes, sep="\n")

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line after each summary.
train_data_df.info()
test_data_df.info()

In [None]:
import pandas as pd
import numpy as np

def profile_object_columns(df, df_name):
    """
    Profile every 'object' column of a DataFrame for common data quality issues.

    Measured per column:
      * real_nan_count          - genuinely missing values (NaN / None).
      * whitespace_count        - non-missing values with leading/trailing whitespace.
      * null_like_strings_count - non-missing values that merely spell a missing value
                                  ('null', 'nan', 'NA', 'N/A', 'None', ''), compared
                                  after stripping whitespace and ignoring case.
      * has_case_inconsistency  - True when distinct values collapse into one once
                                  lower-cased (e.g. 'Abc' vs 'abc').

    Genuinely missing values are excluded from the string-based checks; otherwise
    their string forms ('nan' / 'None') would be counted as if they were data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile; it is not modified.
    df_name : str
        Label used only in the printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per column showing at least one issue, with the columns
        'column', 'real_nan_count', 'whitespace_count', 'null_like_strings_count'
        and 'has_case_inconsistency'. Empty (with those same columns) when nothing
        is flagged or when the frame has no object columns.
    """
    report_columns = [
        'column',
        'real_nan_count',
        'whitespace_count',
        'null_like_strings_count',
        'has_case_inconsistency',
    ]

    print(f"\n--- Profiling 'object' columns in {df_name} ---")

    object_cols = df.select_dtypes(include=['object']).columns

    if len(object_cols) == 0:
        print("No 'object' columns found to profile.")
        # Keep the schema stable so downstream export/concat code sees the same columns
        return pd.DataFrame(columns=report_columns)

    print(f"Found {len(object_cols)} 'object' columns to check.")

    # Stored lower-cased: candidates are stripped and lower-cased before the lookup
    null_like_values = {'null', 'nan', 'na', 'n/a', 'none', ''}

    cleanliness_report = []

    for col in object_cols:
        is_missing = df[col].isnull()
        # String view of the values that are actually present
        present = df.loc[~is_missing, col].astype(str)
        stripped = present.str.strip()

        # Check 1: leading / trailing whitespace
        whitespace_count = int((present != stripped).sum())

        # Check 2: strings that spell a missing value, in any case / padding
        null_like_count = int(stripped.str.lower().isin(null_like_values).sum())

        # Check 3: values that differ only by letter case
        case_issue = bool(present.nunique() != present.str.lower().nunique())

        # Check 4: real missing values
        real_nan_count = int(is_missing.sum())

        cleanliness_report.append({
            'column': col,
            'real_nan_count': real_nan_count,
            'whitespace_count': whitespace_count,
            'null_like_strings_count': null_like_count,
            'has_case_inconsistency': case_issue
        })

    report_df = pd.DataFrame(cleanliness_report, columns=report_columns)

    # Keep only the columns that show at least one issue
    has_issue = (
        (report_df['real_nan_count'] > 0)
        | (report_df['whitespace_count'] > 0)
        | (report_df['null_like_strings_count'] > 0)
        | report_df['has_case_inconsistency'].astype(bool)
    )
    dirty_cols_report = report_df[has_issue]

    print("\n--- Data Cleanliness Report ---")
    if dirty_cols_report.empty:
        print("✅ All 'object' columns look clean.")
    else:
        print("⚠️ Found potential issues in the following 'object' columns:")
        print(dirty_cols_report.to_string(index=False))

    # Returned so the caller can persist the findings
    return dirty_cols_report

# --- Profile both datasets and export the findings ---

# Destination workbook: one sheet per dataset.
output_excel_file = 'data_cleanliness_report.xlsx'

# Run the profiler on each dataset and keep the returned issue tables.
print("Running profiler on training data...")
train_report = profile_object_columns(df=train_data_df, df_name='train_data_df')

print("\nRunning profiler on test data...")
test_report = profile_object_columns(df=test_data_df, df_name='test_data_df')

# Write both tables into the same workbook through a single writer.
print(f"\nExporting reports to {output_excel_file}...")
with pd.ExcelWriter(output_excel_file, engine='openpyxl') as writer:
    train_report.to_excel(excel_writer=writer, sheet_name='Train Report', index=False)
    test_report.to_excel(excel_writer=writer, sheet_name='Test Report', index=False)

print("✅ Successfully exported reports.")

In [None]:
import pandas as pd
import numpy as np

# The two columns we know have case issues
cols_to_fix = ['id8', 'id11']

# Frames to normalize in place, keyed by a readable name
dataframes_to_check = {
    'train': train_data_df,
    'test': test_data_df
}

print("--- Applying Robust Normalization for 'id8', 'id11' ---")

for df_name, df in dataframes_to_check.items():
    print(f"Checking DataFrame: '{df_name}'.")
    for col in cols_to_fix:
        if col in df.columns:
            # Going through str copes with mixed types, but astype(str) also turns
            # real NaN/None into the literal strings 'nan'/'none'. .where() keeps the
            # normalized text only where the original value was present and restores
            # a real missing value everywhere else.
            df[col] = (
                df[col].astype(str).str.strip().str.lower().where(df[col].notna())
            )
            print(f"- Column '{col}' robustly normalized.")

print("--- Robust Normalization Complete ---")

In [None]:
import pandas as pd
import numpy as np

print("--- Verifying fix for 'id8' and 'id11' ---")

cols_to_verify = ['id8', 'id11']
dataframes_to_check = dict(train_data_df=train_data_df, test_data_df=test_data_df)

# Flipped to False as soon as any column still mixes letter cases.
all_fixed = True

for df_name, df in dataframes_to_check.items():
    print(f"\nChecking {df_name}...")
    for col in cols_to_verify:
        if col not in df.columns:
            print(f"ℹ️ {col:<10} | Not found in this DataFrame.")
            continue

        # A column is case-inconsistent when lower-casing merges distinct values.
        series_as_str = df[col].astype(str)
        case_issue = series_as_str.nunique() != series_as_str.str.lower().nunique()

        if not case_issue:
            print(f"✅ {col:<10} | PASSED: Case is now consistent.")
            continue

        print(f"⚠️ {col:<10} | FAILED: Still has case inconsistency.")
        all_fixed = False

print("\n--- Verification Successful ---" if all_fixed else "\n--- Verification Failed ---")

In [None]:
import pandas as pd
import numpy as np

def profile_object_columns(df, df_name):
    """
    Profile every 'object' column of a DataFrame for common data quality issues.

    Measured per column:
      * real_nan_count          - genuinely missing values (NaN / None).
      * whitespace_count        - non-missing values with leading/trailing whitespace.
      * null_like_strings_count - non-missing values that merely spell a missing value
                                  ('null', 'nan', 'NA', 'N/A', 'None', ''), compared
                                  after stripping whitespace and ignoring case.
      * has_case_inconsistency  - True when distinct values collapse into one once
                                  lower-cased (e.g. 'Abc' vs 'abc').

    Genuinely missing values are excluded from the string-based checks; otherwise
    their string forms ('nan' / 'None') would be counted as if they were data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile; it is not modified.
    df_name : str
        Label used only in the printed messages.

    Returns
    -------
    pandas.DataFrame
        One row per column showing at least one issue, with the columns
        'column', 'real_nan_count', 'whitespace_count', 'null_like_strings_count'
        and 'has_case_inconsistency'. Empty (with those same columns) when nothing
        is flagged or when the frame has no object columns.
    """
    report_columns = [
        'column',
        'real_nan_count',
        'whitespace_count',
        'null_like_strings_count',
        'has_case_inconsistency',
    ]

    print(f"\n--- Profiling 'object' columns in {df_name} ---")

    object_cols = df.select_dtypes(include=['object']).columns

    if len(object_cols) == 0:
        print("No 'object' columns found to profile.")
        # Keep the schema stable so downstream export/concat code sees the same columns
        return pd.DataFrame(columns=report_columns)

    print(f"Found {len(object_cols)} 'object' columns to check.")

    # Stored lower-cased: candidates are stripped and lower-cased before the lookup
    null_like_values = {'null', 'nan', 'na', 'n/a', 'none', ''}

    cleanliness_report = []

    for col in object_cols:
        is_missing = df[col].isnull()
        # String view of the values that are actually present
        present = df.loc[~is_missing, col].astype(str)
        stripped = present.str.strip()

        # Check 1: leading / trailing whitespace
        whitespace_count = int((present != stripped).sum())

        # Check 2: strings that spell a missing value, in any case / padding
        null_like_count = int(stripped.str.lower().isin(null_like_values).sum())

        # Check 3: values that differ only by letter case
        case_issue = bool(present.nunique() != present.str.lower().nunique())

        # Check 4: real missing values
        real_nan_count = int(is_missing.sum())

        cleanliness_report.append({
            'column': col,
            'real_nan_count': real_nan_count,
            'whitespace_count': whitespace_count,
            'null_like_strings_count': null_like_count,
            'has_case_inconsistency': case_issue
        })

    report_df = pd.DataFrame(cleanliness_report, columns=report_columns)

    # Keep only the columns that show at least one issue
    has_issue = (
        (report_df['real_nan_count'] > 0)
        | (report_df['whitespace_count'] > 0)
        | (report_df['null_like_strings_count'] > 0)
        | report_df['has_case_inconsistency'].astype(bool)
    )
    dirty_cols_report = report_df[has_issue]

    print("\n--- Data Cleanliness Report ---")
    if dirty_cols_report.empty:
        print("✅ All 'object' columns look clean.")
    else:
        print("⚠️ Found potential issues in the following 'object' columns:")
        print(dirty_cols_report.to_string(index=False))

    # Returned so the caller can persist the findings
    return dirty_cols_report

# --- Re-profile both datasets and export the findings ---

# Destination workbook: one sheet per dataset.
output_excel_file = 'data_cleanliness_report_1.xlsx'

# Run the profiler on each dataset and keep the returned issue tables.
print("Running profiler on training data...")
train_report = profile_object_columns(df=train_data_df, df_name='train_data_df')

print("\nRunning profiler on test data...")
test_report = profile_object_columns(df=test_data_df, df_name='test_data_df')

# Write both tables into the same workbook through a single writer.
print(f"\nExporting reports to {output_excel_file}...")
with pd.ExcelWriter(output_excel_file, engine='openpyxl') as writer:
    train_report.to_excel(excel_writer=writer, sheet_name='Train Report', index=False)
    test_report.to_excel(excel_writer=writer, sheet_name='Test Report', index=False)

print("✅ Successfully exported reports.")

- Normalization is complete for all 381 object columns in the train dataset (380 in the test dataset).
- Next, check the float columns.

In [None]:
import pandas as pd
import numpy as np

# --- Assuming train_data_df and test_data_df are loaded ---

def _report_float_columns(frame, float_cols):
    """Print distribution stats, NaN counts and inf counts for `float_cols` of `frame`."""
    if len(float_cols) == 0:
        print("No float columns found.")
        return

    subset = frame[float_cols]

    # Statistical distribution (count, mean, std, min, quartiles, max)
    print("Distribution Stats:")
    print(subset.describe().to_string())

    # Missing values per column
    print("\nMissing (NaN) Value Counts:")
    print(subset.isnull().sum())

    # Infinite values per column
    print("\nInfinite Value Counts:")
    print(np.isinf(subset).sum())


print("--- Float Column Analysis ---")

# Columns stored as float64 in each dataset
float_cols_train = train_data_df.select_dtypes(include=['float64']).columns
float_cols_test = test_data_df.select_dtypes(include=['float64']).columns

print(f"Train float columns found: {list(float_cols_train)}")
print(f"Test float columns found: {list(float_cols_test)}")

print("\n--- Train Data Check ---")
_report_float_columns(train_data_df, float_cols_train)

print("\n--- Test Data Check ---")
_report_float_columns(test_data_df, float_cols_test)

# No mistakes till here

# Convert columns to their intended formats

In [None]:
# Column list generator function
def generate_column_list(start, end):
    """
    Build the inclusive run of column names from `start` to `end`.

    Each name is split into a prefix and its TRAILING number, so digits that are
    part of the prefix are preserved ('x2_f9' -> prefix 'x2_f', number 9) instead
    of being merged into the number.

    Parameters
    ----------
    start : str
        First column name, e.g. 'f52'. Must end with a number.
    end : str
        Last column name, e.g. 'f57'. Must end with a number. Its prefix must match
        that of `start`; a bare number such as '57' is also accepted.

    Returns
    -------
    list of str
        [prefix + str(i) for i in start..end]; empty when end's number is smaller
        than start's.

    Raises
    ------
    ValueError
        If a name does not end with a number, or the two prefixes differ.
    """
    def _split(name):
        # Everything before the trailing digits is the prefix
        prefix = name.rstrip('0123456789')
        digits = name[len(prefix):]
        if not digits:
            raise ValueError(f"Column name {name!r} must end with a number")
        return prefix, int(digits)

    prefix, start_num = _split(start)
    end_prefix, end_num = _split(end)

    # A different prefix on `end` is almost certainly a typo; fail loudly rather
    # than silently generating names with start's prefix.
    if end_prefix and end_prefix != prefix:
        raise ValueError(
            f"Prefix mismatch: {start!r} has prefix {prefix!r} but {end!r} has {end_prefix!r}"
        )

    return [f"{prefix}{i}" for i in range(start_num, end_num + 1)]

# Example usage. The result is kept under its own name so that re-running this
# cell can never overwrite the real `binary_cols` specification.
example_cols = generate_column_list('f52', 'f57')
print(example_cols)


In [None]:
# Specify col types
# Feature names follow the pattern f<N>; contiguous runs are written as inclusive
# (first, last) ranges instead of hundreds of literals.
# Every column belongs to exactly one group.

# f226 .. f309
binary_cols = [f"f{i}" for i in range(226, 310)]

# 'f50' is intentionally NOT listed here: it is a categorical column (see below).
# Listing it in both groups would coerce it to float32 before the category cast.
numeric_cols = [
    f"f{i}"
    for first, last in [
        (1, 41), (43, 47), (49, 49), (51, 51), (58, 225),
        (310, 348), (350, 353), (355, 366), (375, 377),
    ]
    for i in range(first, last + 1)
]

categorical_cols = ['id3', 'id6', 'id11', 'f42', 'f48', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f349', 'f354', 'f374', 'f378']

datetime_cols = ['id4', 'id7', 'id12', 'id13', 'id5']

object_cols = ['id1', 'id2', 'id9']


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line after each summary.
train_data_df.info()
test_data_df.info()

In [None]:
import pandas as pd

# Works on the already-loaded `train_data_df`.

# Columns of the train frame in which every single value is missing.
all_nan_cols = train_data_df.columns[train_data_df.isna().all(axis=0)]
num_all_nan_cols = len(all_nan_cols)

# Report the outcome: either the "nothing found" message, or the count
# followed by one column name per line.
if num_all_nan_cols == 0:
    print("No columns were found with only NaN values.")
else:
    print(f"Found {num_all_nan_cols} columns with only NaN values.")
    print("These columns are:")
    for col_name in all_nan_cols:
        print(col_name)

In [None]:
# Output file names; being relative, they resolve against the current working directory.
train_output_fp, test_output_fp = 'train_data_pre.parquet', 'test_data_pre.parquet'

# Persist the train frame. index=False keeps the pandas index out of the file.
print(f"Saving merged training data to {train_output_fp}...")
train_data_df.to_parquet(path=train_output_fp, index=False)

# Persist the test frame the same way.
print(f"Saving merged test data to {test_output_fp}...")
test_data_df.to_parquet(path=test_output_fp, index=False)

print("Save complete.")

# Check

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import duckdb

# Optional (left disabled): switch into the project folder before running the notebook.
# os.chdir(r"C:\Users\siddu\Desktop\Decision Science Track\Revision")
# Show the current working directory; relative file names resolve against it.
print(os.getcwd())

In [None]:
# Locations of the pre-processed train / test parquet files.
# NOTE(review): these are absolute paths, while the save step earlier in this file
# writes 'train_data_pre.parquet' / 'test_data_pre.parquet' relative to the working
# directory - confirm both point at the same folder.
train_data_fp, test_data_fp = (
    r"C:\Users\siddu\Desktop\Decision Science Track\Revision\train_data_pre.parquet",
    r"C:\Users\siddu\Desktop\Decision Science Track\Revision\test_data_pre.parquet",
)

# Read each parquet file into its own DataFrame.
train_data_df = pd.read_parquet(path=train_data_fp)
test_data_df = pd.read_parquet(path=test_data_fp)

In [None]:
# Columns of the train frame in which every single value is missing.
all_nan_cols = train_data_df.columns[train_data_df.isna().all(axis=0)]
num_all_nan_cols = len(all_nan_cols)

# Report the outcome: either the "nothing found" message, or the count
# followed by one column name per line.
if num_all_nan_cols == 0:
    print("No columns were found with only NaN values.")
else:
    print(f"Found {num_all_nan_cols} columns with only NaN values.")
    print("These columns are:")
    for col_name in all_nan_cols:
        print(col_name)

In [None]:
# Specify col types
# Feature names follow the pattern f<N>; contiguous runs are written as inclusive
# (first, last) ranges instead of hundreds of literals.

# Target 'y' followed by f226 .. f309
binary_cols = ['y'] + [f"f{i}" for i in range(226, 310)]

numeric_cols = [
    f"f{i}"
    for first, last in [
        (1, 41), (43, 47), (49, 49), (51, 51), (58, 225),
        (310, 348), (350, 353), (355, 366), (375, 377),
    ]
    for i in range(first, last + 1)
]

categorical_cols = [
    'id3', 'id6', 'id8', 'id10', 'id11',
    'f42', 'f48', 'f50', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57',
    'f349', 'f354', 'f374', 'f378',
]

datetime_cols = ['id4', 'id7', 'id12', 'id13', 'id5']

object_cols = ['id1', 'id2', 'id9']


In [None]:
# Processing all data types

def process_all_data_types(df):
    """
    Convert the columns of `df` to their intended dtypes, in place.

    Which column gets which treatment is read from the module-level lists
    `binary_cols`, `numeric_cols`, `datetime_cols`, `categorical_cols` and
    `object_cols`. Names that are not present in `df` are skipped.

    Conversions, in this order:
      * binary      -> pd.to_numeric(errors='coerce'), then nullable 'Int8'
      * numeric     -> pd.to_numeric(errors='coerce'), then 'float32'
      * datetime    -> pd.to_datetime(errors='coerce')
      * categorical -> 'category'
      * object      -> nullable 'string', stripped and lower-cased

    With errors='coerce', values that cannot be parsed become missing rather
    than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is MODIFIED IN PLACE; callers that iterate over a
        collection of frames rely on that.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    print("Converting binary columns (Int8)...")
    for col in binary_cols:
        if col in df.columns:
            # Nullable Int8 can hold 0/1 together with <NA>
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int8')

    print("Converting numeric columns (float32)...")
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('float32')

    print("Converting datetime columns (datetime64[ns])...")
    for col in datetime_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    print("Converting categorical columns (category)...")
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    print("Cleaning and converting key object columns (string)...")
    for col in object_cols:
        if col in df.columns:
            # Cast to the nullable 'string' dtype FIRST so missing values stay <NA>.
            # Going through astype(str) would turn NaN/None into the literal
            # strings 'nan'/'none' and hide them as real-looking keys.
            df[col] = df[col].astype('string').str.strip().str.lower()

    return df

# --- Convert both DataFrames and verify coverage ---
dataframes_to_process = dict(train=train_data_df, test=test_data_df)

# Every column name that appears in one of the type lists.
all_specified_cols = set().union(
    binary_cols, numeric_cols, categorical_cols, datetime_cols, object_cols
)

for name, df in dataframes_to_process.items():
    print(f"\n--- Processing {name.upper()} DataFrame ---")

    # Apply the dtype conversions.
    df = process_all_data_types(df)

    # Show the resulting dtypes.
    print(f"\n--- {name.upper()} DataFrame .info() after conversion ---")
    df.info()

    # Columns present in the frame that none of the lists mention.
    unprocessed_cols = set(df.columns).difference(all_specified_cols)

    if not unprocessed_cols:
        print("\n✅ All columns were successfully processed.")
    else:
        print(f"\n⚠️ WARNING: The following {len(unprocessed_cols)} columns were not in any list:")
        print(unprocessed_cols)

In [None]:
# Columns of the (now type-converted) train frame in which every value is missing.
all_nan_cols = train_data_df.columns[train_data_df.isna().all(axis=0)]
num_all_nan_cols = len(all_nan_cols)

# Report the outcome: either the "nothing found" message, or the count
# followed by one column name per line.
if num_all_nan_cols == 0:
    print("No columns were found with only NaN values.")
else:
    print(f"Found {num_all_nan_cols} columns with only NaN values.")
    print("These columns are:")
    for col_name in all_nan_cols:
        print(col_name)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line after each summary.
train_data_df.info()
test_data_df.info()

In [None]:
# Check consistency of datatypes of train and test
from collections import defaultdict

## 1. Map a readable name to each DataFrame under comparison
dataframes_to_check = {
    'train': train_data_df,
    'test': test_data_df
}

## 2. Union of the column names of all datasets (sorted for a stable, readable report)
print("Finding all unique columns across datasets...")
all_columns_set = set()
for df in dataframes_to_check.values():
    all_columns_set.update(df.columns)

cols_to_check = sorted(all_columns_set)
print(f"Found {len(cols_to_check)} unique columns to check.")

## 3. Per column, collect the dtypes seen AND the datasets that lack the column.
# Tracking absence matters: a column that exists in only one dataset trivially has a
# single dtype, so without this it would be reported as "consistent".
column_dtypes = defaultdict(set)
missing_in = defaultdict(list)

print("\nChecking column data types...")
for col in cols_to_check:
    for df_name, df in dataframes_to_check.items():
        if col in df.columns:
            column_dtypes[col].add(str(df[col].dtype))
        else:
            missing_in[col].append(df_name)

## 4. Report the results
print("\n--- Dtype Consistency Report ---")
inconsistent_cols = []   # columns whose dtype differs between datasets
missing_cols = []        # columns that are absent from at least one dataset

for col in cols_to_check:
    dtypes = column_dtypes[col]
    absent_from = missing_in.get(col, [])
    if absent_from:
        missing_cols.append(col)

    if len(dtypes) > 1:
        # More than one dtype was found, this is an inconsistency
        print(f"⚠️ {col:<20} | INCONSISTENT: {dtypes}")
        inconsistent_cols.append(col)
    elif absent_from:
        # Nothing to compare against: flag it instead of calling it consistent
        print(f"ℹ️ {col:<20} | {list(dtypes)[0]} | MISSING from: {absent_from}")
    else:
        # Present everywhere with a single dtype
        print(f"✅ {col:<20} | Consistent: {list(dtypes)[0]}")

print("\n--- Summary ---")
if not inconsistent_cols:
    print("All checked columns are consistent across datasets.")
else:
    print(f"Found {len(inconsistent_cols)} inconsistent columns: {inconsistent_cols}")
if missing_cols:
    print(f"Found {len(missing_cols)} columns not present in every dataset: {missing_cols}")

In [None]:
import pandas as pd

# --- Works on the train / test DataFrames currently in memory ---

# Output file names; being relative, they resolve against the current working directory.
train_output_fp, test_output_fp = 'train_data_cols_sorted.parquet', 'test_data_cols_sorted.parquet'

# Persist the train frame. index=False keeps the pandas index out of the file.
print(f"Saving training data to {train_output_fp}...")
train_data_df.to_parquet(path=train_output_fp, index=False)

# Persist the test frame the same way.
print(f"Saving test data to {test_output_fp}...")
test_data_df.to_parquet(path=test_output_fp, index=False)

print("Save complete.")