In [2]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/Users/vaish2205/Desktop/Dissertation/BasicCompanyDataAsOneFile-2025-06-01.csv')

# Function to explore the dataset
def explore_dataset(df):
    print("📊 Basic Dataset Information\n" + "-"*40)
    
    print(f"Shape of dataset: {df.shape[0]} rows, {df.shape[1]} columns\n")
    
    print("🧾 Column Names:")
    print(df.columns.tolist(), "\n")
    
    print("🧬 Data Types:")
    print(df.dtypes, "\n")
    
    print("🔍 First 5 Rows:")
    print(df.head(), "\n")
    
    print("🔚 Last 5 Rows:")
    print(df.tail(), "\n")
    
    print("📉 Summary Statistics:")
    print(df.describe(include='all'), "\n")
    
    print("❓ Missing Values:")
    print(df.isnull().sum(), "\n")
    
    print("🧩 Duplicate Rows Count:")
    print(df.duplicated().sum(), "\n")
    
    print("🧠 Dataset Info:")
    df.info()

# Call the function to explore your loaded dataset
explore_dataset(df)


  df = pd.read_csv('/Users/vaish2205/Desktop/Dissertation/BasicCompanyDataAsOneFile-2025-06-01.csv')


📊 Basic Dataset Information
----------------------------------------
Shape of dataset: 5656584 rows, 55 columns

🧾 Column Names:
['CompanyName', ' CompanyNumber', 'RegAddress.CareOf', 'RegAddress.POBox', 'RegAddress.AddressLine1', ' RegAddress.AddressLine2', 'RegAddress.PostTown', 'RegAddress.County', 'RegAddress.Country', 'RegAddress.PostCode', 'CompanyCategory', 'CompanyStatus', 'CountryOfOrigin', 'DissolutionDate', 'IncorporationDate', 'Accounts.AccountRefDay', 'Accounts.AccountRefMonth', 'Accounts.NextDueDate', 'Accounts.LastMadeUpDate', 'Accounts.AccountCategory', 'Returns.NextDueDate', 'Returns.LastMadeUpDate', 'Mortgages.NumMortCharges', 'Mortgages.NumMortOutstanding', 'Mortgages.NumMortPartSatisfied', 'Mortgages.NumMortSatisfied', 'SICCode.SicText_1', 'SICCode.SicText_2', 'SICCode.SicText_3', 'SICCode.SicText_4', 'LimitedPartnerships.NumGenPartners', 'LimitedPartnerships.NumLimPartners', 'URI', 'PreviousName_1.CONDATE', ' PreviousName_1.CompanyName', ' PreviousName_2.CONDATE', 

In [4]:
# Calculate percentage of missing values per column
missing_percent = df.isnull().mean() * 100

# Sort descending and print
missing_percent = missing_percent.sort_values(ascending=False)

print(missing_percent)


DissolutionDate                       100.000000
 PreviousName_10.CompanyName           99.999328
PreviousName_10.CONDATE                99.999328
 PreviousName_9.CompanyName            99.998922
PreviousName_9.CONDATE                 99.998922
 PreviousName_8.CompanyName            99.998197
PreviousName_8.CONDATE                 99.998197
PreviousName_7.CONDATE                 99.996676
 PreviousName_7.CompanyName            99.996676
PreviousName_6.CONDATE                 99.991921
 PreviousName_6.CompanyName            99.991921
 PreviousName_5.CompanyName            99.976187
PreviousName_5.CONDATE                 99.976187
PreviousName_4.CONDATE                 99.918131
 PreviousName_4.CompanyName            99.918131
PreviousName_3.CONDATE                 99.666389
 PreviousName_3.CompanyName            99.666389
RegAddress.CareOf                      99.463422
RegAddress.POBox                       99.380103
 PreviousName_2.CompanyName            98.411586
 PreviousName_2.COND

In [6]:
import pandas as pd

# Step 1: Clean column names
df.columns = df.columns.str.strip()

# Step 2: Drop columns with >80% missing values
threshold = 0.8
missing_fraction = df.isnull().mean()
cols_to_drop = missing_fraction[missing_fraction > threshold].index
df_clean = df.drop(columns=cols_to_drop)
print(f"🗑️ Dropped columns with >{threshold*100}% missing values:\n", list(cols_to_drop))

# Step 3: Drop known unnecessary columns (if still present)
if 'Returns.LastMadeUpDate' in df_clean.columns:
    df_clean.drop(columns=['Returns.LastMadeUpDate'], inplace=True)
    print("🗑️ Dropped 'Returns.LastMadeUpDate' due to >80% missing values.")

# Step 4: Fill address-related missing values with 'Unknown'
address_cols = [
    'RegAddress.AddressLine2',
    'RegAddress.County',
    'RegAddress.Country',
    'RegAddress.AddressLine1',
    'RegAddress.PostTown',
    'RegAddress.PostCode'
]
for col in address_cols:
    if col in df_clean.columns:
        df_clean[col].fillna('Unknown', inplace=True)

# Step 5: Convert date columns to datetime
date_cols = [
    'Returns.NextDueDate',
    'IncorporationDate',
    'Accounts.LastMadeUpDate',
    'ConfStmtLastMadeUpDate',
    'Accounts.NextDueDate',
    'ConfStmtNextDueDate'
]
for col in date_cols:
    if col in df_clean.columns:
        df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Step 6: Fill mode for categorical numeric fields
for col in ['Accounts.AccountRefDay', 'Accounts.AccountRefMonth']:
    if col in df_clean.columns:
        mode_val = df_clean[col].mode(dropna=True)
        if not mode_val.empty:
            df_clean[col].fillna(mode_val[0], inplace=True)

# Step 7: Fill numeric missing values with median
numeric_cols = df_clean.select_dtypes(include=['number']).columns
for col in numeric_cols:
    if df_clean[col].isnull().sum() > 0:
        median_val = df_clean[col].median()
        df_clean[col].fillna(median_val, inplace=True)

# Step 8: Fill remaining specific date columns with median date
for col in ['ConfStmtLastMadeUpDate']:
    if col in df_clean.columns:
        median_date = df_clean[col].median()
        df_clean[col].fillna(median_date, inplace=True)

# Step 9: Drop rows with missing values in key business columns
cols_to_check = [
    'Returns.NextDueDate',
    'IncorporationDate',
    'RegAddress.AddressLine2',
    'Accounts.LastMadeUpDate',
    'ConfStmtLastMadeUpDate',
    'Accounts.NextDueDate',
    'ConfStmtNextDueDate'
]
cols_to_check = [col for col in cols_to_check if col in df_clean.columns]
df_clean.dropna(subset=cols_to_check, inplace=True)
print(f"🧹 Dropped rows with missing values in critical columns: {cols_to_check}")


🗑️ Dropped columns with >80.0% missing values:
 ['RegAddress.CareOf', 'RegAddress.POBox', 'DissolutionDate', 'SICCode.SicText_2', 'SICCode.SicText_3', 'SICCode.SicText_4', 'PreviousName_1.CONDATE', 'PreviousName_1.CompanyName', 'PreviousName_2.CONDATE', 'PreviousName_2.CompanyName', 'PreviousName_3.CONDATE', 'PreviousName_3.CompanyName', 'PreviousName_4.CONDATE', 'PreviousName_4.CompanyName', 'PreviousName_5.CONDATE', 'PreviousName_5.CompanyName', 'PreviousName_6.CONDATE', 'PreviousName_6.CompanyName', 'PreviousName_7.CONDATE', 'PreviousName_7.CompanyName', 'PreviousName_8.CONDATE', 'PreviousName_8.CompanyName', 'PreviousName_9.CONDATE', 'PreviousName_9.CompanyName', 'PreviousName_10.CONDATE', 'PreviousName_10.CompanyName']
🗑️ Dropped 'Returns.LastMadeUpDate' due to >80% missing values.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[col].fillna('Unknown', inplace=True)
  df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
  df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
  df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original obj

🧹 Dropped rows with missing values in critical columns: ['Returns.NextDueDate', 'IncorporationDate', 'RegAddress.AddressLine2', 'Accounts.LastMadeUpDate', 'ConfStmtLastMadeUpDate', 'Accounts.NextDueDate', 'ConfStmtNextDueDate']


In [8]:
# Check if any missing values exist
any_missing = df_clean.isnull().values.any()
print("Are there any missing values left?", any_missing)

# Show detailed missing values per column (only if any exist)
if any_missing:
    missing_percent = df_clean.isnull().mean() * 100
    print("\nMissing values percentage per column:")
    print(missing_percent[missing_percent > 0].sort_values(ascending=False))
else:
    print("✅ All missing values have been handled.")

# Save the cleaned DataFrame to a CSV file
df_clean.to_csv("Cleaned House data.csv", index=False)
print("Cleaned data saved as 'Cleaned House data.csv'")



Are there any missing values left? False
✅ All missing values have been handled.
Cleaned data saved as 'Cleaned House data.csv'


In [10]:
import pandas as pd

# Load the cleaned house data
df = pd.read_csv("Cleaned House data.csv")

# Show dataset shape (rows, columns)
print(f"Dataset shape: {df.shape}\n")

# Show info about columns and data types
print("Data types and non-null counts:")
print(df.info(), "\n")

# Calculate missing values count and percentage per column
missing_count = df.isnull().sum()
missing_percent = (missing_count / len(df)) * 100

missing_data = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing %': missing_percent
})

print("Missing values per column (if any):")
print(missing_data[missing_data['Missing Count'] > 0].sort_values(by='Missing %', ascending=False), "\n")

# Show basic statistics summary for numeric columns
print("Basic statistics summary:")
print(df.describe(), "\n")

# Preview first 5 rows of data
print("First 5 rows of the dataset:")
print(df.head())


Dataset shape: (1265591, 28)

Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1265591 entries, 0 to 1265590
Data columns (total 28 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   CompanyName                         1265591 non-null  object 
 1   CompanyNumber                       1265591 non-null  object 
 2   RegAddress.AddressLine1             1265591 non-null  object 
 3   RegAddress.AddressLine2             1265591 non-null  object 
 4   RegAddress.PostTown                 1265591 non-null  object 
 5   RegAddress.County                   1265591 non-null  object 
 6   RegAddress.Country                  1265591 non-null  object 
 7   RegAddress.PostCode                 1265591 non-null  object 
 8   CompanyCategory                     1265591 non-null  object 
 9   CompanyStatus                       1265591 non-null  object 
 10  CountryOfOrigin 