In [1]:
import pandas as pd

# Import the CSV file as a dataframe
df = pd.read_csv('500_allinfo.csv')

# Display basic info
print(f"Total rows: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()


Total rows: 501

Columns: ['company_number', 'data.address.address_line_1', 'data.address.address_line_2', 'data.address.country', 'data.address.locality', 'data.address.postal_code', 'data.address.premises', 'data.etag', 'data.identification.country_registered', 'data.identification.legal_authority', 'data.identification.legal_form', 'data.identification.place_registered', 'data.identification.registration_number', 'data.kind', 'data.links.self', 'data.name', 'data.natures_of_control', 'data.notified_on', 'data.country_of_residence', 'data.date_of_birth.month', 'data.date_of_birth.year', 'data.name_elements.forename', 'data.name_elements.middle_name', 'data.name_elements.surname', 'data.name_elements.title', 'data.nationality', 'data.address.region', 'data.ceased_on', 'data.address.care_of', 'data.address.po_box', 'sic_code', 'can_file', 'company_name', 'company_status', 'date_of_creation', 'etag', 'has_been_liquidated', 'has_charges', 'has_insolvency_history', 'jurisdiction', 'last_f

Unnamed: 0,company_number,data.address.address_line_1,data.address.address_line_2,data.address.country,data.address.locality,data.address.postal_code,data.address.premises,data.etag,data.identification.country_registered,data.identification.legal_authority,...,links.self,links.charges,links.filing_history,links.officers,registered_office_address.address_line_1,registered_office_address.address_line_2,registered_office_address.country,registered_office_address.locality,registered_office_address.postal_code,registered_office_address.region
0,7434180,Boddington Lane,Boddington,England,Cheltenham,GL51 0TJ,The Manor,706b6e8773700f85361929bb5cd0214d630de6e0,England & Wales,Companies Act,...,/company/07434180,/company/07434180/charges,/company/07434180/filing-history,/company/07434180/officers,The Manor Boddington Lane,Boddington,England,Cheltenham,GL51 0TJ,Gloucestershire
1,8810260,Godmans Lane,Kirk Ella,,Hull,HU10 7NY,Alvenga,224fe8afa8a6ff381ea0069dac5c6d4c995f15b2,,,...,/company/08810260,,/company/08810260/filing-history,/company/08810260/officers,Alvenga Godmans Lane,Kirk Ella,,Hull,HU10 7NY,
2,6022970,20 Rugby Road Milverton,Leamington Spa,,Warwickshire,CV32 6DG,,1840e605a8948c04246e8d9904da789e2fdc4a7e,,,...,/company/06022970,,/company/06022970/filing-history,/company/06022970/officers,20 Rugby Road Milverton,Leamington Spa,,Warwickshire,CV32 6DG,
3,8301480,Station Road,,England,Upminster,RM14 2SJ,"Suite 17, Essex House",0705b53fe3b34355fe183ae68013e8e1e19fda42,,,...,/company/08301480,,/company/08301480/filing-history,/company/08301480/officers,"Suite 17, Essex House",Station Road,England,Upminster,RM14 2SJ,Essex
4,7428111,Cottage,Ardens Grafton,,Alcester,B49 6DS,Malthouse,3ea7a8ff70ce98944c276648a875257e9b127428,,,...,/company/07428111,,/company/07428111/filing-history,/company/07428111/officers,Malthouse Cottage,Ardens Grafton,,Alcester,B49 6DS,Warwickshire


In [2]:
# Check if company_status is 'active' for each row
# Option 1: Create a boolean column indicating if status is active
df['is_active'] = df['company_status'] == 'active'

# Option 2: Filter to show only active companies
active_companies = df[df['company_status'] == 'active']

# Display results
print(f"Total companies: {len(df)}")
print(f"Active companies: {len(active_companies)}")
print(f"Non-active companies: {len(df) - len(active_companies)}")
print(f"\nCompany status distribution:")
print(df['company_status'].value_counts())

# Show first few rows with the is_active column
print(f"\nFirst few rows with is_active check:")
df[['company_number', 'company_name', 'company_status', 'is_active']].head()


Total companies: 501
Active companies: 439
Non-active companies: 62

Company status distribution:
company_status
active                    439
dissolved                  52
liquidation                 9
insolvency-proceedings      1
Name: count, dtype: int64

First few rows with is_active check:


Unnamed: 0,company_number,company_name,company_status,is_active
0,7434180,CASTLEGATE BUSINESS PARK LIMITED,active,True
1,8810260,WILLERFOSS HOMES LIMITED,active,True
2,6022970,PUDDLEDUCKS & P.O.S.H. LIMITED,active,True
3,8301480,EVEREST SALTS LIMITED,active,True
4,7428111,MALTHOUSE SOLUTIONS LIMITED,dissolved,False


In [3]:
# Filter to keep only active companies (delete non-active ones)
df_active = df[df['company_status'] == 'active'].copy()

# Remove the is_active column since all companies are active now
if 'is_active' in df_active.columns:
    df_active = df_active.drop(columns=['is_active'])

# Display info about the filtered dataframe
print(f"Original total companies: {len(df)}")
print(f"Active companies (after filtering): {len(df_active)}")
print(f"Companies removed: {len(df) - len(df_active)}")
print(f"\nFirst few rows of active companies:")
df_active[['company_number', 'company_name', 'company_status']].head()


Original total companies: 501
Active companies (after filtering): 439
Companies removed: 62

First few rows of active companies:


Unnamed: 0,company_number,company_name,company_status
0,7434180,CASTLEGATE BUSINESS PARK LIMITED,active
1,8810260,WILLERFOSS HOMES LIMITED,active
2,6022970,PUDDLEDUCKS & P.O.S.H. LIMITED,active
3,8301480,EVEREST SALTS LIMITED,active
6,7874121,R.A FLOORING RECRUITMENT LIMITED,active


In [4]:
# Save the filtered dataframe to a new CSV file
output_filename = '500_allinfo_active.csv'
df_active.to_csv(output_filename, index=False)

print(f"Saved {len(df_active)} active companies to '{output_filename}'")
print(f"File saved successfully!")


Saved 439 active companies to '500_allinfo_active.csv'
File saved successfully!
