In [18]:
import pandas as pd
import re

# Load the CSV file
file_path = 'bianlian.csv'
df = pd.read_csv(file_path)

# Define a function to extract the relevant fields from the "Content" column
def extract_info(content):
    # Split content by lines
    lines = content.split('\n')
    
    # Extracting the company name from the first line
    company_name = lines[0].strip() if len(lines) > 0 else None
    
    # Extracting the company website from the second line
    website = lines[1].strip() if len(lines) > 1 else None

    # Check if the website starts with "https", otherwise set to None
    if website and not website.startswith("http"):
        website = None

    # Extracting the description from the third line
    description = lines[2].strip() if len(lines) > 2 else None
    
    # Extracting the revenue from the line that contains 'Revenue:'
    revenue = None
    data_volume = None
    data_type = None
    industry = None
    for line in lines:
        if 'Revenue:' in line:
            revenue = line.split('Revenue:')[-1].strip()
            
        if "Data volume" in line:
            data_volume = line.split('Data volume:')[-1].strip()
            
        if "Data Volume" in line:
            data_volume = line.split('Data Volume:')[-1].strip()

    return pd.Series([company_name, website, description, industry, revenue, data_volume, data_type])

# Apply the extraction function to the "Content" column
extracted_df = df['Content'].apply(extract_info)
extracted_df.columns = ['Company Name', 'Company Website', 'Description', 'Industry', 'Revenue', 'Data Volume', 'Data Type']

# Add the original content column to the extracted dataframe
extracted_df['Original Content'] = df['Content']

# Display the cleaned dataframe
display(extracted_df)

# Save the cleaned dataframe to a new CSV if needed
extracted_df.to_csv('extracted_info.csv', index=False)


Unnamed: 0,Company Name,Company Website,Description,Industry,Revenue,Data Volume,Data Type,Original Content
0,Benson Kearley IFG - Insurance Brokers & Finan...,http://bkifg.com,Benson Kearley IFG offers personal and commerc...,,> $20 Millions,1.4 TB,,Benson Kearley IFG - Insurance Brokers & Finan...
1,"Law Offices of Michael J Gurfinkel, Inc",https://gurfinkel.com,"The Law Offices of Michael J. Gurfinkel, Inc. ...",,< $5 Millions,400 Gb,,"Law Offices of Michael J Gurfinkel, Inc\nhttps..."
2,Stein Fibers,https://steinfibers.com,"Founded in 1976, Stein Fibers, Ltd. is a suppl...",,,,,Stein Fibers\nhttps://steinfibers.com\nFounded...
3,Majestic Metals,https://fathommfg.com/fathom-denver,Majestic Metals is a precision sheet metal fab...,,$24.1 Millions,735 GB,,Majestic Metals\nhttps://fathommfg.com/fathom-...
4,Ladov Law Firm,,Revenue: <$5 Millions,,<$5 Millions,105 GB,,Ladov Law Firm\nLadov Law Firm PC is a company...
...,...,...,...,...,...,...,...,...
89,"Mayer Antonellis Jachowicz & Haranas, LLP",https://hkwg.com,"The Massachusetts law firm of Mayer & Haranas,...",,> $5 Millions,170 Gb,,"Mayer Antonellis Jachowicz & Haranas, LLP\nhtt..."
90,Shoma group,https://www.shomagroup.com/,Shoma Group is a real estate development compa...,,> $20 Millions,2 Tb,,Shoma group\nhttps://www.shomagroup.com/\nShom...
91,Lindsay Municipal Hospital,http://lindsayhospital.com,Lindsay Municipal Hospital proudly serves the ...,,$13 Millions,,,Lindsay Municipal Hospital\nhttp://lindsayhosp...
92,"Palmer Construction Co., Inc",https://palmerconstruction.net,"Palmer Construction is a design-build, general...",,> $13 Millions,475 Gb,,"Palmer Construction Co., Inc\nhttps://palmerco..."
