In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv('scraped_data.csv')

In [4]:
data.head()

Unnamed: 0,odd,odd 2,odd 3,odd 4,odd 5,odd href,even,even 2,even 3,even 4,even 5,even href
0,"1 Brick Insurance, LLC",No\n Violation,2017-07-21,Vacate\n Suspension,13740-AG15-0303-063,https://www.in.gov/idoi/files/enforcement/1374...,,,,,,
1,,,,,,,1st and Goal Inc.,Failure to Respond,2009-05-21,Suspension of License,7980-AG09-0515-093,https://www.in.gov/idoi/files/enforcement/7980...
2,"35 Realserv, Inc.",Failure to Respond,2009-05-21,Suspension of License,7983-AG09-0515-094,https://www.in.gov/idoi/files/enforcement/7983...,,,,,,
3,,,,,,,"A & R Closing and Title, Inc.",Failure\n to Provide List of Appointments,2007-09-10,License\n Suspended,5601-AG07-0827-231,https://www.in.gov/idoi/files/enforcement/5601...
4,A. D. Baker & Company,Lack of\n Fitness/Trustworthiness,2000-12-13,Probation,168-IDOI-CO01-022,https://www.in.gov/idoi/files/enforcement/168-...,,,,,,


In [11]:
len(data)

5156

In [33]:
odd_rows = data[['odd', 'odd 2', 'odd 3', 'odd 4', 'odd 5', 'odd href']].dropna().reset_index(drop=True)
even_rows = data[['even', 'even 2', 'even 3', 'even 4', 'even 5', 'even href']].dropna().reset_index(drop=True)

# Renaming columns to have a consistent naming convention
odd_rows.columns = ['Company', 'Violation', 'Date', 'Action', 'Case Number', 'Link']
even_rows.columns = ['Company', 'Violation', 'Date', 'Action', 'Case Number', 'Link']

# Merging odd and even rows
merged_df = pd.concat([odd_rows, even_rows], ignore_index=True)

# Cleaning data by removing unnecessary newline and whitespace characters
merged_df.replace(to_replace=[r"\\t|\\n|\\r", "\t|\n|\r"], value=["",""], regex=True, inplace=True)

# Display the cleaned dataframe
merged_df.head()

Unnamed: 0,Company,Violation,Date,Action,Case Number,Link
0,"1 Brick Insurance, LLC",No Violation,2017-07-21,Vacate Suspension,13740-AG15-0303-063,https://www.in.gov/idoi/files/enforcement/1374...
1,"35 Realserv, Inc.",Failure to Respond,2009-05-21,Suspension of License,7983-AG09-0515-094,https://www.in.gov/idoi/files/enforcement/7983...
2,A. D. Baker & Company,Lack of Fitness/Trustworthiness,2000-12-13,Probation,168-IDOI-CO01-022,https://www.in.gov/idoi/files/enforcement/168-...
3,"A.F. Stevens & Associates, Inc.",Misappropriation of Premiums,2012-06-20,License permanently Revoked,11032-AG12-0321-020,https://www.in.gov/idoi/files/enforcement/1103...
4,AA Title,Licensing Violation,2013-11-08,$500 Civil Penalty,12463-AG13-1010-120,https://www.in.gov/idoi/files/enforcement/1246...


In [54]:
business_designations = ['Inc', 'LLC', 'Ltd', 'Corporation', 'Corp', 'Company', 'Co.', 'Group', 'Association', 'Foundation', 'Solutions', 'Services', 'Holdings', 'Partners', 'Part', 'Industries', 'International', 'Global', 'Systems', 'Technologies', 'tech']

# Function to determine if a name is likely a company name
def is_company_name(name):
    for designation in business_designations:
        if designation in name:
            return True
    return False

# Apply this function to the 'Company' column
merged_df['Is Company'] = merged_df['Company'].apply(is_company_name)

# Display a portion of the dataframe to see the results
merged_df.head()

Unnamed: 0,Company,Violation,Date,Action,Case Number,Link,Is Company
0,"1 Brick Insurance, LLC",No Violation,2017-07-21,Vacate Suspension,13740-AG15-0303-063,https://www.in.gov/idoi/files/enforcement/1374...,True
1,"35 Realserv, Inc.",Failure to Respond,2009-05-21,Suspension of License,7983-AG09-0515-094,https://www.in.gov/idoi/files/enforcement/7983...,True
2,A. D. Baker & Company,Lack of Fitness/Trustworthiness,2000-12-13,Probation,168-IDOI-CO01-022,https://www.in.gov/idoi/files/enforcement/168-...,True
3,"A.F. Stevens & Associates, Inc.",Misappropriation of Premiums,2012-06-20,License permanently Revoked,11032-AG12-0321-020,https://www.in.gov/idoi/files/enforcement/1103...,True
4,AA Title,Licensing Violation,2013-11-08,$500 Civil Penalty,12463-AG13-1010-120,https://www.in.gov/idoi/files/enforcement/1246...,False


In [55]:
merged_df['Is Company'].value_counts()

False    4490
True      666
Name: Is Company, dtype: int64

In [58]:
companies = merged_df[merged_df['Is Company'] == True]

In [60]:
len(companies)

666

In [61]:
companies.reset_index(drop=True, inplace=True)

In [62]:
len(companies)

666

In [64]:
companies.drop(columns=['Is Company'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  companies.drop(columns=['Is Company'], inplace=True)


In [65]:
companies.head()

Unnamed: 0,Company,Violation,Date,Action,Case Number,Link
0,"1 Brick Insurance, LLC",No Violation,2017-07-21,Vacate Suspension,13740-AG15-0303-063,https://www.in.gov/idoi/files/enforcement/1374...
1,"35 Realserv, Inc.",Failure to Respond,2009-05-21,Suspension of License,7983-AG09-0515-094,https://www.in.gov/idoi/files/enforcement/7983...
2,A. D. Baker & Company,Lack of Fitness/Trustworthiness,2000-12-13,Probation,168-IDOI-CO01-022,https://www.in.gov/idoi/files/enforcement/168-...
3,"A.F. Stevens & Associates, Inc.",Misappropriation of Premiums,2012-06-20,License permanently Revoked,11032-AG12-0321-020,https://www.in.gov/idoi/files/enforcement/1103...
4,"Absolute Title Services, LLC",Failure to enter RREAL transactions into data...,2016-06-30,Fine $810.00,15181-AG16-1531-125,https://www.in.gov/idoi/files/enforcement/1518...


In [66]:
companies.to_csv('companies.csv', index=False)