In [1]:
import pandas
from cleancorp import CleanCorp

In [2]:
def clean_business(input_string):
    """Parse a raw business name with ``CleanCorp`` and return its fields.

    Parameters
    ----------
    input_string
        Raw business name, passed unchanged to ``CleanCorp``.

    Returns
    -------
    pandas.Series
        Three positional entries: the instance's ``clean_name``, its
        ``entity_type`` and its ``industry``, in that order.
    """
    corp = CleanCorp(input_string)

    # Read industry first, then name and entity type; the returned Series
    # lists the name first.
    industry, name, entity = (
        getattr(corp, attr) for attr in ('industry', 'clean_name', 'entity_type')
    )
    return pandas.Series([name, entity, industry])

In [3]:
# Try the helper on a single raw name; the returned Series is the cell output.
clean_business('MARINE WORLD CARRIERS S. A.')

0         marine world carriers
1              [Public Limited]
2    [Transportation & Storage]
dtype: object

## Gov Data

In [4]:
# Read only the first column of the test CSV, then report how many rows it has.
test_data = pandas.read_csv('tests/test_data.csv', usecols=[0])
print('Entry Count:', len(test_data))

Entry Count: 1319


In [5]:
# Run clean_business on every name and spread the three returned values over
# new columns, then preview three randomly drawn rows.
test_data[['clean_name', 'type', 'industry']] = test_data['name'].apply(clean_business)
test_data.sample(n=3)

Unnamed: 0,name,clean_name,type,industry
1205,Trade Wind Communications Ltd.,trade wind communications,[Private Limited],"[Wholesale & Retail, Information & Communicati..."
198,BSN Glasspack SAS,bsn glasspack,,
1138,TDK Corp.,tdk,[Corporation],


### Accuracy:
(Percentage of entries for which an entity type was detected, i.e. the entry was confirmed as a business)

In [6]:
# Percentage of rows whose 'type' value is not missing.
len(test_data.dropna(subset=['type'])) / len(test_data) * 100

97.64973464746019

## Panama Data

In [7]:
# Read the full Panama Papers CSV, then report how many rows it has.
test_panama = pandas.read_csv('tests/panama_papers.csv')
print('Entry Count:', len(test_panama))

Entry Count: 41616


In [8]:
%%time 
# Run clean_business on every name and store the three returned values in the
# 'clean_name', 'type' and 'industry' columns; %%time reports the cell's run time.
test_panama[['clean_name', 'type', 'industry']] = test_panama['name'].apply(clean_business)

CPU times: user 33.6 s, sys: 553 ms, total: 34.2 s
Wall time: 34.1 s


In [9]:
# Preview three randomly drawn rows (no random_state is set, so they vary per run).
test_panama.sample(n=3)

Unnamed: 0,name,clean_name,type,industry
3231,CRAWLEY HOLDING GROUP S.A.,crawley holding group,[Public Limited],[Finance and Insurance]
36437,THE PANAMERA 24 G PROPERTIES CORP.,the panamera 24 g properties,[Corporation],[Real estate activities]
29810,OTAGO INDUSTRIES INC.,otago industries,[Corporation],"[Manufacturing, Extraterritorial organisations]"


### Accuracy:
(Percentage of entries for which an entity type was detected, i.e. the entry was confirmed as a business)

In [10]:
# Percentage of rows whose 'type' value is not missing.
len(test_panama.dropna(subset=['type'])) / len(test_panama) * 100

99.46174548250673

### Detected types:

In [11]:
# Overwrite 'type' with its string form so the values can be used as group
# keys, then show the ten most frequent values.
test_panama['type'] = test_panama['type'].apply(str)
(
    test_panama
    .groupby('type')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

type
['Corporation']                                         24224
['Public Limited']                                      14600
['Public Limited', 'Corporation']                         787
['Private Limited']                                       585
Unknown                                                   460
['Private Limited', 'Corporation']                        395
['Public Limited', 'Private Limited']                     228
None                                                      224
['Public Limited', 'Private Limited', 'Corporation']       20
['Non-Profit', 'Corporation']                              20
dtype: int64

### Detected industries:

In [12]:
# Overwrite 'industry' with its string form so the values can be used as group
# keys, then show the ten most frequent values.
test_panama['industry'] = test_panama['industry'].apply(str)
(
    test_panama
    .groupby('industry')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

industry
None                                                           9483
['Finance and Insurance']                                      8647
['Extraterritorial organisations']                             4676
['Unknown']                                                    4032
['Scientific & Technical Professions']                         2956
['Real estate activities']                                     1995
['Wholesale & Retail']                                         1744
['Administrative & Support Services']                          1171
['Construction']                                                855
['Finance and Insurance', 'Extraterritorial organisations']     763
dtype: int64