In [12]:
import pandas
from cleancorp import CleanCorp

In [13]:
def clean_business(input_string):
    """Run ``CleanCorp`` on a raw business name and collect its findings.

    Parameters
    ----------
    input_string
        Raw business name; passed unchanged to ``CleanCorp``.

    Returns
    -------
    pandas.Series
        Three positional entries (default integer index), in this order:
        the instance's ``clean_name``, ``entity_type`` and ``industry``.
    """
    parsed = CleanCorp(input_string)
    # Read the attributes in one go; the tuple is evaluated left to right.
    industry, name, entity = parsed.industry, parsed.clean_name, parsed.entity_type
    # Positional layout: (name, type, industry).
    return pandas.Series([name, entity, industry])

In [24]:
# Smoke test: run the helper on a single raw name; the returned Series is the cell output.
clean_business('MARINE WORLD CARRIERS S. A.')

0         marine world carriers
1              [Public Limited]
2    [Transportation & Storage]
dtype: object

## Gov Data

In [14]:
# Read only the first CSV column of the government test set.
test_data = pandas.read_csv('tests/test_data.csv', usecols=[0])
print('Entry Count:', len(test_data))

Entry Count: 1319


In [15]:
# clean_business returns a 3-entry Series per name, so apply() yields a 3-column
# frame that is assigned positionally to the new columns.
# Bracket access is used for the column: attribute access (`test_data.name`) is
# easy to confuse with the `.name` attribute that pandas objects can carry.
test_data[['clean_name', 'type', 'industry']] = test_data['name'].apply(clean_business)
# Fixed seed so the preview shows the same rows on every run.
test_data.sample(3, random_state=42)

Unnamed: 0,name,clean_name,type,industry
1252,UPM Kymmene Miramichi Inc.,upm kymmene miramichi,[Corporation],
792,Mind C.T.I. Ltd.,mind c.t.i.,[Private Limited],[Human Health & Social Work]
879,Olympic Resources Ltd.,olympic resources,[Private Limited],[Mining & Quarrying]


### Accuracy:
(Percentage of entries confirmed as a business, i.e. entries for which an entity type was detected)

In [16]:
# Percentage of rows whose 'type' is non-null, i.e. an entity type was detected.
len(test_data.loc[test_data['type'].notna()]) / len(test_data) * 100

97.64973464746019

## Panama Data

In [18]:
# Load the Panama Papers test set (all columns of the CSV).
test_panama = pandas.read_csv('tests/panama_papers.csv')
print('Entry Count:', len(test_panama))

Entry Count: 41616


In [28]:
%%time 
# Apply clean_business to every value of the 'name' column and store the three
# returned entries in the clean_name / type / industry columns.
# %%time reports how long the full pass takes.
test_panama[['clean_name', 'type', 'industry']] = test_panama['name'].apply(clean_business)

CPU times: user 42.2 s, sys: 296 ms, total: 42.5 s
Wall time: 42.5 s


In [20]:
# Preview a few cleaned rows; the fixed seed makes the preview reproducible.
test_panama.sample(3, random_state=42)

Unnamed: 0,name,clean_name,type,industry
11085,FREEPORT LATIN AMERICA S.A.,freeport latin america,[Public Limited],
10195,PLEASANT INTERNATIONAL S.A.,pleasant international,[Public Limited],[Extraterritorial organisations]
7617,ROLDAN FINANCE S.A,roldan finance,[Private Limited],[Finance and Insurance]


### Accuracy:
(Percentage of entries confirmed as a business, i.e. entries for which an entity type was detected)

In [21]:
# Percentage of rows whose 'type' is non-null, i.e. an entity type was detected.
len(test_panama.loc[test_panama['type'].notna()]) / len(test_panama) * 100

99.46174548250673

### Detected types:

In [27]:
# Count the ten most frequent detected types.
# The values are stringified so they can be used as group keys, but only on a
# temporary copy made by assign(): overwriting test_panama['type'] would turn
# missing values into the non-null string 'None', so re-running the notna()-based
# accuracy cell afterwards would wrongly report 100 %.
(
    test_panama
    .assign(type=test_panama['type'].apply(str))
    .groupby('type')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

type
['Corporation']                                         24224
['Public Limited']                                      14600
['Public Limited', 'Corporation']                         787
['Private Limited']                                       585
Unknown                                                   460
['Private Limited', 'Corporation']                        395
['Public Limited', 'Private Limited']                     228
None                                                      224
['Public Limited', 'Private Limited', 'Corporation']       20
['Non-Profit', 'Corporation']                              20
dtype: int64

### Detected industries:

In [26]:
# Count the ten most frequent detected industries.
# The values are stringified so they can be used as group keys, but only on a
# temporary copy made by assign(): the 'industry' column of test_panama keeps its
# original values (including real missing values), so the cell can be re-run
# safely and earlier cells still see the same data.
(
    test_panama
    .assign(industry=test_panama['industry'].apply(str))
    .groupby('industry')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

industry
None                                                           9483
['Finance and Insurance']                                      8647
['Extraterritorial organisations']                             4676
['Unknown']                                                    4032
['Scientific & Technical Professions']                         2956
['Real estate activities']                                     1995
['Wholesale & Retail']                                         1744
['Administrative & Support Services']                          1171
['Construction']                                                855
['Finance and Insurance', 'Extraterritorial organisations']     763
dtype: int64