In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import plotly.express as px
import plotly.graph_objs as go

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
# URL of the GeoJSON file passed to the choropleth map further down.
indianStates = "https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson"

# Load the company registration CSV into a DataFrame and preview the first rows.
data = pd.read_csv(
    "/kaggle/input/all-indian-companies-registration-data-1900-2019/registered_companies.csv"
)
data.head()

In [None]:
# Report the dataset dimensions (rows x columns).
shape_summary = f"There are {data.shape[0]} rows and {data.shape[1]} columns in the data."
print(shape_summary)

In [None]:
# Summarise the DataFrame: column names, non-null counts and dtypes.
data.info()

# Missing values pattern

#### We have an elegant library, [missingno](https://github.com/ResidentMario/missingno), to visualize the missing data.

Each white line in the matrix below marks a missing (NA/None) value in that column.

In [None]:
# Nullity matrix for the whole dataset; white lines mark missing values.
msno.matrix(data)

The bar graph makes it easier to see the exact number of missing values in each column of our dataset.

In [None]:
# Per-column bar chart summarising how complete each column is.
msno.bar(data)

# Number of companies state-wise

In [None]:
# Count the registered companies in each state and expose the count as a column.
state_group_sizes = data.groupby("REGISTERED_STATE").size()
companies_state = state_group_sizes.reset_index(name='NO_OF_COMPANIES')

In [None]:
# State names as they appear in REGISTERED_STATE -> names used by the GeoJSON file.
# NOTE(review): the "Dadra and Nagra Haveli" key presumably mirrors the spelling
# used in the dataset — confirm before "correcting" it.
rename_states = dict([
    ("Andaman and Nicobar Islands", "Andaman & Nicobar"),
    ("Jammu and Kashmir", "Jammu & Kashmir"),
    ("Orissa", "Odisha"),
    ("Chattisgarh", "Chhattisgarh"),
    ("Dadra and Nagra Haveli", "Dadra and Nagar Haveli and Daman and Diu"),
    ("Pondicherry", "Puducherry"),
    ("Uttaranchal", "Uttarakhand"),
])

# Swap the dataset's state names for the spellings the GeoJSON file expects.
renamed_state_column = companies_state["REGISTERED_STATE"].replace(rename_states)
companies_state["REGISTERED_STATE"] = renamed_state_column

Let's merge **Dadra and Nagar Haveli** and **Daman and Diu**.

In [None]:
# Fold the standalone "Daman and Diu" count into the combined
# "Dadra and Nagar Haveli and Daman and Diu" entry used by the GeoJSON file.
is_daman_diu = companies_state['REGISTERED_STATE'] == "Daman and Diu"
is_merged_ut = companies_state['REGISTERED_STATE'] == "Dadra and Nagar Haveli and Daman and Diu"

# .sum() yields a plain scalar: it avoids the deprecated int(Series) conversion
# and does not raise when a selection happens to be empty.
daman_diu_companies = companies_state.loc[is_daman_diu, 'NO_OF_COMPANIES'].sum()
dadra_nagar_companies = companies_state.loc[is_merged_ut, 'NO_OF_COMPANIES'].sum()

# Merge only when both rows are present. After the merge the standalone row is
# dropped, so re-running this cell is a no-op instead of double-counting.
if is_merged_ut.any() and is_daman_diu.any():
    companies_state.loc[is_merged_ut, 'NO_OF_COMPANIES'] = dadra_nagar_companies + daman_diu_companies
    companies_state = companies_state.loc[~is_daman_diu].reset_index(drop=True)

### Adding Ladakh to complete the map of beautiful India 🇮🇳

In [None]:
# Placeholder row for Ladakh with zero companies, so the state is still drawn on the map.
# An integer 0 keeps the NO_OF_COMPANIES column integer-typed after concatenation.
ladakh = pd.DataFrame([["Ladakh", 0]], columns=companies_state.columns)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
# The membership check stops a duplicate Ladakh row from being added when the cell
# is re-run (or if the data already contains Ladakh).
if "Ladakh" not in companies_state['REGISTERED_STATE'].values:
    companies_state = pd.concat([companies_state, ladakh], ignore_index=True)

In [None]:
# Choropleth of company counts per state; rows are matched to GeoJSON features
# through each feature's `properties.ST_NM` value.
choropleth_options = dict(
    geojson=indianStates,
    featureidkey='properties.ST_NM',
    locations='REGISTERED_STATE',
    color='NO_OF_COMPANIES',
    color_continuous_scale="Oranges",
    title='Number of companies state-wise',
)
fig = px.choropleth(companies_state, **choropleth_options)

# Fit the view to the plotted locations and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

# Current status of companies

In [None]:
# Count the companies under each status code and expose the count as a column.
status_group_sizes = data.groupby("COMPANY_STATUS").size()
company_current_status = status_group_sizes.reset_index(name='NO_OF_COMPANIES')

In [None]:
# Company status code -> human-readable description.
# (Source: http://www.mca.gov.in/MinistryV2/)
status_description = {
    "ACTV": "Active",
    "NAEF": "Not available for e-filing",
    "ULQD": "Under liquidation",
    "AMAL": "Amalgamated",
    "STOF": "Strike off",
    "DISD": "Dissolved",
    "CLLD": "Converted to LLP and Dissolved",
    "UPSO": "Under process of Striking Off",
    "CLLP": "Converted to LLP",
    "LIQD": "Liquidated",
    "DRMT": "Dormant",
    "MLIQ": "Vanished",
    "D455": "Dormant under section 455",
}

Let's map the company's status code to their description.

In [None]:
# Replace each status code with its description; codes missing from the mapping are left as-is.
described_status_column = company_current_status["COMPANY_STATUS"].replace(status_description)
company_current_status["COMPANY_STATUS"] = described_status_column

In [None]:
# Donut chart showing each status's share of all companies.
fig = px.pie(
    company_current_status,
    names='COMPANY_STATUS',
    values='NO_OF_COMPANIES',
    title='Companies Current Status',
    hole=0.2,
)
fig.show()

# Number of registrations over the years

In [None]:
# Keep only the registration-date column and discard rows where it is missing.
# Selecting the column first avoids copying the entire dataset, and the
# non-inplace dropna() returns a fresh frame, so later column assignments do not
# act on a slice of `data` (no SettingWithCopyWarning).
number_of_registration = data[['DATE_OF_REGISTRATION']].dropna()

In [None]:
# Parse the registration dates; values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): the date format is inferred by pandas — confirm that it matches the
# dataset, otherwise valid dates may be coerced to NaT and silently left out of the counts.
registration_dates = pd.to_datetime(number_of_registration['DATE_OF_REGISTRATION'], errors='coerce')

# Count registrations per calendar year; rows without a year are excluded by groupby.
yearly_group_sizes = (
    number_of_registration
    .assign(YEAR_OF_REGISTRATION=registration_dates.dt.year)
    .groupby("YEAR_OF_REGISTRATION")
    .size()
)
number_of_registration = yearly_group_sizes.reset_index(name='NO_OF_COMPANIES')

In [None]:
# Bar chart of the number of registrations in each year.
fig = px.bar(
    number_of_registration,
    x='YEAR_OF_REGISTRATION',
    y='NO_OF_COMPANIES',
    title='Number of registrations over the years',
)
fig.show()

We can see a huge dip in registrations in 2001. [Read more here](https://en.wikipedia.org/wiki/2001_in_India) about what happened that year.

# Top 20 companies with the highest Authorized Capital and their Paid-up Capital (INR)

In [None]:
# Order companies by authorized capital, largest first, and keep the first 20 rows.
companies_by_auth_capital = data.sort_values(by='AUTHORIZED_CAP', ascending=False)
highest_auth_capital_companies = companies_by_auth_capital.iloc[:20]

In [None]:
# Grouped horizontal bars: one trace for authorized capital and one for paid-up
# capital, both plotted against the company name.
company_names = highest_auth_capital_companies['COMPANY_NAME']
capital_traces = [
    ('AUTHORIZED_CAP', "Authorized Capital (INR)"),
    ('PAIDUP_CAPITAL', "Paid Up Capital (INR)"),
]

fig = go.Figure()
for capital_column, legend_label in capital_traces:
    capital_values = highest_auth_capital_companies[capital_column]
    fig.add_trace(
        go.Bar(
            y=company_names,
            x=capital_values,
            text=capital_values,  # value label drawn next to each bar
            name=legend_label,
            orientation='h',
        )
    )

# Fixed-size canvas with a small font so the labels of all 20 companies fit.
layout_options = dict(
    autosize=False,
    width=900,
    height=800,
    barmode='group',
    bargap=0.1,
    font=dict(size=8),
)
fig.update_layout(**layout_options)
fig.update_traces(textposition='outside')
fig.show()

No wonder Reliance dominates the market.

![](https://images.indianexpress.com/2016/09/giphy.gif)

# Principal Business Activity of a company as per CIN

In [None]:
# Count the companies for each principal business activity.
activity_group_sizes = data.groupby("PRINCIPAL_BUSINESS_ACTIVITY_AS_PER_CIN").size()
company_business_activity = activity_group_sizes.reset_index(name='NO_OF_COMPANIES')

# Donut chart of activity shares; the legend is hidden and only the percentage
# is drawn outside each slice, so activity names appear on hover.
fig = px.pie(
    company_business_activity,
    names='PRINCIPAL_BUSINESS_ACTIVITY_AS_PER_CIN',
    values='NO_OF_COMPANIES',
    title='Principal Business Activity of a company as per CIN (Hover to see the data)',
    hole=0.2,
)
fig.update_layout(showlegend=False)
fig.update_traces(textposition='outside', textinfo='percent')
fig.show()

That is all for today 😊

Feel free to explore the dataset further.

![](https://media1.tenor.com/images/78bad6d059623c15d523c9055d6989b3/tenor.gif?itemid=16261518)