In [33]:
import pandas as pd
import numpy as np

In [34]:
# Load Census table C-18 (population by bilingualism, trilingualism, age and sex).
c18_csv = r'DDW-C18-0000.csv'
df = pd.read_csv(c18_csv)

In [35]:
# Discard the first five rows of the raw table and keep everything from
# position 5 onward (presumably preamble rows -- confirm against the CSV).
df = df.iloc[5:]

In [36]:
# Drop the columns at positions 1, 6, 7, 9 and 10, which this analysis does
# not use. The result is reassigned instead of using `inplace=True`: `df` was
# produced by slicing, and mutating such a frame in place can trigger
# pandas' SettingWithCopyWarning.
unwanted_columns = df.columns[[1, 6, 7, 9, 10]]
df = df.drop(columns=unwanted_columns)

In [37]:
# Give the six remaining columns readable names (assigned by position).
c18_column_names = [
    'Statecode',
    'AreaName',
    'Total/Rural/Urban',
    'Age-group',
    'Persons-Numberspeakingsecondlanguage',
    'Persons-Numberspeakingthirdlanguage',
]
df.columns = c18_column_names

In [38]:
# Keep only rows where both 'Total/Rural/Urban' and 'Age-group' equal 'Total',
# then discard those two columns, which are constant after filtering.
is_total_area = df['Total/Rural/Urban'] == 'Total'
is_total_age = df['Age-group'] == 'Total'
df = df.loc[is_total_area & is_total_age].drop(columns=['Total/Rural/Urban', 'Age-group'])

In [60]:
# Preview the first five rows of the cleaned language table.
df.head(n=5)

Unnamed: 0,Statecode,AreaName,Persons-Numberspeakingsecondlanguage,Persons-Numberspeakingthirdlanguage
5,0,INDIA,314988770,86009580
35,1,JAMMU & KASHMIR,6183190,2096220
65,2,HIMACHAL PRADESH,1242705,347286
95,3,PUNJAB,13035224,7829913
125,4,CHANDIGARH,579920,321979


In [40]:
# Load the 2011 India census details (state/district level).
census_csv = r'DDW_PCA0000_2011_Indiastatedist.csv'
dfp = pd.read_csv(census_csv)

In [41]:
# Keep only the seven census columns the rest of the notebook refers to.
# They are selected by name instead of by position (columns 6-12 of the raw
# file) so the cell fails loudly if the CSV layout changes and can be
# re-run without slicing the frame a second time.
census_columns = ['Level', 'Name', 'TRU', 'No_HH', 'TOT_P', 'TOT_M', 'TOT_F']
dfp = dfp.loc[:, census_columns]

In [42]:
# Restrict to rows whose 'TRU' value is 'Total', then drop the columns that
# are not needed any further.
is_total_row = dfp['TRU'] == 'Total'
dfp = dfp.loc[is_total_row].drop(columns=['TRU', 'No_HH', 'TOT_M', 'TOT_F'])

In [43]:
# State-level rows only; 'Level' is constant afterwards, so it is dropped.
is_state = dfp['Level'] == 'STATE'
dfp_state = dfp.loc[is_state].drop(columns=['Level'])

In [44]:
# Pull out the national row and set its name to "INDIA", the label the
# language table uses for the country-level row.
is_india = dfp['Level'] == 'India'
dfp_india = dfp.loc[is_india].drop(columns=['Level']).assign(Name="INDIA")

In [45]:
# Stack the national row on top of the state rows and renumber the index.
dfp_total = pd.concat((dfp_india, dfp_state), axis=0, ignore_index=True)

In [46]:
# Rename 'Name' to 'AreaName' so both tables share the same key column.
dfp_total = dfp_total.rename(columns={'Name': 'AreaName'})

In [47]:
# Inner-join the language counts with the census figures on the shared
# 'AreaName' key; areas missing from either table are dropped.
df_overall = df.merge(dfp_total, how='inner', on='AreaName')

In [61]:
# Preview the first five rows of the merged table.
df_overall.head(n=5)

Unnamed: 0,state-code,percent-one,percent-two,percent-three
0,0,73.986251,18.910538,7.103211
1,1,50.697384,32.588084,16.714533
2,2,81.896911,13.044005,5.059084
3,3,53.01494,18.762382,28.222678
4,4,45.054716,24.43896,30.506324


In [49]:
# Cast the three count columns to integers before doing arithmetic on them.
count_columns = (
    'TOT_P',
    'Persons-Numberspeakingsecondlanguage',
    'Persons-Numberspeakingthirdlanguage',
)
df_overall = df_overall.astype({column: int for column in count_columns})

In [50]:
# Percentage of people speaking only one language:
# (TOT_P - second-language speakers) / TOT_P * 100.
total_persons = df_overall['TOT_P']
second_language = df_overall['Persons-Numberspeakingsecondlanguage']
df_overall['onelanguage'] = (total_persons - second_language).div(total_persons).mul(100)

In [51]:
# Percentage of people speaking exactly two languages:
# (second-language speakers - third-language speakers) / TOT_P * 100.
second_language = df_overall['Persons-Numberspeakingsecondlanguage']
third_language = df_overall['Persons-Numberspeakingthirdlanguage']
df_overall['twolanguages'] = (second_language - third_language).div(df_overall['TOT_P']).mul(100)

In [52]:
# Percentage of people speaking a third language (i.e. three or more
# languages): third-language speakers / TOT_P * 100.
third_language = df_overall['Persons-Numberspeakingthirdlanguage']
df_overall['threelanguages'] = third_language.div(df_overall['TOT_P']).mul(100)

In [53]:
# Drop the raw inputs now that the percentages are computed; only 'AreaName'
# and the three percentage columns remain.
raw_columns = [
    'Statecode',
    'Persons-Numberspeakingsecondlanguage',
    'Persons-Numberspeakingthirdlanguage',
    'TOT_P',
]
df_overall = df_overall.drop(columns=raw_columns)

In [54]:
# Display the per-area table: 'AreaName' plus the three percentage columns
# ('onelanguage', 'twolanguages', 'threelanguages').
df_overall

Unnamed: 0,AreaName,onelanguage,twolanguages,threelanguages
0,INDIA,73.986251,18.910538,7.103211
1,JAMMU & KASHMIR,50.697384,32.588084,16.714533
2,HIMACHAL PRADESH,81.896911,13.044005,5.059084
3,PUNJAB,53.01494,18.762382,28.222678
4,CHANDIGARH,45.054716,24.43896,30.506324
5,UTTARAKHAND,82.572823,15.565026,1.862151
6,HARYANA,77.802617,17.580012,4.617371
7,NCT OF DELHI,59.392197,32.517275,8.090528
8,RAJASTHAN,89.097948,9.433381,1.468671
9,UTTAR PRADESH,88.552454,10.153499,1.294046


In [55]:
# Area names listed in code order: the position of each name is its numeric
# code, with INDIA at 0 and the states/union territories at 1-35.
_area_names_in_code_order = (
    'INDIA',
    'JAMMU & KASHMIR',
    'HIMACHAL PRADESH',
    'PUNJAB',
    'CHANDIGARH',
    'UTTARAKHAND',
    'HARYANA',
    'NCT OF DELHI',
    'RAJASTHAN',
    'UTTAR PRADESH',
    'BIHAR',
    'SIKKIM',
    'ARUNACHAL PRADESH',
    'NAGALAND',
    'MANIPUR',
    'MIZORAM',
    'TRIPURA',
    'MEGHALAYA',
    'ASSAM',
    'WEST BENGAL',
    'JHARKHAND',
    'ODISHA',
    'CHHATTISGARH',
    'MADHYA PRADESH',
    'GUJARAT',
    'DAMAN & DIU',
    'DADRA & NAGAR HAVELI',
    'MAHARASHTRA',
    'ANDHRA PRADESH',
    'KARNATAKA',
    'GOA',
    'LAKSHADWEEP',
    'KERALA',
    'TAMIL NADU',
    'PUDUCHERRY',
    'ANDAMAN & NICOBAR ISLANDS',
)

# Mapping from area name to its zero-padded two-digit code string ('00'..'35').
state_codes = {
    area_name: format(code, '02d')
    for code, area_name in enumerate(_area_names_in_code_order)
}

In [56]:
# Swap each area name for its two-digit code; names that are not keys of
# `state_codes` are left unchanged by `replace`.
area_codes = df_overall['AreaName'].replace(state_codes)
df_overall['AreaName'] = area_codes

In [57]:
# Order the rows by the 'AreaName' column in ascending order.
df_overall = df_overall.sort_values('AreaName')

In [58]:
# Final column names for the exported table. A flat list is required here:
# a nested list ([[...]]) makes pandas build a one-level MultiIndex for the
# columns instead of a plain Index, which changes how columns are selected.
df_overall.columns = ['state-code', 'percent-one', 'percent-two', 'percent-three']

In [59]:
# Write the final table to CSV with a header row and without the row index.
# `index=False` is the documented way to omit the index (`index=None` only
# worked because None is falsy).
df_overall.to_csv(r'percent-india.csv', index=False, header=True)