In [6]:
import pandas as pd
import numpy as np

In [7]:
# Census table C-19: population by bilingualism, trilingualism, educational
# level and sex. The first five and the last three rows of the file are
# discarded (not data rows, per the original author's notes).
df = pd.read_csv(r'DDW-C19-0000.csv').iloc[5:-3]
# Remove the columns at these positions; only five columns are kept.
df = df.drop(columns=df.columns[[1, 5, 6, 7, 9, 10]])
# Give the five remaining columns readable names.
df.columns = [
    'state/ut',
    'AreaName',
    'Total/Rural/Urban',
    'literacy-group',
    'Persons-Numberspeakingthirdlanguage',
]

In [8]:
# Keep rows whose area type is "Total" and whose literacy group is anything
# other than the "Total" aggregate; the area-type column is constant after
# that, so it is dropped.
df = df[(df['Total/Rural/Urban'] == "Total") & (df['literacy-group'] != "Total")]
df = df.drop(columns=['Total/Rural/Urban'])

In [9]:
# Census table C-8: educational level by age and sex for population age 7
# and above (2011). The first six rows of the file are discarded (not data
# rows, per the original author's notes).
dfp = pd.read_csv(r'DDW-0000C-08.csv').iloc[6:]
# Remove the columns at these positions; fifteen columns are kept.
dfp = dfp.drop(
    columns=dfp.columns[
        [
            0, 2, 6, 7, 8, 10, 11, 13, 14, 16, 17, 19, 20, 22, 23, 25,
            26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 42, 43, 44, 45,
        ]
    ]
)
# Give the fifteen remaining columns readable names.
dfp.columns = [
    'state/ut',
    'state-name',
    'Total/Rural/Urban',
    'literacy-group',
    'Illiterate',
    'Literate',
    'Literate_wel',
    'Literate but below primary',
    'Primary but below middle',
    'Middle but below matric/secondary',
    'Matric/Secondary but below graduate',
    'Higher_secondary',
    'Non_technical_diploma',
    'Technical_diploma',
    'Graduate and above',
]

In [10]:
# Keep only the "Total" area rows of the "All ages" bracket, then drop both
# selector columns since they are constant afterwards.
dfp = dfp[(dfp['Total/Rural/Urban'] == "Total") & (dfp['literacy-group'] == "All ages")]
dfp = dfp.drop(columns=['Total/Rural/Urban', 'literacy-group'])

# Every count column is cast to float so the sums below are numeric.
dfp = dfp.astype(
    dict.fromkeys(
        [
            'Illiterate',
            'Literate',
            'Literate_wel',
            'Literate but below primary',
            'Primary but below middle',
            'Middle but below matric/secondary',
            'Matric/Secondary but below graduate',
            'Higher_secondary',
            'Non_technical_diploma',
            'Technical_diploma',
            'Graduate and above',
        ],
        float,
    )
)

# Fold the finer-grained columns into the broader groups:
#   * 'Literate_wel' is added to 'Literate but below primary' (not to
#     'Literate', which is discarded outright).
#     NOTE(review): 'Literate_wel' presumably means "literate without
#     educational level" -- confirm against the C-8 column layout.
#   * higher secondary and both diploma columns are added to
#     'Matric/Secondary but below graduate'.
dfp = dfp.assign(
    **{
        'Literate but below primary': (
            dfp['Literate but below primary'] + dfp['Literate_wel']
        ),
        'Matric/Secondary but below graduate': (
            dfp['Matric/Secondary but below graduate']
            + dfp['Higher_secondary']
            + dfp['Non_technical_diploma']
            + dfp['Technical_diploma']
        ),
    }
).drop(
    columns=[
        'Literate_wel',
        'Literate',
        'Higher_secondary',
        'Non_technical_diploma',
        'Technical_diploma',
    ]
)

In [11]:
# Reshape wide -> long: each education-level column becomes a row labelled in
# 'literacy-group' with its count in 'Total persons', so the frame can be
# joined on ('state/ut', 'literacy-group').
dfp = pd.melt(
    dfp,
    id_vars=["state/ut", "state-name"],
    var_name="literacy-group",
    value_name="Total persons",
)

In [12]:
# Inner-join the two frames on state code and literacy group; rows present
# in only one of them are left out.
merged_df = pd.merge(df, dfp, how="inner", on=["state/ut", "literacy-group"])

In [13]:
# Cast both count columns to float, then add 'percentage': third-language
# speakers as a share (in percent) of 'Total persons'.
merged_df = merged_df.astype(
    dict.fromkeys(['Persons-Numberspeakingthirdlanguage', 'Total persons'], float)
).assign(
    percentage=lambda frame: (
        frame['Persons-Numberspeakingthirdlanguage'] * 100 / frame['Total persons']
    )
)

In [14]:
# FINDING LITERACY GROUP HAVING HIGHEST PERCENTAGE OF PEOPLE SPEAKING THREE LANGUAGES OR MORE
# For every state/ut keep the single merged_df row whose 'percentage' is the
# largest. This replaces a row-by-row DataFrame.append loop: that method was
# removed in pandas 2.0 (AttributeError on current versions) and it re-copied
# the growing frame on every iteration.
# sort=False keeps the states in order of first appearance, exactly as the
# loop over .unique() did; idxmax returns the first row label on ties.
top_rows = (
    merged_df['percentage']
    .astype(float)
    .groupby(merged_df['state/ut'], sort=False)
    .idxmax()
)
# The label lookup relies on merged_df having unique row labels, as the
# earlier per-state .loc[idxmax()] lookup did. The result gets a fresh
# 0..n-1 index instead of every row being labelled 0.
final_df = (
    merged_df.loc[top_rows, ['state/ut', 'literacy-group', 'percentage']]
    .astype({'percentage': float})
    .reset_index(drop=True)
)

In [15]:
# Write the per-state result to the output CSV with a header row and without
# the row index.
final_df.to_csv(path_or_buf='literacy-india.csv', header=True, index=False)