In [21]:
import pandas as pd
import numpy as np

In [22]:
# Load census table C-18: population by bilingualism, trilingualism, age and sex.
df = pd.read_csv(r'DDW-C18-0000.csv')
# Skip the first five rows of the file (unwanted rows) and discard the columns
# at positions 1, 5, 6, 7, 9 and 10 in a single, non-mutating chain.
df = df.iloc[5:].drop(columns=df.columns[[1, 5, 6, 7, 9, 10]])
# Give the five columns that remain descriptive names.
df.columns = [
    'state/ut',
    'AreaName',
    'Total/Rural/Urban',
    'age-group',
    'Persons-Numberspeakingthirdlanguage',
]

In [23]:
# Keep only the rows whose 'Total/Rural/Urban' value is "Total"; that column
# is then no longer needed, so remove it.
df = df.loc[df['Total/Rural/Urban'] == "Total"].drop(columns=['Total/Rural/Urban'])
# Remove the rows labelled "Total" and "Age not stated" in 'age-group', so
# only rows for individual age groups are left.
df = df.drop(index=df.loc[df['age-group'].isin(["Total", "Age not stated"])].index)

In [24]:
# Load census table C-14: population in five-year age groups by residence and sex.
dfp = pd.read_csv(r'DDW-0000C-14.csv')
# Skip the first six rows of the file (unwanted rows) and discard every column
# position listed below; three columns are left.
dfp = dfp.iloc[6:].drop(columns=dfp.columns[[0, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13]])
# Give the remaining columns descriptive names.
dfp.columns = ['state/ut', 'age-group', 'Persons-AgeGroup']
# Remove the ROWS for the "All ages", "Age not stated" and "0-4" age groups,
# then store the head counts as floats.
dfp = dfp.drop(
    index=dfp.loc[dfp['age-group'].isin(["All ages", "Age not stated", "0-4"])].index
).astype({'Persons-AgeGroup': float})

In [25]:
# ADJUSTING AGE GROUPS INTO REQUIRED INTERVAL
# Wide age band -> the five-year bands whose head counts are added up to form it.
_WIDE_AGE_BANDS = {
    "30-49": ["30-34", "35-39", "40-44", "45-49"],
    "50-69": ["50-54", "55-59", "60-64", "65-69"],
    "70+": ["70-74", "75-79", "80+"],
}


def _collapse_age_bands(population, bands=_WIDE_AGE_BANDS):
    """Return ``population`` with one extra row per state/UT and wide age band.

    For every distinct 'state/ut' value, the 'Persons-AgeGroup' counts of the
    five-year bands listed in ``bands`` are summed and emitted as a new row
    labelled with the wide band name. All existing rows are kept.

    Parameters
    ----------
    population : pandas.DataFrame
        Frame with 'state/ut', 'age-group' and 'Persons-AgeGroup' columns.
    bands : dict
        Mapping of wide band label to the list of narrow band labels it covers.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the new wide-band rows, with the index
        renumbered from 0. If no rows are added, ``population`` is returned
        unchanged.

    Raises
    ------
    IndexError
        If a state/UT has no row for one of the required narrow bands.
    """
    # Convert once up front instead of once per state.
    counts = population.astype({'Persons-AgeGroup': float})
    new_rows = []
    for state in counts['state/ut'].unique():
        state_rows = counts[counts['state/ut'] == state]
        for wide_label, narrow_labels in bands.items():
            # .values[0] takes the first matching row for each narrow band.
            total = sum(
                state_rows.loc[state_rows['age-group'] == narrow, 'Persons-AgeGroup'].values[0]
                for narrow in narrow_labels
            )
            new_rows.append(
                {"state/ut": state, "age-group": wide_label, 'Persons-AgeGroup': total}
            )
    if not new_rows:
        return population
    # DataFrame.append was removed in pandas 2.0; collect the rows and
    # concatenate once instead of growing the frame inside the loop.
    extra = pd.DataFrame(new_rows, columns=['state/ut', 'age-group', 'Persons-AgeGroup'])
    return pd.concat([population, extra], ignore_index=True)


dfp = _collapse_age_bands(dfp)

In [26]:
# Renumber the rows from 0 and throw away the previous index labels.
dfp = dfp.reset_index(drop=True)
# Remove the ROWS for the five-year age bands listed below, so that only the
# other age-group rows stay in the frame.
dfp = dfp.drop(
    index=dfp.loc[
        dfp['age-group'].isin([
            "30-34", "35-39", "40-44", "45-49",
            "50-54", "55-59", "60-64", "65-69",
            "70-74", "75-79", "80+",
        ])
    ].index
)
# Order the rows by state/UT.
dfp = dfp.sort_values('state/ut')
# Inner-join the language counts with the population counts on state/UT and age group.
merged_df = df.merge(dfp, how='inner', on=["state/ut", "age-group"])

In [27]:
# Cast both count columns to float, then add a 'percentage' column:
# third-language speakers as a percentage of the age group's population.
merged_df = merged_df.astype(
    {'Persons-Numberspeakingthirdlanguage': float, 'Persons-AgeGroup': float}
).assign(
    percentage=lambda frame: frame['Persons-Numberspeakingthirdlanguage']
    .mul(100)
    .div(frame['Persons-AgeGroup'])
)

In [28]:
# FINDING AGE GROUP HAVING HIGHEST PERCENTAGE OF PEOPLE SPEAKING THREE LANGUAGES OR MORE
# For each state/UT, find the row label holding its largest percentage.
# sort=False keeps the states in order of first appearance, and idxmax returns
# the first row when several rows tie for the maximum.
# DataFrame.append was removed in pandas 2.0, so the winning rows are selected
# in one vectorised step instead of being appended one at a time.
_best_row_labels = (
    merged_df['percentage']
    .astype(float)
    .groupby(merged_df['state/ut'], sort=False)
    .idxmax()
)
# Pull those rows out, keep only the reported columns, and renumber from 0.
final_df = merged_df.loc[
    _best_row_labels.to_numpy(), ['state/ut', 'age-group', 'percentage']
].reset_index(drop=True)

In [29]:
# Write the result to the output CSV with a header row and without the index column.
final_df.to_csv(r'age-india.csv', header=True, index=False)