In [51]:
import pandas as pd
import numpy as np

In [52]:
# Read table C-18 (population by bilingualism, trilingualism, age and sex)
# and skip the first five rows of the file.
df = pd.read_csv(r'DDW-C18-0000.csv').iloc[5:]
# Discard the columns at positions 1, 5, 6, 7 and 8; six columns remain.
df = df.drop(columns=df.columns[[1, 5, 6, 7, 8]])
# Give the six remaining columns readable names.
df.columns = [
    'state/ut',
    'AreaName',
    'Total/Rural/Urban',
    'age-group',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
]

In [53]:
# Keep only the rows whose area type is "Total"; the area-type column is then
# constant, so it is removed.
df = df.loc[df['Total/Rural/Urban'] == "Total"].drop(columns=['Total/Rural/Urban'])

In [54]:
# Remove the rows labelled "Total" and "Age not stated" in the age-group column.
df = df[~df['age-group'].isin(["Total", "Age not stated"])]

In [55]:
# Preview the first 20 rows of the cleaned frame.
df.head(20)

Unnamed: 0,state/ut,AreaName,age-group,Males-Numberspeakingthirdlanguage,Females-Numberspeakingthirdlanguage
6,0,INDIA,5-9,978151,865957
7,0,INDIA,10-14,3831131,3423204
8,0,INDIA,15-19,6792766,5833951
9,0,INDIA,20-24,7067614,5766720
10,0,INDIA,25-29,6144045,4713126
11,0,INDIA,30-49,16824473,10642411
12,0,INDIA,50-69,7385967,3530394
13,0,INDIA,70+,1362695,591844
36,1,JAMMU & KASHMIR,5-9,39782,33807
37,1,JAMMU & KASHMIR,10-14,121618,100968


In [56]:
# Read table C-14 (population in five-year age groups by residence and sex)
# and skip the first six rows of the file.
dfp = pd.read_csv(r'DDW-0000C-14.csv').iloc[6:]
# Discard ten columns by position; the four that remain are named just below.
dfp = dfp.drop(columns=dfp.columns[[0, 2, 3, 5, 8, 9, 10, 11, 12, 13]])
dfp.columns = ['state/ut', 'age-group', 'Males-AgeGroup', 'Females-AgeGroup']
# Remove the "All ages", "Age not stated" and "0-4" rows.
dfp = dfp[~dfp['age-group'].isin(["All ages", "Age not stated", "0-4"])]
# The two population columns are cast to float.
dfp = dfp.astype({'Males-AgeGroup': float, 'Females-AgeGroup': float})

In [57]:
# NOTE: the population frame `dfp` is re-bucketed into the wider age bands
# only once, here; later parts of the notebook reuse `dfp` without
# recomputing it.
#
# For every state/UT, three rows are added ("30-49", "50-69", "70+"), each being
# the sum of the five-year groups listed for it below. The rows are collected in
# a list and concatenated once, because DataFrame.append was removed in
# pandas 2.0 (and grew the frame quadratically before that).
age_band_parts = {
    "30-49": ["30-34", "35-39", "40-44", "45-49"],
    "50-69": ["50-54", "55-59", "60-64", "65-69"],
    "70+": ["70-74", "75-79", "80+"],
}
count_columns = ['Males-AgeGroup', 'Females-AgeGroup']

band_records = []
for i in dfp['state/ut'].unique():
    # Male/female counts of this state only, addressable by age-group label.
    # If a label repeats, only its first row is counted.
    state_counts = (
        dfp.loc[dfp['state/ut'] == i]
        .drop_duplicates(subset='age-group')
        .set_index('age-group')[count_columns]
        .astype(float)
    )
    for band, parts in age_band_parts.items():
        # Fail with a readable message instead of silently summing a partial band.
        missing = [part for part in parts if part not in state_counts.index]
        if missing:
            raise KeyError(f"state/ut {i!r} has no rows for age groups {missing}")
        totals = state_counts.loc[parts].sum()
        band_records.append({
            "state/ut": i,
            "age-group": band,
            'Males-AgeGroup': totals['Males-AgeGroup'],
            'Females-AgeGroup': totals['Females-AgeGroup'],
        })

# Nothing to add when `dfp` has no rows at all.
if band_records:
    dfp = pd.concat([dfp, pd.DataFrame(band_records)])

In [58]:
# Renumber the rows 0..n-1 and discard the old index labels.
dfp = dfp.reset_index(drop=True)
# Drop the five-year age-group rows, then order the frame by state/UT.
dfp = dfp[~dfp['age-group'].isin([
    "30-34", "35-39", "40-44", "45-49",
    "50-54", "55-59", "60-64", "65-69",
    "70-74", "75-79", "80+",
])]
dfp = dfp.sort_values(by='state/ut')
# Attach the population counts to the language counts; only (state/ut, age-group)
# pairs present in both frames survive the inner join.
merged_df = df.merge(dfp, how='inner', on=["state/ut", "age-group"])

In [59]:
# Cast the four count columns to float before dividing.
merged_df = merged_df.astype(dict.fromkeys(
    [
        'Males-Numberspeakingthirdlanguage',
        'Females-Numberspeakingthirdlanguage',
        'Males-AgeGroup',
        'Females-AgeGroup',
    ],
    float,
))
# Ratio = third-language speakers / population of the same sex and age group.
merged_df['males-ratio-of-3'] = merged_df['Males-Numberspeakingthirdlanguage'].div(merged_df['Males-AgeGroup'])
merged_df['females-ratio-of-3'] = merged_df['Females-Numberspeakingthirdlanguage'].div(merged_df['Females-AgeGroup'])

In [60]:
# For each state/UT, find -- separately for males and females -- the age group
# with the highest ratio of population that can speak 3 or more languages.
#
# One record per state is collected and the result frame is built in a single
# step: DataFrame.append was removed in pandas 2.0.
ratio_frame = merged_df.astype({'males-ratio-of-3': float, 'females-ratio-of-3': float})
state_records = []
# sort=False keeps the states in their order of first appearance.
for state_code, state_rows in ratio_frame.groupby('state/ut', sort=False):
    top_male_row = state_rows.loc[state_rows['males-ratio-of-3'].idxmax()]
    top_female_row = state_rows.loc[state_rows['females-ratio-of-3'].idxmax()]
    state_records.append({
        "state/ut": state_code,
        "age-group-males": top_male_row['age-group'],
        'ratio-males': top_male_row['males-ratio-of-3'],
        "age-group-females": top_female_row['age-group'],
        'ratio-females': top_female_row['females-ratio-of-3'],
    })
# Passing `columns` fixes the column order and keeps the header when there are no records.
final_df = pd.DataFrame(
    state_records,
    columns=['state/ut', 'age-group-males', 'ratio-males', 'age-group-females', 'ratio-females'],
)

In [61]:
# Write the result to the output CSV, with a header row and without the index.
final_df.to_csv(r'age-gender-a.csv', index=False)

In [62]:
# Read table C-18 (population by bilingualism, trilingualism, age and sex)
# and skip the first five rows of the file.
df = pd.read_csv(r'DDW-C18-0000.csv').iloc[5:]
# Discard the columns at positions 1, 5 and 8; eight columns remain.
df = df.drop(columns=df.columns[[1, 5, 8]])
df.columns = [
    'state/ut',
    'AreaName',
    'Total/Rural/Urban',
    'age-group',
    'Males-Numberspeakingsecondlanguage',
    'Females-Numberspeakingsecondlanguage',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
]
# Cast the four count columns to float so they can be subtracted.
df = df.astype(dict.fromkeys(
    [
        'Males-Numberspeakingsecondlanguage',
        'Males-Numberspeakingthirdlanguage',
        'Females-Numberspeakingsecondlanguage',
        'Females-Numberspeakingthirdlanguage',
    ],
    float,
))
# Take the third-language counts out of the second-language counts; the
# "second language" columns then hold the difference of the two.
df['Males-Numberspeakingsecondlanguage'] = df['Males-Numberspeakingsecondlanguage'].sub(df['Males-Numberspeakingthirdlanguage'])
df['Females-Numberspeakingsecondlanguage'] = df['Females-Numberspeakingsecondlanguage'].sub(df['Females-Numberspeakingthirdlanguage'])
# The third-language columns (positions 6 and 7) are no longer needed.
df = df.drop(columns=['Males-Numberspeakingthirdlanguage', 'Females-Numberspeakingthirdlanguage'])

In [63]:
# Keep only the rows whose area type is "Total"; the area-type column is then
# constant, so it is removed.
df = df.loc[df['Total/Rural/Urban'] == "Total"].drop(columns=['Total/Rural/Urban'])

In [64]:
# Remove the rows labelled "Total" and "Age not stated" in the age-group column.
df = df[~df['age-group'].isin(["Total", "Age not stated"])]

In [65]:
# Preview the first 20 rows of the cleaned frame.
df.head(20)

Unnamed: 0,state/ut,AreaName,age-group,Males-Numberspeakingsecondlanguage,Females-Numberspeakingsecondlanguage
6,0,INDIA,5-9,7188692.0,6616392.0
7,0,INDIA,10-14,14302292.0,12931865.0
8,0,INDIA,15-19,15958142.0,13839740.0
9,0,INDIA,20-24,15319080.0,13190992.0
10,0,INDIA,25-29,13550988.0,11515886.0
11,0,INDIA,30-49,39041847.0,29935139.0
12,0,INDIA,50-69,16920768.0,11892653.0
13,0,INDIA,70+,3450976.0,2556832.0
36,1,JAMMU & KASHMIR,5-9,194915.0,168039.0
37,1,JAMMU & KASHMIR,10-14,365740.0,317044.0


In [66]:
# Attach the population counts (dfp) to the language counts; only
# (state/ut, age-group) pairs present in both frames survive the inner join.
merged_df = df.merge(dfp, how='inner', on=["state/ut", "age-group"])
# Cast the four count columns to float before dividing.
merged_df = merged_df.astype(dict.fromkeys(
    [
        'Males-Numberspeakingsecondlanguage',
        'Females-Numberspeakingsecondlanguage',
        'Males-AgeGroup',
        'Females-AgeGroup',
    ],
    float,
))
# Ratio = second-language count / population of the same sex and age group.
merged_df['males-ratio-of-2'] = merged_df['Males-Numberspeakingsecondlanguage'].div(merged_df['Males-AgeGroup'])
merged_df['females-ratio-of-2'] = merged_df['Females-Numberspeakingsecondlanguage'].div(merged_df['Females-AgeGroup'])

In [67]:
# For each state/UT, find -- separately for males and females -- the age group
# with the highest ratio of population that can speak exactly 2 languages.
#
# One record per state is collected and the result frame is built in a single
# step: DataFrame.append was removed in pandas 2.0.
ratio_frame = merged_df.astype({'males-ratio-of-2': float, 'females-ratio-of-2': float})
state_records = []
# sort=False keeps the states in their order of first appearance.
for state_code, state_rows in ratio_frame.groupby('state/ut', sort=False):
    top_male_row = state_rows.loc[state_rows['males-ratio-of-2'].idxmax()]
    top_female_row = state_rows.loc[state_rows['females-ratio-of-2'].idxmax()]
    state_records.append({
        "state/ut": state_code,
        "age-group-males": top_male_row['age-group'],
        'ratio-males': top_male_row['males-ratio-of-2'],
        "age-group-females": top_female_row['age-group'],
        'ratio-females': top_female_row['females-ratio-of-2'],
    })
# Passing `columns` fixes the column order and keeps the header when there are no records.
final_df = pd.DataFrame(
    state_records,
    columns=['state/ut', 'age-group-males', 'ratio-males', 'age-group-females', 'ratio-females'],
)

In [68]:
# Write the result to the output CSV, with a header row and without the index.
final_df.to_csv(r'age-gender-b.csv', index=False)

In [69]:
# Read table C-18 (population by bilingualism, trilingualism, age and sex)
# and skip the first five rows of the file.
df = pd.read_csv(r'DDW-C18-0000.csv').iloc[5:]
# Discard the columns at positions 1, 5 and 8; eight columns remain.
df = df.drop(columns=df.columns[[1, 5, 8]])
df.columns = [
    'state/ut',
    'AreaName',
    'Total/Rural/Urban',
    'age-group',
    'Males-Numberspeakingsecondlanguage',
    'Females-Numberspeakingsecondlanguage',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
]
# Only the second-language counts are used here; drop the third-language
# columns (positions 6 and 7).
df = df.drop(columns=['Males-Numberspeakingthirdlanguage', 'Females-Numberspeakingthirdlanguage'])

In [70]:
# Keep only the rows whose area type is "Total"; the area-type column is then
# constant, so it is removed.
df = df.loc[df['Total/Rural/Urban'] == "Total"].drop(columns=['Total/Rural/Urban'])

In [71]:
# Remove the rows labelled "Total" and "Age not stated" in the age-group column.
df = df[~df['age-group'].isin(["Total", "Age not stated"])]

In [72]:
# Preview the first 20 rows of the cleaned frame.
df.head(20)

Unnamed: 0,state/ut,AreaName,age-group,Males-Numberspeakingsecondlanguage,Females-Numberspeakingsecondlanguage
6,0,INDIA,5-9,8166843,7482349
7,0,INDIA,10-14,18133423,16355069
8,0,INDIA,15-19,22750908,19673691
9,0,INDIA,20-24,22386694,18957712
10,0,INDIA,25-29,19695033,16229012
11,0,INDIA,30-49,55866320,40577550
12,0,INDIA,50-69,24306735,15423047
13,0,INDIA,70+,4813671,3148676
36,1,JAMMU & KASHMIR,5-9,234697,201846
37,1,JAMMU & KASHMIR,10-14,487358,418012


In [73]:
# Attach the population counts (dfp) to the language counts; only
# (state/ut, age-group) pairs present in both frames survive the inner join.
merged_df = df.merge(dfp, how='inner', on=["state/ut", "age-group"])
# Cast the four count columns to float before doing arithmetic on them.
merged_df = merged_df.astype(dict.fromkeys(
    [
        'Males-Numberspeakingsecondlanguage',
        'Females-Numberspeakingsecondlanguage',
        'Males-AgeGroup',
        'Females-AgeGroup',
    ],
    float,
))
# People speaking exactly one language = population - second-language speakers.
merged_df['Males-Numberspeakingfirstlanguage'] = merged_df['Males-AgeGroup'].sub(merged_df['Males-Numberspeakingsecondlanguage'])
merged_df['Females-Numberspeakingfirstlanguage'] = merged_df['Females-AgeGroup'].sub(merged_df['Females-Numberspeakingsecondlanguage'])
# The second-language columns have served their purpose.
merged_df = merged_df.drop(columns=['Males-Numberspeakingsecondlanguage', 'Females-Numberspeakingsecondlanguage'])
# Ratio = single-language speakers / population of the same sex and age group.
merged_df['males-ratio-of-1'] = merged_df['Males-Numberspeakingfirstlanguage'].div(merged_df['Males-AgeGroup'])
merged_df['females-ratio-of-1'] = merged_df['Females-Numberspeakingfirstlanguage'].div(merged_df['Females-AgeGroup'])

In [74]:
# For each state/UT, find -- separately for males and females -- the age group
# with the highest ratio of population that can speak exactly one language.
#
# One record per state is collected and the result frame is built in a single
# step: DataFrame.append was removed in pandas 2.0.
ratio_frame = merged_df.astype({'males-ratio-of-1': float, 'females-ratio-of-1': float})
state_records = []
# sort=False keeps the states in their order of first appearance.
for state_code, state_rows in ratio_frame.groupby('state/ut', sort=False):
    top_male_row = state_rows.loc[state_rows['males-ratio-of-1'].idxmax()]
    top_female_row = state_rows.loc[state_rows['females-ratio-of-1'].idxmax()]
    state_records.append({
        "state/ut": state_code,
        "age-group-males": top_male_row['age-group'],
        'ratio-males': top_male_row['males-ratio-of-1'],
        "age-group-females": top_female_row['age-group'],
        'ratio-females': top_female_row['females-ratio-of-1'],
    })
# Passing `columns` fixes the column order and keeps the header when there are no records.
final_df = pd.DataFrame(
    state_records,
    columns=['state/ut', 'age-group-males', 'ratio-males', 'age-group-females', 'ratio-females'],
)

In [75]:
# Write the result to the output CSV, with a header row and without the index.
final_df.to_csv(r'age-gender-c.csv', index=False)