In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
df = pd.read_csv(r'DDW-C18-0000.csv')  # READING C-18 POPULATION BY BILINGUALISM, TRILINGUALISM, AGE AND SEX
df = df[5:]  # STRIPPING UNWANTED ROWS
df.drop(df.columns[[1, 6, 7, 9, 10]], axis = 1, inplace = True)  # DROPPING UNWANTED COLUMNS
# RENAMING COLUMNS WITH SUITABLE NAMES
df.columns = ['state/ut','AreaName','Total/Rural/Urban','Age-group','Persons-Numberspeakingsecondlanguage','Persons-Numberspeakingthirdlanguage']

In [3]:
# EXTRACTING ONLY REQUIRED ROWS AND THEN DROPPING UNWANTED COLUMNS
df = df[df['Total/Rural/Urban']!="Total"]
df = df[df['Age-group']=='Total']

In [4]:
df = df.drop(['Age-group'], axis = 1)

In [5]:
df

Unnamed: 0,state/ut,AreaName,Total/Rural/Urban,Persons-Numberspeakingsecondlanguage,Persons-Numberspeakingthirdlanguage
15,00,INDIA,Rural,162641485,35383989
25,00,INDIA,Urban,152347285,50625591
45,01,JAMMU & KASHMIR,Rural,4167238,1258724
55,01,JAMMU & KASHMIR,Urban,2015952,837496
75,02,HIMACHAL PRADESH,Rural,981518,280817
...,...,...,...,...,...
1015,33,TAMIL NADU,Urban,12325853,1878494
1035,34,PUDUCHERRY,Rural,80981,5029
1045,34,PUDUCHERRY,Urban,311311,66636
1065,35,ANDAMAN & NICOBAR ISLANDS,Rural,152600,38136


In [6]:
dfp = pd.read_csv(r'DDW_PCA0000_2011_Indiastatedist.csv') # READING INDIA CENSUS DETAILS
dfp = dfp.iloc[:,6:13] # Extracting only required information from dataframe

In [7]:
dfp = dfp[dfp['TRU'] != 'Total']
dfp = dfp.drop(['No_HH','TOT_M','TOT_F'], axis = 1)
dfp = dfp[dfp['Level']!="DISTRICT"]

In [8]:
# CONVERTING LOWER CASE TO UPPER CASE
dfp['Level'] = dfp['Level'].str.upper()
dfp['Name'] = dfp['Name'].str.upper()

In [9]:
dfp = dfp.drop(['Level'], axis = 1)

In [10]:
dfp.columns = ['AreaName','Total/Rural/Urban','TOT_P']

In [11]:
dfp

Unnamed: 0,AreaName,Total/Rural/Urban,TOT_P
1,INDIA,Rural,833748852
2,INDIA,Urban,377106125
4,JAMMU & KASHMIR,Rural,9108060
5,JAMMU & KASHMIR,Urban,3433242
73,HIMACHAL PRADESH,Rural,6176050
...,...,...,...
1904,TAMIL NADU,Urban,34917440
2002,PUDUCHERRY,Rural,395200
2003,PUDUCHERRY,Urban,852753
2017,ANDAMAN & NICOBAR ISLANDS,Rural,237093


In [12]:
# MERGING BOTH DATAFRAMES
df_overall = df.merge(dfp, on=['AreaName', 'Total/Rural/Urban'])

In [13]:
# CONVERTING REQUIRED COLUMNS INTO SPECIFIC DATA TYPE
df_overall = df_overall.astype({'Persons-Numberspeakingsecondlanguage':float,'Persons-Numberspeakingthirdlanguage':float,'TOT_P':float})
# COMPUTING PEOPLE SPEAKING ONE, EXACTLY TWO 2 LANGUAGES 
df_overall['Persons-Numberspeakingfirstlanguage'] = df_overall['TOT_P'] - df_overall['Persons-Numberspeakingsecondlanguage']
df_overall['Persons-Numberspeakingsecondlanguage'] = df_overall['TOT_P'] - df_overall['Persons-Numberspeakingthirdlanguage']
# CONVERTING REQUIRED COLUMNS INTO SPECIFIC DATA TYPE
df_overall = df_overall.astype({'Persons-Numberspeakingfirstlanguage':float})
# TAKING PERCENTAGES BY DIVIDING WITH TOTAL POPULATION OF RESPECTIVE URBAN OR RURAL AREA
df_overall['percentage-3-lang'] = df_overall['Persons-Numberspeakingthirdlanguage']*100/df_overall['TOT_P']
df_overall['percentage-2-lang'] = df_overall['Persons-Numberspeakingsecondlanguage']*100/df_overall['TOT_P']
df_overall['percentage-1-lang'] = df_overall['Persons-Numberspeakingfirstlanguage']*100/df_overall['TOT_P']

In [14]:
df_overall

Unnamed: 0,state/ut,AreaName,Total/Rural/Urban,Persons-Numberspeakingsecondlanguage,Persons-Numberspeakingthirdlanguage,TOT_P,Persons-Numberspeakingfirstlanguage,percentage-3-lang,percentage-2-lang,percentage-1-lang
0,00,INDIA,Rural,798364863.0,35383989.0,833748852.0,671107367.0,4.243963,95.756037,80.492749
1,00,INDIA,Urban,326480534.0,50625591.0,377106125.0,224758840.0,13.424760,86.575240,59.600952
2,01,JAMMU & KASHMIR,Rural,7849336.0,1258724.0,9108060.0,4940822.0,13.819891,86.180109,54.246700
3,01,JAMMU & KASHMIR,Urban,2595746.0,837496.0,3433242.0,1417290.0,24.393736,75.606264,41.281389
4,02,HIMACHAL PRADESH,Rural,5895233.0,280817.0,6176050.0,5194532.0,4.546871,95.453129,84.107674
...,...,...,...,...,...,...,...,...,...,...
67,33,TAMIL NADU,Urban,33038946.0,1878494.0,34917440.0,22591587.0,5.379816,94.620184,64.700010
68,34,PUDUCHERRY,Rural,390171.0,5029.0,395200.0,314219.0,1.272520,98.727480,79.508856
69,34,PUDUCHERRY,Urban,786117.0,66636.0,852753.0,541442.0,7.814221,92.185779,63.493415
70,35,ANDAMAN & NICOBAR ISLANDS,Rural,198957.0,38136.0,237093.0,84493.0,16.084827,83.915173,35.637071


In [15]:
# COMPUTING REQUIRED INFORMATION FOR RURAL AND URBAN AREAS SEPARATELY
df_rural = df_overall[df_overall['Total/Rural/Urban']=="Rural"]
df_urban = df_overall[df_overall['Total/Rural/Urban']=="Urban"]
df_rural = df_rural.drop(['Total/Rural/Urban'], axis = 1)
df_urban = df_urban.drop(['Total/Rural/Urban'], axis = 1)
# MERGING URBAN AND RURAL AREAS INTO ONE DATAFRAME
df_overall = df_urban.merge(df_rural, on=['AreaName','state/ut'])
# AFTER MERGING SOME COLUMNS GOT APPENDED WITH x-> URBAN, y-> RURAL
df_overall = df_overall.astype({'TOT_P_x':float,'TOT_P_y':float,'percentage-3-lang_x':float,'percentage-3-lang_y':float,'percentage-2-lang_x':float,'percentage-2-lang_y':float,'percentage-1-lang_x':float,'percentage-1-lang_y':float})
# URBAN TO RURAL POPULATION RATIO
df_overall['utor-populationratio'] = df_overall['TOT_P_x']/df_overall['TOT_P_y']
# URBAN TO RURAL RATIO OF LANGUAGES 1,2,3+
df_overall['utor-3-lang'] = df_overall['percentage-3-lang_x'] / df_overall['percentage-3-lang_y']
df_overall['utor-2-lang'] = df_overall['percentage-2-lang_x'] / df_overall['percentage-2-lang_y']
df_overall['utor-1-lang'] = df_overall['percentage-1-lang_x'] / df_overall['percentage-1-lang_y']

In [16]:
df_overall

Unnamed: 0,state/ut,AreaName,Persons-Numberspeakingsecondlanguage_x,Persons-Numberspeakingthirdlanguage_x,TOT_P_x,Persons-Numberspeakingfirstlanguage_x,percentage-3-lang_x,percentage-2-lang_x,percentage-1-lang_x,Persons-Numberspeakingsecondlanguage_y,Persons-Numberspeakingthirdlanguage_y,TOT_P_y,Persons-Numberspeakingfirstlanguage_y,percentage-3-lang_y,percentage-2-lang_y,percentage-1-lang_y,utor-populationratio,utor-3-lang,utor-2-lang,utor-1-lang
0,0,INDIA,326480534.0,50625591.0,377106125.0,224758840.0,13.42476,86.57524,59.600952,798364863.0,35383989.0,833748852.0,671107367.0,4.243963,95.756037,80.492749,0.452302,3.163261,0.904123,0.740451
1,1,JAMMU & KASHMIR,2595746.0,837496.0,3433242.0,1417290.0,24.393736,75.606264,41.281389,7849336.0,1258724.0,9108060.0,4940822.0,13.819891,86.180109,54.2467,0.376945,1.765118,0.877305,0.760994
2,2,HIMACHAL PRADESH,622083.0,66469.0,688552.0,427365.0,9.653447,90.346553,62.067208,5895233.0,280817.0,6176050.0,5194532.0,4.546871,95.453129,84.107674,0.111487,2.123097,0.946502,0.737949
3,3,PUNJAB,6800103.0,3599043.0,10399146.0,4447102.0,34.609025,65.390975,42.764108,13113322.0,4230870.0,17344192.0,10261012.0,24.393584,75.606416,59.161084,0.599575,1.418776,0.864887,0.722842
4,4,CHANDIGARH,708833.0,317626.0,1026459.0,458262.0,30.943857,69.056143,44.64494,24638.0,4353.0,28991.0,17268.0,15.015005,84.984995,59.563313,35.406126,2.060862,0.812569,0.749538
5,5,UTTARAKHAND,2932127.0,117211.0,3049338.0,2126813.0,3.843818,96.156182,69.746712,6966343.0,70611.0,7036954.0,6201723.0,1.003431,98.996569,88.130788,0.433332,3.830674,0.971308,0.7914
6,6,HARYANA,8234499.0,607604.0,8842103.0,5972284.0,6.871714,93.128286,67.543705,15946392.0,562967.0,16509359.0,13751817.0,3.409987,96.590013,83.297098,0.535581,2.015173,0.964161,0.810877
7,7,NCT OF DELHI,15016140.0,1352759.0,16368899.0,9650398.0,8.264203,91.735797,58.955694,413568.0,5474.0,419042.0,320329.0,1.306313,98.693687,76.443173,39.062669,6.326357,0.9295,0.771236
8,8,RAJASTHAN,16650841.0,397244.0,17048085.0,13821925.0,2.330139,97.669861,81.076115,50890845.0,609507.0,51500352.0,47253326.0,1.183501,98.816499,91.753404,0.331029,1.968853,0.988396,0.883631
9,9,UTTAR PRADESH,43177247.0,1317816.0,44495063.0,32539351.0,2.961713,97.038287,73.13025,154049430.0,1267848.0,155317278.0,144399381.0,0.816296,99.183704,92.970584,0.286479,3.628236,0.978369,0.786596


In [17]:
# COMPUTING P VALUES
p_values = []
for index, row in df_overall.iterrows():
    a = [row['utor-3-lang'],row['utor-2-lang'],row['utor-1-lang']]
    b = [row['utor-populationratio'],row['utor-populationratio'],row['utor-populationratio']]
    ttest,p_value = ttest_ind(a,b,equal_var=False)
    p_values.append(p_value)

In [18]:
p_values

[0.2790102034939975,
 0.13948436751609206,
 0.1152293718293435,
 0.19837319735360037,
 0.00015585558893307493,
 0.28322695822161364,
 0.19439571623200386,
 0.0025085704268678886,
 0.11092821531908417,
 0.24108214560260635,
 0.13435878314237507,
 0.1526643082072778,
 0.08545585370814666,
 0.2336673968326268,
 0.08481153326761355,
 0.714558725052488,
 0.11051467674851494,
 0.3285278441502484,
 0.21168291273804674,
 0.3547485421734262,
 0.1458468701926638,
 0.25077776184686507,
 0.23968755553998164,
 0.23196734902384059,
 0.4143590924380973,
 0.0006042293020221918,
 0.1403028360322771,
 0.5060772366312816,
 0.29879951654595016,
 0.40252950364679463,
 0.013790293152851905,
 0.0004660010146191213,
 0.3575919150318061,
 0.4429096097413052,
 0.8156166045767077,
 0.18689802301117112]

In [19]:
df_overall['p-value']=p_values

In [20]:
final_df = df_overall[['state/ut','percentage-3-lang_x','percentage-2-lang_x','percentage-1-lang_x','percentage-3-lang_y','percentage-2-lang_y','percentage-1-lang_y','p-value']]
final_df.columns = [['state/ut','percentage-3-lang-urban','percentage-2-lang-urban','percentage-1-lang-urban','percentage-3-lang-rural','percentage-2-lang-rural','percentage-1-lang-rural','p-value']]
fc_df = final_df[['state/ut','percentage-3-lang-urban','percentage-3-lang-rural','p-value']]
fb_df = final_df[['state/ut','percentage-2-lang-urban','percentage-2-lang-rural','p-value']]
fa_df = final_df[['state/ut','percentage-1-lang-urban','percentage-1-lang-rural','p-value']]
fc_df.rename(columns={'percentage-3-lang-urban':'urban-percentage','percentage-3-lang-rural':'rural-percentage'},inplace=True)
fb_df.rename(columns={'percentage-2-lang-urban':'urban-percentage','percentage-2-lang-rural':'rural-percentage'},inplace=True)
fa_df.rename(columns={'percentage-1-lang-urban':'urban-percentage','percentage-1-lang-rural':'rural-percentage'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [21]:
#final_df.to_csv (r'geography-india.csv', index = None, header=True) # WRITING FINAL DATAFRAME INTO CSV FILE

In [22]:
fa_df.to_csv (r'geography-india-a.csv', index = None, header=True) # WRITING FINAL DATAFRAME INTO CSV FILE
fb_df.to_csv (r'geography-india-b.csv', index = None, header=True) # WRITING FINAL DATAFRAME INTO CSV FILE
fc_df.to_csv (r'geography-india-c.csv', index = None, header=True) # WRITING FINAL DATAFRAME INTO CSV FILE