In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Read table C-18: population by bilingualism, trilingualism, age and sex.
language_raw = pd.read_csv(r'DDW-C18-0000.csv')

# Skip the first five rows, then discard the columns at positions 1, 5 and 8,
# which this notebook does not use.
language_rows = language_raw.iloc[5:]
df = language_rows.drop(columns=language_rows.columns[[1, 5, 8]])

# Give the eight remaining columns descriptive names.
df.columns = [
    'state/ut',
    'Name',
    'Total/Rural/Urban',
    'Age-group',
    'Males-Numberspeakingsecondlanguage',
    'Females-Numberspeakingsecondlanguage',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
]

In [3]:
# Keep only the rows where both the residence category and the age group
# are the aggregate 'Total' entry.
is_total_residence = df['Total/Rural/Urban'] == 'Total'
is_total_age_group = df['Age-group'] == 'Total'
df = df[is_total_residence & is_total_age_group]

In [4]:
# Both columns hold the single value 'Total' after the filter in the previous
# cell, so they carry no further information.
df = df.drop(columns=['Total/Rural/Urban', 'Age-group'])

In [5]:
# Cast the four speaker-count columns to float so they can be used in arithmetic.
count_columns = [
    'Males-Numberspeakingsecondlanguage',
    'Females-Numberspeakingsecondlanguage',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
]
df = df.astype(dict.fromkeys(count_columns, float))

# Subtract the third-language counts from the second-language counts, per sex.
# Presumably the raw second-language figures include third-language speakers,
# so the result is "exactly two languages" -- TODO confirm against the table notes.
# NOTE(review): the second-language columns are overwritten, so running this
# cell twice subtracts twice; re-run from the loading cell instead.
for sex in ('Males', 'Females'):
    second_language = f'{sex}-Numberspeakingsecondlanguage'
    third_language = f'{sex}-Numberspeakingthirdlanguage'
    df[second_language] = df[second_language] - df[third_language]

In [6]:
# Preview the first rows of the cleaned language table.
df.head()

Unnamed: 0,state/ut,Name,Males-Numberspeakingsecondlanguage,Females-Numberspeakingsecondlanguage,Males-Numberspeakingthirdlanguage,Females-Numberspeakingthirdlanguage
5,0,INDIA,126159551.0,102819639.0,50536832.0,35472748.0
35,1,JAMMU & KASHMIR,2376063.0,1710907.0,1258756.0,837464.0
65,2,HIMACHAL PRADESH,483488.0,411931.0,200478.0,146808.0
95,3,PUNJAB,2900342.0,2304969.0,4383841.0,3446072.0
125,4,CHANDIGARH,144341.0,113600.0,178557.0,143422.0


In [7]:
# Read the India census details; the population totals per sex (TOT_M, TOT_F)
# used later in the notebook come from this file.
dfp = pd.read_csv(r'DDW_PCA0000_2011_Indiastatedist.csv')

In [8]:
# Columns that are only needed for filtering and are removed afterwards.
helper_columns = ['Level', 'TRU', 'No_HH', 'TOT_P']

# Keep the columns at positions 6..12 and only the rows whose TRU value is 'Total'.
census_totals = dfp.iloc[:, 6:13]
census_totals = census_totals[census_totals['TRU'] == 'Total']

# The country-level row, with its name forced to "INDIA" (the label the language
# table uses for that row, which matters for the later merge on 'Name').
dfp_india = (
    census_totals[census_totals['Level'] == 'India']
    .drop(columns=helper_columns)
    .assign(Name="INDIA")
)

# The state-level rows.
state_totals = census_totals[census_totals['Level'] == 'STATE'].drop(columns=helper_columns)

# India first, then the states, with a fresh 0..n-1 index.
dfp = pd.concat([dfp_india, state_totals], ignore_index=True)

In [9]:
# Preview the first 20 rows of the population totals (India row first, then states).
dfp.iloc[:20]

Unnamed: 0,Name,TOT_M,TOT_F
0,INDIA,623270258,587584719
1,JAMMU & KASHMIR,6640662,5900640
2,HIMACHAL PRADESH,3481873,3382729
3,PUNJAB,14639465,13103873
4,CHANDIGARH,580663,474787
5,UTTARAKHAND,5137773,4948519
6,HARYANA,13494734,11856728
7,NCT OF DELHI,8987326,7800615
8,RAJASTHAN,35550997,32997440
9,UTTAR PRADESH,104480510,95331831


In [10]:
# Combine the language counts with the population totals. pd.merge defaults to
# an inner join, so only names present in both frames are kept.
final_df = pd.merge(df, dfp, on='Name')

# Make sure every column used in the arithmetic below is a float.
float_columns = [
    'Males-Numberspeakingsecondlanguage',
    'Females-Numberspeakingsecondlanguage',
    'Males-Numberspeakingthirdlanguage',
    'Females-Numberspeakingthirdlanguage',
    'TOT_M',
    'TOT_F',
]
final_df = final_df.astype({column: float for column in float_columns})

# (percentage-column prefix, count-column prefix, population-total column) per sex.
sexes = (('male', 'Males', 'TOT_M'), ('female', 'Females', 'TOT_F'))

# Percentage of each sex's population in the third- and second-language counts:
# creates '<sex>-percentage-3-lang' and '<sex>-percentage-2-lang'.
for digit, ordinal in (('3', 'third'), ('2', 'second')):
    for pct_prefix, count_prefix, total in sexes:
        speakers = final_df[f'{count_prefix}-Numberspeaking{ordinal}language']
        final_df[f'{pct_prefix}-percentage-{digit}-lang'] = speakers*100/final_df[total]

# Everyone not covered by the second- or third-language counts is attributed to
# the first-language (single-language) count.
for pct_prefix, count_prefix, total in sexes:
    final_df[f'{count_prefix}-Numberspeakingfirstlanguage'] = (
        final_df[total]
        - final_df[f'{count_prefix}-Numberspeakingsecondlanguage']
        - final_df[f'{count_prefix}-Numberspeakingthirdlanguage']
    )
for pct_prefix, count_prefix, total in sexes:
    first_language = final_df[f'{count_prefix}-Numberspeakingfirstlanguage']
    final_df[f'{pct_prefix}-percentage-1-lang'] = first_language*100/final_df[total]

# Male-to-female ratio of those percentages, one column per language level.
for digit in ('3', '2', '1'):
    male_pct = final_df[f'male-percentage-{digit}-lang']
    female_pct = final_df[f'female-percentage-{digit}-lang']
    final_df[f'mtof-{digit}-lang'] = male_pct / female_pct

# Male-to-female ratio of the total population.
final_df['mtof-populationratio'] = final_df['TOT_M']/final_df['TOT_F']

In [11]:
# Display the merged table with all derived percentage and ratio columns.
final_df

Unnamed: 0,state/ut,Name,Males-Numberspeakingsecondlanguage,Females-Numberspeakingsecondlanguage,Males-Numberspeakingthirdlanguage,Females-Numberspeakingthirdlanguage,TOT_M,TOT_F,male-percentage-3-lang,female-percentage-3-lang,male-percentage-2-lang,female-percentage-2-lang,Males-Numberspeakingfirstlanguage,Females-Numberspeakingfirstlanguage,male-percentage-1-lang,female-percentage-1-lang,mtof-3-lang,mtof-2-lang,mtof-1-lang,mtof-populationratio
0,0,INDIA,126159551.0,102819639.0,50536832.0,35472748.0,623270258.0,587584719.0,8.108334,6.037044,20.241548,17.498692,446573875.0,449292332.0,71.650118,76.464264,1.343097,1.156746,0.937041,1.060733
1,1,JAMMU & KASHMIR,2376063.0,1710907.0,1258756.0,837464.0,6640662.0,5900640.0,18.955279,14.192766,35.780514,28.995278,3005843.0,3352269.0,45.264207,56.811956,1.335559,1.234012,0.796737,1.125414
2,2,HIMACHAL PRADESH,483488.0,411931.0,200478.0,146808.0,3481873.0,3382729.0,5.757763,4.339928,13.88586,12.177476,2797907.0,2823990.0,80.356377,83.482596,1.326696,1.14029,0.962552,1.029309
3,3,PUNJAB,2900342.0,2304969.0,4383841.0,3446072.0,14639465.0,13103873.0,29.945363,26.298118,19.811803,17.589983,7355282.0,7352832.0,50.242833,56.111899,1.138688,1.126312,0.895404,1.117186
4,4,CHANDIGARH,144341.0,113600.0,178557.0,143422.0,580663.0,474787.0,30.750539,30.207651,24.857964,23.926519,257765.0,217765.0,44.391497,45.86583,1.017972,1.038929,0.967856,1.222997
5,5,UTTARAKHAND,873233.0,696701.0,109529.0,78293.0,5137773.0,4948519.0,2.131838,1.58215,16.996333,14.07898,4155011.0,4173525.0,80.871829,84.33887,1.347431,1.207213,0.958892,1.038245
6,6,HARYANA,2576417.0,1880373.0,681228.0,489343.0,13494734.0,11856728.0,5.048102,4.127134,19.092018,15.859122,10237089.0,9487012.0,75.85988,80.013744,1.22315,1.203851,0.948086,1.13815
7,7,NCT OF DELHI,3050945.0,2408036.0,741513.0,616720.0,8987326.0,7800615.0,8.250652,7.906043,33.947194,30.869822,5194868.0,4775859.0,57.802154,61.224134,1.043588,1.099689,0.944107,1.15213
8,8,RAJASTHAN,3747078.0,2719357.0,642842.0,363909.0,35550997.0,32997440.0,1.808225,1.10284,10.540008,8.241115,31161077.0,29914174.0,87.651767,90.656045,1.639608,1.278954,0.966861,1.077387
9,9,UTTAR PRADESH,11762916.0,8525029.0,1546993.0,1038671.0,104480510.0,95331831.0,1.480652,1.089532,11.258479,8.942479,91170601.0,85768131.0,87.260869,89.967989,1.35898,1.258989,0.96991,1.095967


In [12]:
# For every row, compare the three male-to-female language ratios with the
# male-to-female population ratio using a two-sample t-test that does not
# assume equal variances (equal_var=False), and collect the p-values in row order.
ratio_columns = ['mtof-3-lang', 'mtof-2-lang', 'mtof-1-lang']
p_values = []
for index, row in final_df.iterrows():
    a = [row[column] for column in ratio_columns]
    # The second sample is the population ratio repeated once per language ratio.
    b = [row['mtof-populationratio']] * len(ratio_columns)
    ttest, p_value = ttest_ind(a, b, equal_var=False)
    p_values.append(p_value)

In [13]:
# Show the collected p-values, one per row of final_df, in row order.
p_values

[0.544581481513664,
 0.9858377677078174,
 0.3919499869099863,
 0.5051186283857573,
 0.009503334511849156,
 0.3624921883263337,
 0.8959052171329658,
 0.1138834773394853,
 0.3790585446094943,
 0.4816594643576111,
 0.5413989346446864,
 0.2465505743699638,
 0.7324624979018067,
 0.7249401648542831,
 0.6471371869216989,
 0.41740998282933517,
 0.46409781049163934,
 0.48371201414040477,
 0.6499997423670519,
 0.35541136805060103,
 0.5412649556135252,
 0.4617593114659908,
 0.3225035363516836,
 0.5020071580806412,
 0.7359347054844332,
 0.003679011843063834,
 0.14708818306138832,
 0.9550509874082701,
 0.35632267100692183,
 0.6111018679542035,
 0.5477731023537871,
 0.4665736265210172,
 0.18089982995633377,
 0.3554042208173043,
 0.28170367090822174,
 0.36586923137160576]

In [14]:
# Attach the p-values as a new column; the list was built in final_df row order,
# so the values line up positionally with the rows.
final_df['p-value']=p_values

In [15]:
# Percentage columns for every language level, plus the per-row p-value.
f_df = final_df[[
    'state/ut',
    'male-percentage-3-lang', 'female-percentage-3-lang',
    'male-percentage-2-lang', 'female-percentage-2-lang',
    'male-percentage-1-lang', 'female-percentage-1-lang',
    'p-value',
]]

# One table per language level (a = one language, b = two, c = three), each
# exposing the same generic 'male-percentage' / 'female-percentage' names.
# rename() is called WITHOUT inplace=True: it returns a new DataFrame instead of
# modifying a frame sliced out of f_df, which is what made pandas emit
# SettingWithCopyWarning when these renames were done in place.
fc_df = f_df[['state/ut', 'male-percentage-3-lang', 'female-percentage-3-lang', 'p-value']].rename(
    columns={'male-percentage-3-lang': 'male-percentage', 'female-percentage-3-lang': 'female-percentage'}
)
fb_df = f_df[['state/ut', 'male-percentage-2-lang', 'female-percentage-2-lang', 'p-value']].rename(
    columns={'male-percentage-2-lang': 'male-percentage', 'female-percentage-2-lang': 'female-percentage'}
)
fa_df = f_df[['state/ut', 'male-percentage-1-lang', 'female-percentage-1-lang', 'p-value']].rename(
    columns={'male-percentage-1-lang': 'male-percentage', 'female-percentage-1-lang': 'female-percentage'}
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [16]:
#f_df.to_csv (r'gender-india.csv', index = None, header=True) # WRITING FINAL DATAFRAME INTO CSV FILE

In [17]:
# Write each per-language-level table to its own CSV file, including a header row.
csv_targets = {
    'gender-india-a.csv': fa_df,
    'gender-india-b.csv': fb_df,
    'gender-india-c.csv': fc_df,
}
for csv_path, table in csv_targets.items():
    table.to_csv(csv_path, index=None, header=True)

In [18]:
# Stand-alone check of the same unequal-variance t-test on hand-entered numbers:
# three ratios versus one reference ratio repeated three times.
# NOTE(review): the origin of these literals is not recorded in the notebook.
sample_ratios = [1.187741, 1.046072, 0.958529]
reference_ratios = [1.141585] * 3
ttest, p_value = ttest_ind(sample_ratios, reference_ratios, equal_var=False)

In [19]:
# p-value of the stand-alone t-test computed in the previous cell.
p_value

0.3657768428321173

In [20]:
# t statistic of the same stand-alone t-test.
ttest

-1.1600943440645626

In [21]:
# NOTE(review): this raises KeyError. final_df only has the level-specific
# columns ('male-percentage-1-lang', '-2-lang', '-3-lang'); the generic
# 'male-percentage' name exists only on fa_df, fb_df and fc_df after the rename.
# Decide which language level is meant and read it from that table.
final_df['male-percentage'][0]

KeyError: 'male-percentage'

In [None]:
# NOTE(review): this cell cannot run as written.
#  * final_df has no 'male-percentage' / 'female-percentage' columns (those names
#    exist only on fa_df, fb_df and fc_df), so the lookups raise KeyError.
#  * The trailing [0] selects a single element, so ttest_rel would receive two
#    scalars; it looks like whole columns were intended -- confirm.
#  * The import belongs in the import cell at the top of the notebook.
from scipy.stats import ttest_rel
t_value,p_value=ttest_rel(final_df['male-percentage'][0],final_df['female-percentage'][0])

In [None]:
# p-value assigned by the ttest_rel call in the previous cell.
p_value

In [None]:
# NOTE(review): this cell cannot run as written, and the test is degenerate.
#  * final_df has no 'male-percentage' / 'female-percentage' columns (those names
#    exist only on fa_df, fb_df and fc_df), so building `a` raises KeyError.
#  * `a` is tested against its own mean, so the difference between the sample
#    mean and popmean is zero regardless of the data.
#  * The import belongs in the import cell at the top of the notebook.
from scipy.stats import ttest_1samp
# Two-element sample: the male and female percentage taken at index label 20.
a = [final_df['male-percentage'][20],final_df['female-percentage'][20]]
t_value,p_value=ttest_1samp(a,popmean=np.mean(a))

In [None]:
# Mean of the sample `a`, i.e. the value used as popmean in the previous cell.
np.mean(a)