In [33]:
import numpy as np
import pandas as pd
import scipy.stats
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr

In [34]:
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 2000)

table = pd.read_csv("2016-FCC-New-Coders-Survey-Data.csv", sep=",", 
                    parse_dates = ['Part1EndTime','Part1StartTime','Part2EndTime','Part2StartTime'],
                    dtype = {'CodeEventOther': str,
                             'JobRoleInterestOther': str})
pd.to_numeric(table['Age'], errors = 'coerce')
pd.to_numeric(table['Income'], errors = 'coerce')

0        32000.0
1        15000.0
2        48000.0
3        43000.0
4         6000.0
          ...   
15615    10000.0
15616    10000.0
15617    48000.0
15618    10000.0
15619    18000.0
Name: Income, Length: 15620, dtype: float64

In [35]:
table = table.loc[:, ['EmploymentField', 'EmploymentStatus', 'Gender', 'JobPref', 'JobWherePref', 'MaritalStatus', 'Income']]
table

Unnamed: 0,EmploymentField,EmploymentStatus,Gender,JobPref,JobWherePref,MaritalStatus,Income
0,office and administrative support,Employed for wages,male,freelance,,married or domestic partnership,32000.0
1,food and beverage,Employed for wages,male,work for a startup,in an office with other developers,,15000.0
2,finance,Employed for wages,male,start your own business,,,48000.0
3,"arts, entertainment, sports, or media",Employed for wages,female,work for a startup,from home,,43000.0
4,education,Employed for wages,female,work for a medium-sized company,in an office with other developers,,6000.0
...,...,...,...,...,...,...,...
15615,software development and IT,Self-employed freelancer,male,start your own business,,"single, never married",10000.0
15616,software development and IT,Employed for wages,male,work for a startup,in an office with other developers,"single, never married",10000.0
15617,,Self-employed freelancer,male,freelance,,,48000.0
15618,education,Self-employed freelancer,male,,,,10000.0


In [7]:
table2 = table.dropna()
table2 = table2[(table2['Gender'] == 'male') | (table2['Gender'] == 'female')]
table2

Unnamed: 0,EmploymentField,EmploymentStatus,Gender,JobPref,JobWherePref,MaritalStatus,Income
59,software development,Employed for wages,male,work for a medium-sized company,in an office with other developers,married or domestic partnership,35000.0
71,education,Employed for wages,male,work for a multinational corporation,from home,married or domestic partnership,56000.0
72,transportation,Employed for wages,male,work for a medium-sized company,from home,married or domestic partnership,35000.0
77,"arts, entertainment, sports, or media",Employed for wages,male,work for a medium-sized company,from home,married or domestic partnership,65000.0
90,sales,Employed for wages,male,work for a startup,in an office with other developers,"single, never married",30000.0
...,...,...,...,...,...,...,...
15564,finance,Employed for wages,male,work for a medium-sized company,in an office with other developers,married or domestic partnership,200000.0
15566,finance,Employed for wages,male,work for a startup,no preference,married or domestic partnership,200000.0
15584,software development and IT,Employed for wages,male,work for a multinational corporation,in an office with other developers,married or domestic partnership,200000.0
15598,food and beverage,Employed for wages,male,work for a medium-sized company,from home,married or domestic partnership,200000.0


# A

In [8]:
result = pd.crosstab(table2.Gender, table2.JobPref, margins=True)
result

JobPref,work for a medium-sized company,work for a multinational corporation,work for a startup,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,82,19,26,127
male,362,103,172,637
All,444,122,198,764


In [9]:
result = pd.crosstab(table2.Gender, table2.JobPref, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[ 73.80628272,  20.28010471,  32.91361257],
       [370.19371728, 101.71989529, 165.08638743]])

In [10]:
print(scipy.stats.chi2_contingency(result, correction=False))

(2.9296663743177596, 0.2311165414688363, 2, array([[ 73.80628272,  20.28010471,  32.91361257],
       [370.19371728, 101.71989529, 165.08638743]]))


# B

In [11]:
result = pd.crosstab(table2.Gender, table2.JobWherePref, margins=True)
result

JobWherePref,from home,in an office with other developers,no preference,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,38,57,32,127
male,149,317,171,637
All,187,374,203,764


In [12]:
result = pd.crosstab(table2.Gender, table2.JobWherePref, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[ 31.08507853,  62.17015707,  33.7447644 ],
       [155.91492147, 311.82984293, 169.2552356 ]])

In [13]:
print(scipy.stats.chi2_contingency(result, correction=False))

(2.468792878230615, 0.29101035183846335, 2, array([[ 31.08507853,  62.17015707,  33.7447644 ],
       [155.91492147, 311.82984293, 169.2552356 ]]))


# C

In [14]:
result = pd.crosstab(table2.JobWherePref, table2.MaritalStatus, margins=True)
result

MaritalStatus,divorced,married or domestic partnership,separated,"single, never married",All
JobWherePref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
from home,12,153,2,20,187
in an office with other developers,14,291,2,67,374
no preference,11,149,4,39,203
All,37,593,8,126,764


In [15]:
result = pd.crosstab(table2.JobWherePref, table2.MaritalStatus, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[  9.05628272, 145.14528796,   1.95811518,  30.84031414],
       [ 18.11256545, 290.29057592,   3.91623037,  61.68062827],
       [  9.83115183, 157.56413613,   2.12565445,  33.47905759]])

In [16]:
rpy2.robjects.numpy2ri.activate()
stats= importr('stats')
m = np.array(result.values)
res = stats.fisher_test(m)
print(res)



	Fisher's Exact Test for Count Data



data:  

p-value = 0.06912

alternative hypothesis: two.sided





# D

In [17]:
result = pd.crosstab(table2.EmploymentField, table2.JobWherePref, margins=True)
result

JobWherePref,from home,in an office with other developers,no preference,All
EmploymentField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
architecture or physical engineering,8,15,4,27
"arts, entertainment, sports, or media",17,25,7,49
construction and extraction,1,11,4,16
education,29,46,26,101
"farming, fishing, and forestry",1,1,1,3
finance,8,27,8,43
food and beverage,9,13,10,32
health care,10,18,14,42
law enforcement and fire and rescue,1,4,2,7
legal,2,3,2,7


In [18]:
result = pd.crosstab(table2.EmploymentField, table2.JobWherePref, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[  6.60863874,  13.21727749,   7.17408377],
       [ 11.9934555 ,  23.98691099,  13.01963351],
       [  3.91623037,   7.83246073,   4.2513089 ],
       [ 24.72120419,  49.44240838,  26.83638743],
       [  0.73429319,   1.46858639,   0.79712042],
       [ 10.52486911,  21.04973822,  11.42539267],
       [  7.83246073,  15.66492147,   8.5026178 ],
       [ 10.28010471,  20.56020942,  11.15968586],
       [  1.71335079,   3.42670157,   1.85994764],
       [  1.71335079,   3.42670157,   1.85994764],
       [ 17.13350785,  34.26701571,  18.59947644],
       [ 11.9934555 ,  23.98691099,  13.01963351],
       [  0.97905759,   1.95811518,   1.06282723],
       [ 71.96073298, 143.92146597,  78.11780105],
       [  4.89528796,   9.79057592,   5.31413613]])

In [19]:
rpy2.robjects.numpy2ri.activate()
stats= importr('stats')
m = np.array(result.values)
res = stats.fisher_test(m, simulate_p_value=True)
print(res)



	Fisher's Exact Test for Count Data with simulated p-value (based on

	2000 replicates)



data:  

p-value = 0.5897

alternative hypothesis: two.sided





# E

In [20]:
result = pd.crosstab(table2.EmploymentStatus, table2.JobWherePref, margins=True)
result

JobWherePref,from home,in an office with other developers,no preference,All
EmploymentStatus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Employed for wages,164,330,187,681
Self-employed business owner,5,10,4,19
Self-employed freelancer,18,34,12,64
All,187,374,203,764


In [21]:
result = pd.crosstab(table2.EmploymentStatus, table2.JobWherePref, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[166.68455497, 333.36910995, 180.94633508],
       [  4.65052356,   9.30104712,   5.04842932],
       [ 15.66492147,  31.32984293,  17.0052356 ]])

In [22]:
rpy2.robjects.numpy2ri.activate()
stats= importr('stats')
m = np.array(result.values)
res = stats.fisher_test(m)
print(res)



	Fisher's Exact Test for Count Data



data:  

p-value = 0.6163

alternative hypothesis: two.sided





# Работа с INCOME

In [23]:
income_array = table2.Income.to_numpy(dtype=np.float64)
nomin_income = []
for x in income_array:
    if x<40000.0:
        nomin_income.append('низкий')
    elif x>40000.0 and x<100000.0:
        nomin_income.append('средний')
    else:
        nomin_income.append('высокий')
# income_array = np.array(nomin_income)
table3 = table2.loc[:, ['EmploymentField', 'EmploymentStatus', 'Gender', 'JobPref', 'JobWherePref', 'MaritalStatus']]
table3['Income']=nomin_income

In [24]:
result = pd.crosstab(table3.Gender, table3.Income, margins=True)
result

Income,высокий,низкий,средний,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,15,50,62,127
male,78,284,275,637
All,93,334,337,764


In [25]:
result = pd.crosstab(table3.Gender, table3.Income, margins=False)
expected = scipy.stats.contingency.expected_freq(result)
expected

array([[ 15.45942408,  55.52094241,  56.01963351],
       [ 77.54057592, 278.47905759, 280.98036649]])

In [26]:
print(scipy.stats.chi2_contingency(result, correction=True))

(1.4405450772185862, 0.48661961525274, 2, array([[ 15.45942408,  55.52094241,  56.01963351],
       [ 77.54057592, 278.47905759, 280.98036649]]))
