## Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/survey_results_public.csv", index_col="Respondent")
schema_df = pd.read_csv("data/survey_results_schema.csv", index_col="Column")

In [3]:
df.rename(columns={"ConvertedComp":"Salary"}, inplace=True)

## Missing Value

In [18]:
# "Missing", "NA" is a custom missing values

people = {
    'first': ['Adam', 'Jack', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Walker', 'Doe', 'Doe', 'Walker', np.nan, np.nan, 'Missing'], 
    'email': ['AdamWalker@gmail.com', 'JackDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [19]:
df = pd.DataFrame(people)

In [20]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [6]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


### Dropping Missing Values

In [10]:
df.dropna(how="all")

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [11]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [15]:
df.dropna() # df.dropna(axis="index", how="any")

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [17]:
df.dropna(axis="columns")

0
1
2
3
4
5
6


In [18]:
type(df.dropna(axis="columns"))

pandas.core.frame.DataFrame

 ### Specifically Checking Missing Value of Columns

In [19]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [20]:
df.dropna(axis="index", how="any", subset=["age"])

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
6,,Missing,,Missing


In [21]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [22]:
df.dropna(axis="index", how="any", subset=["age", "email"])

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [24]:
df.dropna(axis="index", how="all", subset=["age", "email"])

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


### Handling Custom Missing Values

#### First Way

In [25]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [25]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [26]:
df.replace("NA", np.nan)

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [27]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [28]:
df.replace("NA", np.nan, inplace=True)
df.replace("Missing", np.nan, inplace=True)

In [29]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33.0
1,Jack,Doe,JackDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Walker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [30]:
df.dropna() # default olarak indexlerde 1 tane bile boşsa atar

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


#### Getting Boolean Mask

In [31]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [32]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33.0
1,Jack,Doe,JackDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Walker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [33]:
df["first"]

0     Adam
1     Jack
2     John
3    Chris
4      NaN
5     None
6      NaN
Name: first, dtype: object

In [34]:
df["first"].isna()

0    False
1    False
2    False
3    False
4     True
5     True
6     True
Name: first, dtype: bool

In [35]:
type(df.isna())

pandas.core.frame.DataFrame

#### Second Way

In [None]:
# When loading csv

In [36]:
na_values = ["NA", "Missing"]
df_survey = pd.read_csv("data/survey_results_public.csv", index_col="Respondent", na_values=na_values)

In [37]:
df_survey.head()

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


### fillna()

In [38]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33.0
1,Jack,Doe,JackDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Walker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [39]:
df.fillna("Missing")

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33
1,Jack,Doe,JackDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Walker,Missing,36
4,Missing,Missing,Missing,Missing
5,Missing,Missing,Anonymous@email.com,Missing
6,Missing,Missing,Missing,Missing


In [40]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33.0
1,Jack,Doe,JackDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Walker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [41]:
# it does not treat Missing as NA anymore
(df.fillna("Missing")).isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False


## 11) Casting Datatypes

In [None]:
people = {
    'first': ['Adam', 'Jack', 'John', 'Chris', np.nan, None, 'NA'], 
    'last': ['Walker', 'Doe', 'Doe', 'Walker', np.nan, np.nan, 'Missing'], 
    'email': ['AdamWalker@gmail.com', 'JackDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [42]:
df

Unnamed: 0,first,last,email,age
0,Adam,Walker,AdamWalker@gmail.com,33.0
1,Jack,Doe,JackDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Walker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [43]:
df.replace("NA", np.nan, inplace=True)
df.replace("Missing", np.nan, inplace=True)

In [44]:
df["age"].mean()

TypeError: can only concatenate str (not "int") to str

In [46]:
type(df["age"].loc[0])

str

In [49]:
# age sütunu içerisinde string değerleri var, şu anda ortalamasını alamam

**nan değerleri float olarak algılanıyor, o yüzden hepsini interger a dönüştürmek istesek uyarı verecek**

In [51]:
type(np.nan)

float

In [52]:
df["age"] = df["age"].astype(int)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [53]:
df["age"] = df["age"].astype(float)

In [54]:
df["age"]

0    33.0
1    55.0
2    63.0
3    36.0
4     NaN
5     NaN
6     NaN
Name: age, dtype: float64

In [55]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [59]:
df["age"].mean()

46.75

In [60]:
(33 + 55 + 63 + 36) / 4

46.75

### Özgün Değerlere Bakmak

In [66]:
df = pd.read_csv("data/survey_results_public.csv", index_col="Respondent", na_values=na_values)

In [62]:
df

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88377,,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,"Not employed, and not looking for work",Canada,No,Primary/elementary school,,"Taught yourself a new language, framework, or ...",...,,Tech articles written by other developers;Tech...,,Man,No,,,No,Appropriate in length,Easy
88601,,No,Never,The quality of OSS and closed source software ...,,,,,,,...,,,,,,,,,,
88802,,No,Never,,Employed full-time,,,,,,...,,,,,,,,,,
88816,,No,Never,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",,,,,,...,,,,,,,,,,


In [67]:
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [68]:
df["YearsCode"].replace("Less than 1 year", 0, inplace=True)

In [69]:
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37',
       'More than 50 years', '29', '44', '45', '48', '46', '43', '47',
       '49'], dtype=object)

In [70]:
df["YearsCode"].replace("More than 50 years", 51, inplace=True)

In [71]:
df["YearsCode"].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 0, '30', '9', '26', '40', '19', '15', '20', '28',
       '25', '1', '22', '11', '33', '50', '41', '18', '34', '24', '23',
       '42', '27', '21', '36', '32', '39', '38', '31', '37', 51, '29',
       '44', '45', '48', '46', '43', '47', '49'], dtype=object)

In [72]:
df["YearsCode"] = df["YearsCode"].astype(float)

In [73]:
df["YearsCode"]

Respondent
1         4.0
2         NaN
3         3.0
4         3.0
5        16.0
         ... 
88377     NaN
88601     NaN
88802     NaN
88816     NaN
88863     8.0
Name: YearsCode, Length: 88883, dtype: float64

In [74]:
df["YearsCode"].unique()

array([ 4., nan,  3., 16., 13.,  6.,  8., 12.,  2.,  5., 17., 10., 14.,
       35.,  7.,  0., 30.,  9., 26., 40., 19., 15., 20., 28., 25.,  1.,
       22., 11., 33., 50., 41., 18., 34., 24., 23., 42., 27., 21., 36.,
       32., 39., 38., 31., 37., 51., 29., 44., 45., 48., 46., 43., 47.,
       49.])

In [75]:
df["YearsCode"].mean()

11.662114216834588

In [76]:
df["YearsCode"].median()

9.0