In [13]:
import pandas as pd
import numpy as np

In [4]:
# Survey answers: one row per respondent, indexed by the "Respondent" column.
df = pd.read_csv(
    "survey_results_public.csv",
    index_col="Respondent",
)
# Schema file, indexed by its "Column" column.
schema_df = pd.read_csv(
    "survey_results_schema.csv",
    index_col="Column",
)

## handling missing data: dropna
* ### removes rows (or columns) that contain `np.nan` or `None`;<br>custom placeholders stored as strings, such as "NA" or "Missing", are not treated as missing and will not be removed<br><br>

* ### default: axis="index", how="any"
* ### axis: 0 / "index" / "rows" drops rows, 1 / "columns" drops columns; the values along the other axis are the ones that get checked,<br>e.g. with axis="index" each row is checked across its columns and dropped if it qualifies, with axis="columns" each column is checked across its rows
* ### how="any": drop if at least one value is missing, how="all": drop only if every value is missing<br><br>

* ### subset: labels along the other axis to check; when dropping rows (axis="index") this is a list of column names

In [12]:
# Show the frame without the rows whose ConvertedComp is missing.
# The result is only displayed; df itself is not modified.
df.dropna(subset=["ConvertedComp"], how="any", axis="index")

Unnamed: 0_level_0,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,EduOther,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
Respondent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,"Taught yourself a new language, framework, or ...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
6,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Canada,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Mathematics or statistics,Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,28.0,Man,No,Straight / Heterosexual,East Asian,No,Too long,Neither easy nor difficult
9,I am a developer by profession,Yes,Once a month or more often,The quality of OSS and closed source software ...,Employed full-time,New Zealand,No,Some college/university study without earning ...,"Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,,23.0,Man,No,Bisexual,White or of European descent,No,Appropriate in length,Neither easy nor difficult
10,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,India,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)",,,...,Somewhat less welcome now than last year,Tech articles written by other developers;Tech...,,,,,,Yes,Too long,Difficult
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88878,I am a developer by profession,Yes,Less than once per year,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,26.0,Man,No,Straight / Heterosexual,South Asian,No,Appropriate in length,Easy
88879,I am a developer by profession,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Finland,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...","Taught yourself a new language, framework, or ...",...,Not applicable - I did not use Stack Overflow ...,,34.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
88881,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Austria,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",Taken an online course in programming or softw...,...,,,37.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
88882,I am a developer by profession,Yes,Never,"OSS is, on average, of LOWER quality than prop...",Employed full-time,Netherlands,"Yes, full-time","Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",Participated in online coding competitions (e....,...,Just as welcome now as I felt last year,,,Man,No,Straight / Heterosexual,White or of European descent,Yes,Too long,Easy


* ### remove a row only if both ConvertedComp and Country are missing

In [None]:
# how="all": a row is dropped only when every column listed in subset is missing.
df.dropna(
    subset=["ConvertedComp", "Country"],
    how="all",
    axis="index",
)

## handling custom missing values

* ### replace: use this when the data is not loaded from a csv file. Once the custom placeholders are replaced with np.nan, dropna can remove them

In [14]:
# Turn the custom string placeholders into real NaN so that dropna/fillna
# recognise them. Both placeholders are handled in a single pass and the
# result is assigned back instead of mutating the frame with inplace=True.
df = df.replace(["NA", "Missing"], np.nan)

* ### fillna: replaces all NaN with the given value (here: "MISSING")
* ### this method is most useful when dealing with numerical data

In [None]:
# Display a copy in which every NaN is replaced by the string "MISSING";
# df itself is left unchanged because the result is not assigned.
df.fillna(value="MISSING")

## casting datatypes

In [15]:
# Inspect the dtype of every column before casting anything.
df.dtypes

MainBranch      object
Hobbyist        object
OpenSourcer     object
OpenSource      object
Employment      object
                 ...  
Sexuality       object
Ethnicity       object
Dependents      object
SurveyLength    object
SurveyEase      object
Length: 84, dtype: object

* ### the type of np.nan is float; this matters because a column that contains NaN cannot be converted to int

In [17]:
# np.nan is a plain Python float.
type(np.nan)

float

* ### convert dtype: astype

In [22]:
# Cast the salary column to float64 (float can hold NaN), then confirm the dtype.
df["ConvertedComp"] = df["ConvertedComp"].astype("float64")
df["ConvertedComp"].dtype

dtype('float64')

In [23]:
# Average of the ConvertedComp column now that it is numeric.
df["ConvertedComp"].mean()

127110.73842323056

* ### it is also possible to convert the dtype of the whole dataframe at once

In [None]:
# Syntax for casting every column at once. Left commented out on purpose:
# most columns of this frame hold text (object dtype), so a blanket cast to
# float cannot succeed here.
# df.astype(float)

## handling missing values in csv
* ### read_csv will treat the values "Missing" & "NA" as missing values and load them as np.nan

In [24]:
# Reload the survey, declaring the custom placeholders as missing at parse time.
df = pd.read_csv(
    "survey_results_public.csv",
    index_col="Respondent",
    na_values=["Missing", "NA"],
)

## get average experience

In [26]:
df["YearsCode"].dtype  # object dtype, so the column holds strings

dtype('O')

In [27]:
df["YearsCode"].unique()  # list the distinct answers to confirm the column holds strings

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [28]:
# replace strings
# Map the two free-text answers to numbers so the column can be cast to float.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["YearsCode"]: that is an in-place method on
# a chained selection, which recent pandas versions warn about and which,
# under Copy-on-Write, no longer updates df.
df["YearsCode"] = df["YearsCode"].replace(
    {"Less than 1 year": 0, "More than 50 years": 51}
)

In [30]:
# With only numeric values left, cast the column to float64 and take the median.
df["YearsCode"] = df["YearsCode"].astype("float64")
df["YearsCode"].median()

9.0