In [1]:
import pandas as pd
import numpy as np

# Practice

## Question 1

Create a pandas dataframe that contains the following columns:
```
name = ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas']
score = [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19]
attempts = [1, 3, 2, 3, 2, 3, 1, 1, 2, 1]
qualify =  ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']
```
and uses the characters `a`, `b`, `c`, etc. as its index.


Write a function `print_cols(df)` that prints each column, followed by a separator line, producing output like:
```
a    Anastasia
b         Dima
...
j        Jonas
Name: name, dtype: object
________________________________________________________________________________
a    12.5
b     9.0
...
j    19.0
Name: score, dtype: float64
________________________________________________________________________________
```

In [55]:
# data 

name = ['Anastasia', 'Dima', 'Katherine', 'James', 'Emily', 'Michael', 'Matthew', 'Laura', 'Kevin', 'Jonas']
score = [12.5, 9, 16.5, np.nan, 9, 20, 14.5, np.nan, 8, 19]
attempts = [1, 3, 2, 3, 2, 3, 1, 1, 2, 1]
qualify =  ['yes', 'no', 'yes', 'no', 'no', 'yes', 'yes', 'no', 'no', 'yes']

# Build the frame column-by-column from a dict so that each column keeps its
# own dtype (score -> float64, attempts -> integer).  Stacking the four lists
# as rows and transposing would leave every column with dtype=object.
df = pd.DataFrame(
    {'name': name, 'score': score, 'attempts': attempts, 'qualify': qualify},
    index=list('abcdefghij'),
)
# Keep the text columns as plain object columns so that printing them shows
# "dtype: object", as in the sample output of the question, even on pandas
# versions that infer a dedicated string dtype (no-op on older versions).
df = df.astype({'name': object, 'qualify': object})

def print_cols(df):
    """Print every column of ``df`` as a Series, each followed by a separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are printed, in column order.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # 80 underscores, matching the separator shown in the question's sample output.
    separator = 80 * "_"
    for label in df.columns:
        print(df[label])
        print(separator)

## Test question 1

Your code should not raise any error.

In [56]:
# Compare as plain lists: an element-wise comparison of a pandas Index with a
# list of a different length raises ValueError, which would hide the
# explanatory assertion messages below.
assert list(df.columns) == ['name', 'score', 'attempts', 'qualify'], 'your DataFrame does not have the right columns'
assert list(df.index) == ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 'your DataFrame does not have the right index'

from contextlib import redirect_stdout
def redirect(func):
    """Wrap ``func`` so that whatever it prints is captured and returned.

    Parameters
    ----------
    func : callable
        Function whose standard output should be captured.

    Returns
    -------
    callable
        A function accepting the same arguments as ``func``.  It calls
        ``func``, discards its return value and returns the captured output
        as a list of lines, each stripped of surrounding whitespace.
    """
    import io  # local import so the helper does not depend on another cell

    def redirected_func(*args, **kw):
        # Capture in memory rather than in a file in the working directory:
        # nothing is left behind on disk and no write permission is needed.
        # newline=None gives universal-newline handling, as a text-mode file would.
        captured = io.StringIO(newline=None)
        with redirect_stdout(captured):
            func(*args, **kw)
        captured.seek(0)
        return [line.strip() for line in captured]
    return redirected_func
r_print_cols = redirect(print_cols)
txt = r_print_cols(df)

# Check the number of captured lines first, so that a print_cols that prints
# too little fails with the message below rather than with an IndexError.
assert (
    len(txt) > 10
    and txt[0] == 'a    Anastasia'
    and txt[10] == 'Name: name, dtype: object'
    and txt[-3] == 'j    yes'
), 'the print_cols did not print what was expected'

## Question 2

Find the pandas function that tests for the presence of NaN entries in a Series or a DataFrame.

Write the function `has_nan(df, col_name)` that returns True if, in the dataframe `df`, the column with label `col_name` contains at least one entry that is a NaN.

Write the function `get_columns_with_missing_values(df)` that returns the labels of those columns that contain missing or NaN values.  

In [28]:

def has_nan(df, col_name):
    """Tell whether the column ``col_name`` of ``df`` holds at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    col_name : label
        Label of the column to test.

    Returns
    -------
    bool-like
        Truthy when at least one entry of the column is NaN/None.
    """
    column = df[col_name]
    missing_mask = column.isna()
    return missing_mask.values.any()
    
def get_columns_with_missing_values(df):
    """List the labels of the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, for which ``has_nan`` is true.
        Empty when no column has a missing value.
    """
    columns = df.columns
    return [
        columns[pos]
        for pos in range(len(columns))
        if has_nan(df, columns[pos])
    ]

## Test question 2

Your code should not raise any error.

In [52]:
# 'name' has no missing entry, 'score' has two, and no other column has any.
assert not has_nan(df, 'name'), 'when tested on this column the result should be False'
assert has_nan(df, 'score'), 'when tested on this column the result should be True'
assert get_columns_with_missing_values(df) == ['score'], 'Only "score" should be returned'

## Question 3

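Write a function `replace_missing(df)` that replaces the NaN values with the **median value** of the column. When a column has an even number of non-missing values, use the upper of the two middle values (the tests below expect 14.5 for `score`).

The function should return a copy of the original dataframe and must not alter the input dataframe.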

In [14]:
def replace_missing(df):
    """Return a copy of ``df`` with missing values replaced by the column median.

    The median used is the element at position ``n // 2`` of the sorted
    non-missing values of the column, i.e. the upper of the two middle values
    when ``n`` is even.  This is the convention the notebook's checks rely on
    (14.5 for the ``score`` column); it differs from ``Series.median()``,
    which averages the two middle values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame.  Columns without missing values, and columns that are
        entirely missing (no median exists), are returned unchanged.

    Raises
    ------
    TypeError
        If a column with missing values holds values that cannot be ordered.
    """
    filled = df.copy()
    for label in filled.columns:
        missing = filled[label].isna()
        if not missing.any():
            continue
        present = filled.loc[~missing, label].to_numpy()
        if present.size == 0:
            # Nothing to compute a median from: leave the column as it is.
            continue
        median_value = np.sort(present)[present.size // 2]
        # One vectorized assignment per column instead of a Python loop over rows.
        filled.loc[missing, label] = median_value
    return filled

## Test question 3

Your code should not raise any error.

In [22]:
df2 = replace_missing(df)
assert df2 is not None and isinstance(df2, pd.DataFrame), 'the function replace_missing should return a dataframe'
assert np.all(df2.loc[['d','h'],'score'].values == [14.5, 14.5]), "the score for James and Laura should be 14.5"
# The question also requires a copy to be returned and the input to stay
# untouched: the two missing scores must still be missing in df.
assert df2 is not df and df.loc[['d','h'],'score'].isna().all(), 'the function replace_missing should not alter the dataframe in input'

In [23]:
#efficiency test

def replace_missing_slow(df):
    """Deliberately slow baseline that fills missing values with the column median.

    It is used as the timing reference for the efficiency check, so the
    element-by-element replacement loop is intentional.

    The "median" is the element at position ``n // 2`` of the sorted
    non-missing values (the upper of the two middle values when ``n`` is
    even), which differs from ``Series.median()`` for an even count.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing values replaced.  Columns that are
        entirely missing are left unchanged, since they have no median.
    """
    df2 = df.copy()
    missing_values_cols = get_columns_with_missing_values(df)

    for col in missing_values_cols:
        s = df[col].copy()
        # extract all non null values
        vals = s[s.notna()].values

        n = vals.shape[0]
        if n == 0:
            # Entirely missing column: there is no value to take as the median.
            continue

        # middle element of the sorted non null values
        median_val = np.sort(vals)[n // 2]

        # overwrite the NaNs with the median_val
        # slow version: one label lookup and one assignment per row
        for i in s.index:
            if pd.isna(s[i]):
                s[i] = median_val

        # update the column in the dataframe
        df2[col] = s

    return df2

from time import process_time_ns
def decorate_runtime(func):
    """Wrap ``func`` so that each call also reports how much process time it used.

    Parameters
    ----------
    func : callable
        Function to time.

    Returns
    -------
    callable
        A function taking the same arguments as ``func`` and returning the
        pair ``(result, elapsed)``, where ``elapsed`` is the difference, in
        nanoseconds, between ``process_time_ns()`` after and before the call.
    """
    def timed_func(*args, **kw):
        t0 = process_time_ns()
        result = func(*args, **kw)
        t1 = process_time_ns()
        return result, t1 - t0
    return timed_func

# Benchmark input: a single column of one million floats in which every even
# value has been replaced by NaN.
d = np.arange(1e6)
d[(d % 2) == 0] = np.nan
df3 = pd.DataFrame(d)

# Timed variants of the implementation under test and of the slow baseline.
t_replace_missing = decorate_runtime(replace_missing)
t_replace_missing_slow = decorate_runtime(replace_missing_slow)

df2, t_current = t_replace_missing(df3)
df2, t_slow = t_replace_missing_slow(df3)

# The implementation under test must use less than 1% of the baseline's time.
assert t_current/t_slow < 0.01, 'Your code is not efficient, you should use vectorized operations'