NaNs, nulls, not a number - synonyms for the absence of a value

### Reading in Data with read_csv()

In [None]:
import pandas as pd

In [None]:
# the path argument may also be a URL; usecols limits the load to the listed columns

pd.read_csv('drinks.csv', usecols=['country', 'wine_servings']) 

In [None]:
alcohol = pd.read_csv('drinks.csv', usecols=['country', 'wine_servings'], index_col = 'country')

In [None]:
alcohol

In [None]:
type(alcohol)

In [None]:
# with 'country' as the index only one data column remains; squeeze('columns') collapses it into a Series
alcohol = pd.read_csv('drinks.csv', usecols=['country', 'wine_servings'], index_col='country').squeeze('columns')

In [None]:
alcohol

In [None]:
type(alcohol)

### Series sizing with .size, .shape and len()

In [None]:
alcohol.size

In [None]:
alcohol.values.size

In [None]:
alcohol.index.size

In [None]:
alcohol.shape

In [None]:
alcohol.size == alcohol.shape[0]

### Unique values and series monotonicity

In [None]:
alcohol.is_unique

In [None]:
alcohol.nunique()

In [None]:
alcohol.nunique(dropna=False)

In [None]:
# the monotonic checks are non-strict, so a constant Series counts as monotonically decreasing
pd.Series(reversed([1,1,1])).is_monotonic_decreasing

### The count() method

In [None]:
alcohol.count()

In [None]:
alcohol.hasnans

In [None]:
list(alcohol.isnull())

In [None]:
alcohol[alcohol.isnull()].index

In [None]:
(alcohol.loc[alcohol.isnull()].index)

In [None]:
null_count = alcohol.loc[alcohol.isnull()]

In [None]:
len(null_count)

In [None]:
alcohol.isnull().sum()

In [None]:
sum([True, False, True]) 

In [None]:
alcohol.isnull()

In [None]:
(alcohol.loc[alcohol.isnull()].index)

In [None]:
all = alcohol.size

In [None]:
nonnulls = alcohol.count()

In [None]:
nulls = alcohol.isnull().sum()

In [None]:
all == nonnulls + nulls

### Another Approach to Nulls

In [None]:
import numpy as np

In [None]:
# ufunc -> universal function

In [None]:
np.isnan

In [None]:
# dtype=float coerces the booleans to 1.0/0.0 and None to NaN
ser = pd.Series(data = [True, False, None, 2], dtype=float)

In [None]:
np.isnan(ser)

In [None]:
# a callable indexer is called with the Series itself; np.isnan returns the boolean mask that selects the NaNs
alcohol[np.isnan].size

### The Other Side: notnull() and notna()

In [None]:
alcohol.notnull().sum()

In [None]:
alcohol[alcohol.notnull()]

In [None]:
alcohol.isnull().sum()

### Skill Challenge

In [None]:
wine_servings = alcohol[alcohol.notnull()]

In [None]:
wine_servings.size

In [None]:
wine_servings.sum()

In [None]:
# Countries with fewer than 100 wine servings: show the subset, then its total.
less_than_100 = wine_servings[wine_servings < 100]
display(less_than_100)  # only a cell's last expression is echoed, so show the Series explicitly
less_than_100.sum()

### Solution to Skill Challenge

Isolate the non-null values in the alcohol Series.

In [None]:
not_nulls = alcohol.loc[alcohol.notnull()]
not_nulls.head()

Total wine consumed in not_nulls

In [None]:
not_nulls.sum()

Total wine consumed by countries that consumed fewer than 100 servings.

In [None]:
not_nulls[not_nulls < 100].sum()

### Dropping and Filling NAs

In [None]:
display(alcohol.head())
alcohol.size

In [None]:
display(alcohol.dropna().head())
alcohol.dropna().size

In [None]:
# use the inplace parameter

# alcohol.dropna(inplace=True)

In [None]:
# inplace=False (the default) returns a filled copy; the sum therefore counts each NaN as 100
print(alcohol.fillna(100, inplace=False).sum())
# alcohol itself is not modified by the line above, so its NaN count is unchanged
alcohol.isna().sum()

### Descriptive statistics

In [None]:
alcohol.count()

In [None]:
alcohol.sum()


In [None]:
alcohol.sum()/alcohol.count()

In [None]:
alcohol.mean()

In [None]:
alcohol[alcohol.notnull()].count()

In [None]:
alcohol.median()

In [None]:
alcohol.mode()

In [None]:
alcohol.quantile(.5)

In [None]:
alcohol.quantile(.25)

In [None]:
alcohol.describe()

In [None]:
import matplotlib.pyplot as plt

# Draw on an explicit Axes so the figure can be titled and labelled and stands on its own.
fig, ax = plt.subplots()
alcohol.hist(ax=ax)
# trailing semicolon suppresses the echoed return value of ax.set
ax.set(title='Distribution of wine servings', xlabel='wine_servings', ylabel='number of countries');

In [None]:
iqr = alcohol.quantile(.75) - alcohol.quantile(.25)
iqr

In [None]:
alcohol.min()

In [None]:
alcohol.max()

In [None]:
alcohol.std()

In [None]:
alcohol.var()

In [None]:
# taking a square root and squaring it again introduces rounding error,
# so compare with a tolerance instead of exact float equality
np.isclose(alcohol.std()**2, alcohol.var())

### The describe() method

In [None]:
alcohol.describe()

In [None]:
alcohol.describe(percentiles=[.79, .19])

In [None]:
# share of the non-null observations held by the most frequent value;
# count() already skips NaNs, so no explicit notnull() mask is needed
alcohol.value_counts().iloc[0] / alcohol.count()

In [None]:
# NOTE(review): notna() returns a boolean mask as long as alcohol, so .size here equals
# alcohol.size; the number of non-null values would be alcohol.notna().sum() — confirm intent
alcohol.notna().size

In [None]:
alcohol.value_counts(normalize = True)

### idxmax() and idxmin()

In [None]:
alcohol.max()

In [None]:
alcohol[alcohol == alcohol.max()].index[0]

In [None]:
# built-in equivalent of the manual lookup in the previous cell: index label of the first maximum
alcohol.idxmax()


In [None]:
alcohol.min()

In [None]:
alcohol.value_counts()

In [None]:
alcohol[alcohol == alcohol.min()].index

In [None]:
alcohol.idxmin()

In [None]:
alcohol[alcohol == alcohol.min()].value_counts()

In [None]:
alcohol.value_counts()

### Sorting with sort_values()

In [None]:
alcohol.sort_values(ascending=False, na_position = 'first').tail(28)

In [None]:
# depending on how sorted the data is originally, some sort algorithms may have better performance.

alcohol.sort_values(kind='quicksort') #mergesort, heapsort

In [None]:
alcohol.sort_values(kind='mergesort')

In [None]:
another = alcohol.copy()

In [None]:
another.sort_values(inplace = True)

In [None]:
display(alcohol.head())
another.head()

### nlargest and nsmallest

In [None]:
# ten largest values: sort descending (NaNs are placed last by default) and keep the first ten rows
alcohol.sort_values(ascending=False).head(10)

In [None]:
alcohol.nlargest(10)

In [None]:
alcohol.nsmallest(29)

### Sorting with sort_index()

In [None]:
alcohol.head()

In [None]:
# Load the full dataset with every column. The previous stray `from asyncore import close_all`
# (an accidental auto-import; asyncore no longer exists in Python 3.12+) was being passed to
# read_csv as a positional argument, which is not a valid separator and made the call fail.
drinks = pd.read_csv('drinks.csv')

In [None]:
drinks.head()

In [None]:
alcohol = pd.read_csv('drinks.csv', usecols=['country', 'wine_servings'], index_col = 'country')

In [2]:
import pandas as pd
alcohol = pd.read_csv('drinks.csv', usecols = ['country', 'wine_servings'], index_col = 'country').squeeze('columns')
# The squeezed Series can be a view of the DataFrame's data, in which case pandas refuses to
# sort it in place. Catch and show that error so the notebook still runs from top to bottom.
try:
    alcohol.sort_values(ascending=False, na_position = 'last', inplace = True)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: This Series is a view of some other array, to sort in-place you must create a copy

In [3]:
# without inplace, sort_values returns a new Series, which is rebound to `alcohol` here
# (presumably this is why the in-place sorts in the following cells no longer raise — confirm)
alcohol = alcohol.sort_values(na_position = 'last', ascending = False)

In [10]:
alcohol.sort_values(inplace = True)

In [11]:
alcohol

country
Chad          1.0
Tanzania      1.0
Ghana         1.0
Comoros       1.0
Gambia        1.0
             ... 
Sri Lanka     NaN
Sudan         NaN
Tajikistan    NaN
Uganda        NaN
Yemen         NaN
Name: wine_servings, Length: 193, dtype: float64

In [12]:
alcohol.sort_values(inplace = True, ascending = False)

In [14]:
alcohol

country
Portugal      339.0
Andorra       312.0
Denmark       278.0
Slovenia      276.0
Luxembourg    271.0
              ...  
Sri Lanka       NaN
Sudan           NaN
Tajikistan      NaN
Uganda          NaN
Yemen           NaN
Name: wine_servings, Length: 193, dtype: float64

In [23]:
fifty_plus = alcohol[alcohol > 50]

In [24]:
fifty_plus.head()

country
Portugal      339.0
Andorra       312.0
Denmark       278.0
Slovenia      276.0
Luxembourg    271.0
Name: wine_servings, dtype: float64

In [25]:
fifty_plus.size

48

In [20]:
fifty_plus.tail(20)

country
Cyprus                113.0
Spain                 112.0
Finland                97.0
Bulgaria               94.0
Macedonia              86.0
USA                    84.0
South Africa           81.0
Iceland                78.0
Paraguay               74.0
Cook Islands           74.0
Russian Federation     73.0
St. Lucia              71.0
Latvia                 62.0
Gabon                  59.0
Estonia                59.0
Lithuania              56.0
Poland                 56.0
Albania                54.0
Bahamas                51.0
Seychelles             51.0
Name: wine_servings, dtype: float64

In [26]:
# summary statistics for the last 20 rows of fifty_plus
fifty_plus.tail(20).describe()

count     20.000000
mean      74.250000
std       19.072921
min       51.000000
25%       58.250000
50%       73.500000
75%       84.500000
max      113.000000
Name: wine_servings, dtype: float64

In [31]:
fifty_plus.nsmallest(20).describe(percentiles=[.25, .6, .75])

count     20.000000
mean      74.250000
std       19.072921
min       51.000000
25%       58.250000
50%       73.500000
60%       75.600000
75%       84.500000
max      113.000000
Name: wine_servings, dtype: float64

### Series Arithmetic and the fill_value Parameter