In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
float_data = pd.Series([1.2, -3.5, np.nan, 0])

In [3]:
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Four rows covering the interesting cases: nothing missing, partly missing,
# entirely missing, and missing only in the first column.
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                     [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [7]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [8]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
data.dropna(how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
 df = pd.DataFrame(np.random.standard_normal((7, 3)))
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [12]:
df.dropna(thresh=2)


Unnamed: 0,0,1,2
2,-1.803836,,-1.146481
3,1.835897,,0.158864
4,-0.328528,0.105164,0.853355
5,0.400667,1.383225,-2.268463
6,1.523367,-0.97377,-0.767559


In [13]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.684228,0.0,0.0
1,-0.26151,0.0,0.0
2,-1.803836,0.0,-1.146481
3,1.835897,0.0,0.158864
4,-0.328528,0.105164,0.853355
5,0.400667,1.383225,-2.268463
6,1.523367,-0.97377,-0.767559


In [14]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.684228,0.5,0.0
1,-0.26151,0.5,0.0
2,-1.803836,0.5,-1.146481
3,1.835897,0.5,0.158864
4,-0.328528,0.105164,0.853355
5,0.400667,1.383225,-2.268463
6,1.523367,-0.97377,-0.767559


In [15]:
# Seven rows; the last one repeats the (k1, k2) pair of the row before it.
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"],
                     "k2": [1, 1, 2, 3, 3, 4, 4]})

In [16]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [17]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [18]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [19]:
# Add a column "v1" holding 0 through 6, one value per row; this modifies
# `data` itself rather than creating a new frame.
data["v1"] = range(7)

In [20]:
data.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [21]:
data.drop_duplicates(subset=["k1"], keep="last")

Unnamed: 0,k1,k2,v1
4,one,3,4
6,two,4,6


In [22]:
# Nine purchases: the kind of meat bought and its weight in ounces.
data = pd.DataFrame({"food": ["bacon", "pulled pork", "bacon",
                              "pastrami", "corned beef", "bacon",
                              "pastrami", "honey ham", "nova lox"],
                     "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

In [23]:
# Lookup table: kind of meat -> animal it comes from.
meat_to_animal = dict([
    ("bacon", "pig"),
    ("pulled pork", "pig"),
    ("pastrami", "cow"),
    ("corned beef", "cow"),
    ("honey ham", "pig"),
    ("nova lox", "salmon"),
])

In [25]:
# Translate every entry of the "food" column through the meat_to_animal
# dictionary and store the result as a new "animal" column on `data`.
data["animal"] = data["food"].map(meat_to_animal)

In [26]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])


In [27]:
 data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [28]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [30]:
# 3x4 frame holding 0..11, labelled by state (rows) and by number words (columns).
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=["Ohio", "Colorado", "New York"],
                    columns=["one", "two", "three", "four"])



In [31]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [32]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    prefix = x[:4]
    return prefix.upper()

In [33]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [34]:
# Replace the row labels with their transformed versions (first four
# characters, upper-cased). This changes `data` itself, so later cells see the
# shortened labels.
data.index = data.index.map(transform)


In [35]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [37]:
 ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

In [38]:
age_categories = pd.cut(ages, bins)

In [39]:
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [40]:
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

In [41]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [42]:
# 1000 rows by 4 columns of draws from the standard normal distribution.
# NOTE(review): no seed is set in the visible cells, so the specific outlier
# rows shown further down will presumably differ on a re-run -- confirm.
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [48]:
data[(data.abs() > 3).any(axis="columns")]

Unnamed: 0,0,1,2,3
65,-0.072295,1.483792,-3.044466,0.016094
142,0.184024,-0.634943,0.127234,-3.615433
149,3.121877,-2.207559,1.270296,0.696175
302,0.353396,3.055054,0.415338,-0.575002
360,-2.514845,-1.249517,3.672983,1.267546
361,-3.062853,-2.020215,1.271149,3.386879
455,-0.742714,-1.4285,-1.522656,3.212133
486,0.110182,0.134639,0.356618,3.029134
659,0.918136,0.268137,3.103625,0.182074
822,-0.480033,2.306522,0.892552,3.515187


In [49]:
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))

In [50]:
sampler = np.random.permutation(5)

In [51]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34


In [52]:
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34


In [53]:
# Six rows with a categorical "key" (values a/b/c) and a running counter.
df = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "data1": range(6)})

In [54]:
pd.get_dummies(df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [55]:
# pd.Int64Dtype() requests pandas' nullable integer dtype, so the None entry
# is kept as a missing value while the Series stays integer-typed.
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())

In [56]:
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [57]:
# Name -> e-mail address; the last entry has no address and is marked with NaN.
data = {"Dave": "dave@google.com", "Steve": "steve@gmail.com",
        "Rob": "rob@gmail.com", "Wes": np.nan}

In [59]:
data = pd.Series(data)


In [60]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [61]:
data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [62]:
data_as_string_ext = data.astype('string')

In [63]:
data_as_string_ext.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [64]:
# Three capture groups: the part before "@", the domain name, and a suffix of
# two to four letters. The character classes list upper-case letters only, so
# the pattern is meant to be used together with re.IGNORECASE.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [66]:
# Collect every match of `pattern` per element; IGNORECASE is needed because
# the pattern itself only spells out upper-case letters. `re` is imported in
# the notebook's first cell together with the other modules.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [67]:
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches

Dave     (dave, google, com)
Steve    (steve, gmail, com)
Rob        (rob, gmail, com)
Wes                      NaN
dtype: object

In [68]:
data.str.extract(pattern, flags=re.IGNORECASE)

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


In [69]:
# Eight labels drawn from only two distinct strings.
values = pd.Series(['apple', 'orange', 'apple',
                    'apple'] * 2)

In [70]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [71]:
# Count occurrences of each distinct label. The Series method is used because
# the top-level pd.value_counts function is deprecated since pandas 2.1.
values.value_counts()

apple     6
orange    2
Name: count, dtype: int64

In [72]:
# `dim` is the table of distinct labels; `values` holds integer positions
# into that table. take() turns the positions back into labels.
dim = pd.Series(['apple', 'orange'])
values = pd.Series([0, 1, 0, 0] * 2)
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [74]:
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)

# Column order follows the key order of the dict. The integer draw for
# 'count' is listed before the uniform draw for 'weight' so the generator is
# consumed in that order.
df = pd.DataFrame({
    'basket_id': np.arange(N),
    'fruit': fruits,
    'count': rng.integers(3, 15, size=N),
    'weight': rng.uniform(0, 4, size=N),
})
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [75]:
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [77]:
c = fruit_cat.array
dict(enumerate(c.categories))

{0: 'apple', 1: 'orange'}

In [79]:
# Build a Categorical directly from integer codes plus the labels those
# codes refer to (code i stands for categories[i]).
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes=codes, categories=categories)
my_cats_2

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']