In [21]:
# Pandas Web User Guide Follow along
# Source: https://pandas.pydata.org/docs/user_guide/10min.html
# Created 11/24/20

%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# IPython's ``ast_node_interactivity`` option selects which statements in a
# cell get their value displayed automatically. It is documented at:
# https://ipython.readthedocs.io/en/stable/config/options/terminal.html
# Options are: 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'
# Default is: 'last_expr'

from IPython.core.interactiveshell import InteractiveShell
# Author's note: of the non-default options, only 'last_expr_or_assign'
# could be made to work here, so that is the one selected below.
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"


In [2]:
def diag(*args):
    """Print quick diagnostics for each argument, then display it.

    DataFrames get their ``info()`` summary followed by a rich display.
    Every other object gets a separator line, its type, its length (when it
    has one), its ``info()`` summary (when it offers one), and finally a rich
    display, falling back to ``print`` when rich display is unavailable.

    Parameters
    ----------
    *args
        Any number of objects to inspect, in order.

    Returns
    -------
    None
    """
    for obj in args:
        if isinstance(obj, pd.DataFrame):
            # info() writes its report itself and returns None, so wrapping it
            # in print() would add a stray "None" line.
            obj.info()
            display(obj)
            continue

        print(f'{"-"*40}')
        print(f'Type: {type(obj)}')

        # Not every object is sized (scalars, 0-d arrays, file objects, ...).
        try:
            print(f'Length: {len(obj)}')
        except Exception:
            pass

        # Best effort: only some pandas objects provide a usable info().
        try:
            obj.info()
        except Exception:
            pass

        # Prefer the notebook's rich display; fall back to plain text.
        try:
            display(obj)
        except Exception:
            print(obj)
                
z = diag
d = display

<function IPython.core.display.display(*objs, include=None, exclude=None, metadata=None, transient=None, display_id=None, **kwargs)>

In [3]:
def read_df(text, dtype=None):
    """Build a DataFrame from the printed form of a DataFrame.

    Intended for tables copied from the pandas website tutorial: the first
    non-blank line holds the column labels, and every following non-blank
    line holds an index label followed by one value per column, all
    separated by whitespace.

    Parameters
    ----------
    text : str
        The whitespace-delimited table. Blank lines (including a trailing
        newline) are ignored.
    dtype : optional
        When given, the result is cast with ``DataFrame.astype(dtype)``.
        By default every value, and the index, is left as a string.

    Returns
    -------
    pandas.DataFrame
    """
    # Drop blank lines so a leading/trailing newline cannot produce an empty
    # row, which used to fail when its (missing) index label was read.
    rows = [line.split() for line in text.splitlines() if line.strip()]
    cols = rows[0] if rows else []
    body = rows[1:]
    index = [vals[0] for vals in body]
    array = [vals[1:] for vals in body]
    df = pd.DataFrame(array, index=index, columns=cols)
    if dtype is not None:
        df = df.astype(dtype)
    return df

In [None]:
dates = pd.date_range('20130101', periods=6)

In [None]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [None]:
z(df)

In [None]:
a = df.index.values
z(a)

In [None]:
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(["test", "train", "test", "train"]),
                    'F': 'foo'})

In [None]:
z(df2)

In [None]:
df3 = df2.copy()
rename_dict = {'A': '1', 'B':'2', 'C':'3', 'D':'4', 'E':'5', 'F':'6'}
print(rename_dict)
df3 = df2.rename(rename_dict, axis=1)
z(df3)

In [None]:
df.sort_index(axis=1, ascending=True, inplace=True)

In [None]:
df

In [None]:
df.sort_values(by='B')

In [None]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

In [None]:
z(dates)

In [None]:
df.loc[dates[0]]

In [None]:
a = df.loc['20130102', ['A', 'B']]
z(a)

In [None]:
df[df > 0]

In [None]:
a = df > 0
z(a)

In [None]:
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
z(df2)
a = df2['E'].isin(['two', 'four'])
z(a)

In [None]:
b = df2[a]
z(b)

In [None]:
z(df)

In [None]:
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
z(s1)

In [None]:
df['F'] = s1
z(df)

In [None]:
df2 = df.reindex(pd.date_range('20130101', periods=10))
z(df2)

In [None]:
df2['F'] = s1
z(df2)

In [None]:
z(df)

In [None]:
new_vals = np.arange(len(df))
new_vals_r = new_vals[::-1]
z(new_vals_r)

In [None]:
df.loc[:, 'D'] = new_vals_r

In [None]:
z(df)

In [None]:
df2 = df.copy()
df2[df2<0] = 10* df2
df2[df2>0] = 69

z(df2)

In [None]:
df.mean()

In [None]:
df.mean(axis=1)

In [None]:
z(dates)

In [None]:
s1 = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s2 = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

In [None]:
z(s1)
z(s2)

In [None]:
df2 = df.sub(s2, axis=0)
z(df2)

In [None]:
z(df)

In [None]:
df.apply(lambda x: x - x.mean())

In [None]:
df.apply(lambda x: x.max() - x.min())

In [None]:
s = pd.Series(np.random.randint(0, 7, size=100))
z(s)

In [None]:
a = s.value_counts()
z(a)

In [None]:
b = pd.DataFrame(a)
z(b)

In [None]:
c = b.index.sort_values()
c

In [None]:
d = b.reindex(index=c)

In [None]:
d

In [None]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
sc = s.str.lower()
sc

In [None]:
c = sc[5]
print(type(c))
print(repr(c))
dir(c)
display(c)

In [None]:
c == np.nan

In [None]:
np.nan

In [None]:
c

In [None]:
c is np.nan

In [None]:
type(np.nan)

In [None]:
dir(np.nan)

In [None]:
df = pd.DataFrame(np.random.randn(10, 4))
z(df)

In [None]:
df.index

In [None]:
df.columns

In [None]:
pieces = [df[:3], df[3:7], df[7:]]

In [None]:
type(pieces)

In [None]:
len(pieces)

In [None]:
for i in pieces:
    print(type(i))
    display(i)

In [None]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
z(left)
z(right)

In [None]:
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

In [None]:
z(df)

In [None]:
a = df.groupby('A').sum()
z(a)

In [None]:
b = df.groupby(['A', 'B']).sum()
z(b)

In [None]:
c=b.index
z(c)

In [None]:
a = [['bar', 'bar', 'baz', 'baz',
      'foo', 'foo', 'qux', 'qux'],
     ['one', 'two', 'one', 'two',
      'one', 'two', 'one', 'two']]
z(a)

In [None]:
b = zip(*a)

In [None]:
b

In [None]:
for i in b:
    print(i)

In [None]:
tuples  = list(zip(*a))
z(tuples)

In [None]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [None]:
z(index)

In [None]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

In [None]:
z(df)

In [None]:
stacked = df.stack()
z(stacked)

In [None]:
stacked.index

In [None]:
a=stacked.unstack(0)
z(a)

In [None]:
a=stacked.unstack(1)
z(a)

In [None]:
a=stacked.unstack(2)
z(a)

In [None]:
a=stacked.unstack([0, 1])
z(a)

In [None]:
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['X', 'Y', 'Z'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})

In [None]:
z(df)

In [None]:
pt = pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'], aggfunc=np.sum)
display(pt)

In [None]:
rng = pd.date_range('1/1/2012', periods=1000, freq='S')
z(rng)

In [None]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
z(ts)

In [None]:
ts2 = ts.resample('5Min')
z(ts2)

In [None]:
vars(ts2)

In [None]:
ts3 = ts2.sum()
z(ts3)

In [None]:
ts.index.min()

In [None]:
ts.index.max()

In [None]:
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(np.random.randn(len(rng)), rng)
z(ts)

In [None]:
ts_utc = ts.tz_localize('UTC')
ts_utc

In [None]:
ts_utc.tz_convert('US/Eastern')

In [None]:
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
z(ts)

In [None]:
ps = ts.to_period()
z(ps)

In [None]:
ps.to_timestamp()

In [None]:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
z(prng)

In [None]:
ts = pd.Series(np.random.randn(len(prng)), prng)
z(ts)

In [None]:
ts1 = prng.asfreq('M', 'e') + 1
z(ts1)

In [None]:
ts2 = ts1.asfreq('H', 's') + 9
z(ts2)

In [None]:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
z(df)

In [None]:
df["grade"] = df["raw_grade"].astype("category")

In [None]:
z(df)

In [None]:
z(df['grade'])

In [None]:
# Rename the categories to more meaningful names. Assigning to
# ``.cat.categories`` is no longer supported by pandas; rename_categories
# returns a new Series, which is written back to the column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
z(df)

In [None]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium",
                                              "good", "very good"])
z(df)

In [None]:
z(df["grade"])

In [None]:
df.groupby("grade").size()

In [None]:
df.groupby("grade").count()

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.close('all')

In [None]:
data1 = np.random.randn(1000)
ts = pd.Series(data1,
               index=pd.date_range('1/1/2000', periods=len(data1)))
ts2 = ts.cumsum()
z(data1, ts, ts2)

In [None]:
fig1, ax1 = plt.subplots()
ts2.plot()

In [None]:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()

In [None]:
# fig2, ax2 = plt.subplots()
df.plot()
plt.legend(loc='best')

In [None]:
pwd

In [None]:
df.to_excel('foo.xlsx', sheet_name='RUSH')

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
z(s)

In [None]:
a = s[0]
z(a)

In [None]:
b = s[0:3]
z(b)

In [None]:
c = s > s.median()

In [None]:
z(c)

In [None]:
s.dtype

In [None]:
s.array

In [None]:
s[1:]

In [None]:
df

In [None]:
type(df)

In [None]:
z(df)

In [None]:
z(s)

In [None]:
a = s[1:]
b = s[:-1]
c = a + b
z(a, b, c)

In [None]:
s = pd.Series(np.random.randn(5), name='something')
z(s)

In [None]:
data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'U10')])
z(data)
data.dtype

In [None]:
a = np.zeros((2, ))
z(a)
a.dtype

b = np.zeros((2, 1))
z(b)
b.dtype

In [None]:
data[0]

In [None]:
data[0] = (1, 2., 'Hello')

In [None]:
z(data)

In [None]:
data[1] = (2, 3., "World")
z(data)

In [None]:
a = pd.DataFrame(data)
z(a)

In [None]:
# Named ``series_by_column`` rather than ``d`` so the ``d = display`` alias
# used by later cells is not overwritten.
series_by_column = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(series_by_column)
z(df)

In [None]:
df['one']

In [None]:
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
z(df)

In [None]:
del df['two']
three = df.pop('three')
z(three, df)

In [None]:
df['foo'] = 'bar'

In [None]:
a = df['one'][:2]
z(a)

In [None]:
b = df.iloc[:2].loc[:,'one']
z(b)

In [None]:
iris = pd.read_csv('iris.csv')
z(iris)

In [None]:
old = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
new = ['SepalLength',  'SepalWidth',  'PetalLength',  'PetalWidth', 'Name']
names = dict(zip(old, new))
z(names)

In [None]:
iris.rename(columns=names, inplace=True)

In [None]:
z(iris)

In [None]:
a = iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength'])
z(a)

In [None]:
df.loc['b']

In [None]:
# Named ``series_by_column`` rather than ``d`` so the ``d = display`` alias
# used by later cells is not overwritten.
series_by_column = {
    'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(series_by_column)
z(df)

df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
del df['two']
three = df.pop('three')
df['foo'] = 'bar'
df['one_trunc'] = df['one'][:2]
df.insert(1, 'bar', df['one'])

z(df)



In [None]:
df.loc['b']

In [None]:
df.iloc[2]

In [None]:
df  = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
df3 = df + df2
z(df, df2, df3)

In [None]:
a = df - df.iloc[0]
z(a)

In [None]:
index = pd.date_range('1/1/2000', periods=8)
z(index)

In [None]:
df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC'))
z(df)

In [None]:
a = df.sub(df['A'], axis=0)
b = df.sub(df['A'], axis=1)
z(df, a, b)


In [None]:
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)
display(df1, df2)

In [None]:
df1 & df2

In [None]:
df1 | df2

In [None]:
display(df1, -df1)

In [None]:
display(df1, ~df1)

In [None]:
# ``is`` compares object identity, and the two operators each build a new
# frame, so it is always False. Compare the values instead.
(-df1).equals(~df1)

In [None]:
df

In [None]:
display(df.T)

In [None]:
np.exp(df)

In [None]:
ser = pd.Series([4.5, 2, 3], index=['a', 'b', 'c'])
idx = pd.Index([4, 5, 6])
res = np.maximum(ser, idx)
z(ser, idx, res)

In [None]:
index = pd.date_range('1/1/2000', periods=8)
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(8, 3), index=index,
                 columns=['A', 'B', 'C'])
z(index, s, df)

In [None]:
df.columns = ['ah', 'bay', 'che']
z(df)

In [None]:
df.shape

In [None]:
df.columns

In [None]:
s

In [None]:
a = s.array

In [None]:
b = s.to_numpy()

In [None]:
z(a,b, (a-b), (b-a))

In [None]:
ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
ser_np1 = ser.to_numpy()
ser_np2 = ser.to_numpy(dtype=object)
ser_np3 = ser.to_numpy(dtype="datetime64[ns]")

z(ser, ser_np1, ser_np2, ser_np3)

In [None]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
z(df)

In [None]:
row = df.iloc[1]
column = df['two']
z(row, column)

In [None]:
df.sub(row, axis=1)

In [None]:
df.sub(column, axis=0)

In [None]:
# Resumed at
# https://pandas.pydata.org/docs/user_guide/basics.html
# Missing data / operations with fill values

In [None]:
z(df, df['one'], s)

In [None]:
s2 = s.drop('e')
s2['d'] = np.nan
z(s2)

In [None]:
a = df['one'] > s2
z(a)

In [None]:
b = df > s2
z(b)

In [None]:
c =  s2 > df
z(c)

In [None]:
a = (df > 0).all()
z(a)

In [None]:
b = pd.DataFrame(columns=list('ABC'))
z(b)

In [None]:
b.empty

In [None]:
df.bool()

In [None]:
c = pd.Series([False])
z(c)

In [None]:
c.bool()

In [None]:
# ``df.all()`` returns one boolean per column, so using it directly as an
# ``if`` condition is ambiguous; pandas raises ValueError. Catch and show it
# so the notebook keeps running.
try:
    if df.all():
        print('every column is all-True')
except ValueError as err:
    print(f'ValueError: {err}')

In [None]:
df.all()

In [None]:
df

In [None]:
z(df.count())
z(df.size)

In [None]:
df.count() == df.size

In [None]:
df1 = pd.DataFrame({'col': ['foo', 0, np.nan]})
df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0])
z(df1, df2)

In [None]:
df1.equals(df2)

In [None]:
df3 = df2.sort_index()
z(df3)

In [None]:
df1.equals(df3)

In [None]:
display(df1, df3)

In [None]:
# Element-wise comparison works the same way on a Series and on an Index.
# The result names avoid ``d``, which is the alias for ``display``.
a = pd.Series(['foo', 'bar', 'baz'])
b = pd.Index(['foo', 'bar', 'baz'])
series_is_foo = (a == 'foo')
index_is_foo = (b == 'foo')
z(a, b, series_is_foo, index_is_foo)


In [None]:
df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                    'B': [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                    'B': [np.nan, np.nan, 3., 4., 6., 8.]})
display(df1, df2)

In [None]:
df1

In [None]:
(df1 - df1.mean())

In [None]:
df_dict = {'one': [1.394981, 0.343054, 0.695246, np.NaN],
           'two': [1.772517, 1.912123, 1.478369, 0.279344],
           'three':[np.NaN, -0.050390, 1.227435, -0.613172]}
df_index = ['a', 'b', 'c', 'd']
df = pd.DataFrame(df_dict, index=df_index)
z(df)

In [None]:
df.mean(0)

In [None]:
df.mean(1)

In [None]:
ts_dev = (df - df.mean())
ts_stand = (df - df.mean()) / df.std()
display(df, df.mean(0), ts_dev, ts_stand)

In [None]:
# Subtract each row's mean from that row. The result is not stored in ``z``
# because ``z`` is the alias for ``diag`` that later cells keep calling.
row_dev = df.sub(df.mean(1), axis=0)
display(df, df.mean(1), row_dev)

In [None]:
series1 = pd.Series(np.random.randn(1000))

In [None]:
series1
series1[::2] = np.nan

In [None]:
z(series1)
z(series1.count())
z(series1.size)
series1.describe()

In [None]:
s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a'])

In [None]:
s

In [None]:
s.describe()

In [None]:
frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
z(frame)

In [None]:
frame.describe()

In [None]:
frame.describe(include='all')

In [None]:
s1 = pd.Series(np.random.randn(5))
z(s1)

In [None]:
 s1.idxmin(), s1.idxmax()

In [None]:
df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
d(df1)

In [None]:
d(df1.idxmin(axis=0))
d(df1.idxmin(axis=1))

In [None]:
list('edcba')

In [None]:
a = np.arange(6).reshape(2,3) + 10
z(a)

In [None]:
np.argmin(a, axis=0)

In [None]:
data = np.random.randint(0, 7, size=50)
d(data)

In [None]:
s = pd.Series(data)
a = s.value_counts()
d(a)

In [None]:
data = {"a": [2, 2, 1, 4], "b": ["x", "x", "y", "y"]}
frame = pd.DataFrame(data)
vc = frame.value_counts()
display(data, frame, vc)

In [None]:
z(vc)

In [None]:
s5 = pd.Series([1, 1, 1, 1, 3, 5, 5, 7, 7, 7])
s5.mode()

In [None]:
arr = np.random.randn(20)
factor = pd.cut(arr, 4)
z(factor)

In [None]:
factor = pd.cut(arr, [-5, -1, 0, 1, 5])
factor

In [None]:
factor[0]

In [None]:
vars(factor)

In [None]:
arr

In [None]:
df= pd.DataFrame({'number': np.random.randint(1, 100, 10)}) 
df['bins'] = pd.cut(x=df['number'], bins=[1, 20, 40, 60,  
                                          80, 100]) 
print(df) 

In [None]:
z(df['bins'].value_counts())

In [None]:
def extract_city_name(df):
    """Add a ``city_name`` column holding the text before the first comma.

    Example: a ``city_and_code`` of "Chicago, IL" gives a ``city_name`` of
    "Chicago". The frame is modified in place and also returned so the
    function can be used with ``DataFrame.pipe``.
    """
    pieces = df['city_and_code'].str.split(",")
    df['city_name'] = pieces.str[0]
    return df


def add_country_name(df, country_name=None):
    """Add a ``city_and_country`` column: ``city_name`` plus ``country_name``.

    The two strings are joined with no separator, e.g. "Chicago" with
    ``country_name='US'`` gives "ChicagoUS". The frame is modified in place
    and also returned so the function can be used with ``DataFrame.pipe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must already contain a string ``city_name`` column.
    country_name : str, optional
        Text appended to every city name. ``None`` (the default) appends
        nothing, so the default call no longer fails on ``str + None``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    col = 'city_name'
    suffix = '' if country_name is None else country_name
    df['city_and_country'] = df[col] + suffix
    return df


df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']})
z(df_p)

In [None]:
extract_city_name(df_p)

In [None]:
add_country_name(df_p, country_name='US')
d(df_p)

In [None]:
df

In [None]:
df_dict = {'one': [1.394981, 0.343054, 0.695246, np.NaN],
           'two': [1.772517, 1.912123, 1.478369, 0.279344],
           'three':[np.NaN, -0.050390, 1.227435, -0.613172]}
df_index = ['a', 'b', 'c', 'd']
df = pd.DataFrame(df_dict, index=df_index)
z(df)

In [None]:
df.apply(np.mean)

In [None]:
tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'],
                    index=pd.date_range('1/1/2000', periods=1000))
d(tsdf)

In [None]:
a = tsdf.idxmax()
b = tsdf.max()
z(a, b)

In [None]:
c = tsdf.loc[a]
z(c)

In [None]:
tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],
                    index=pd.date_range('1/1/2000', periods=10))
tsdf.iloc[3:7] = np.nan
z(tsdf)

In [None]:
 tsdf.transform(np.abs)

In [None]:
a = """        one       two     three
a  1.394981  1.772517       NaN
b  0.343054  1.912123 -0.050390
c  0.695246  1.478369  1.227435
d       NaN  0.279344 -0.613172"""
df = read_df(a)
display(df)

In [None]:
def f(x):
    """Return the number of characters in the string form of ``x``."""
    as_text = str(x)
    return len(as_text)

In [None]:
# ``df4`` was never created in this notebook. Build it from the frame parsed
# above; read_df leaves every value as a string, so cast to float first.
df4 = df.astype(float)
df4['one'].apply(np.sum)

In [None]:
df4['one'].map(f)

In [None]:
df4.applymap(f)

In [None]:
s = pd.Series(['six', 'seven', 'six', 'seven', 'eight'],
              index=['a', 'b', 'c', 'd', 'e'])
t = pd.Series({'six': 6., 'seven': 7.})
z(s,t)

In [None]:
s.map(t)

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
z(s)

In [None]:
s2 = s.reindex(['e', 'b', 'f', 'd'])
z(s2)

In [None]:
df

In [None]:
df.reindex(columns=['three', 'two', 'one'])

In [None]:
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s2 = s[1:]
z(s, s1, s2)

In [None]:
s1.align(s2)

In [None]:
s1.align(s2, join='outer')

In [None]:
s1.align(s2, join='inner')

In [None]:
s1.align(s2, join='left')

In [None]:
s1.align(s2, join='right')

In [None]:
z(df, df2)

In [None]:
df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan],
                    'B': [np.nan, 2., 3., np.nan, 6.]})
df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.],
                    'B': [np.nan, np.nan, 3., 4., 6., 8.]})
display(df1, df2)

In [None]:
df.align(df2, join='inner')

In [None]:
dft = """        one       two     three
a  1.394981  1.772517       NaN
b  0.343054  1.912123 -0.050390
c  0.695246  1.478369  1.227435
d       NaN  0.279344 -0.613172"""

df2t = """        one       two
a  1.394981  1.772517
b  0.343054  1.912123
c  0.695246  1.478369"""
df = read_df(dft)
df2 = read_df(df2t)
display(df, df2)

In [None]:
df.align(df2, join='inner')

In [None]:
df.align(df2, join='inner', axis=0)

In [None]:
df.align(df2, join='inner', axis=1)

In [None]:
rng = pd.date_range('1/3/2000', periods=8)
ts = pd.Series(np.random.randn(8), index=rng)
ts2 = ts[[0, 3, 6]]
d(rng, ts, ts2)

In [None]:
df = pd.DataFrame({'col1': np.random.randn(3),
                   'col2': np.random.randn(3)}, index=['a', 'b', 'c'])

In [None]:
for col in df:
    print(type(col), col)

In [None]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
s1 = s[:4]
s2 = s[1:]

In [None]:
display(s, df)

In [None]:
for i in s:
    print(type(i), i)

In [None]:
for i, val in s.items():
    print(type(i), i)
    print(type(val), val)

In [None]:
for col, series in df.items():
    print(type(col), col)
    print(type(series), '\n', series)

In [None]:
s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
z(s)

In [None]:
s.dt.strftime('%Y/%m/%d')

In [None]:
s

In [None]:
s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s'))
s

In [None]:
 s.dt.days

In [None]:
s.dt.seconds

In [None]:
s.dt.components

In [None]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
                dtype="string")

In [None]:
s

In [None]:
s.str.lower()

In [None]:
df = pd.DataFrame({
    'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
    'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
    'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})

unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'],
                         columns=['three', 'two', 'one'])
display(df, unsorted_df)

In [None]:
unsorted_df.sort_index()

In [None]:
s = pd.DataFrame({
    "a": ['B', 'a', 'C'],
    "b": [1, 2, 3],
    "c": [2, 3, 4]})
s1 = s.set_index(list("ab"))
d(s,s1)

In [None]:
s1.sort_index(level="a")

In [None]:
df1 = pd.DataFrame({'one': [2, 1, 1, 1],
                    'two': [1, 3, 2, 4],
                    'three': [5, 4, 3, 2]})
d(df1)

In [None]:
df1.sort_values(by='two')

In [None]:
s

In [None]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
              dtype="string")
d(s)

In [None]:
s.str.lower()

In [None]:
s.sort_values()

In [None]:
df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]})
d(df)

In [None]:
df.sort_values(by='a')

In [None]:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2),
                                ('b', 2), ('b', 1), ('b', 1)])
idx.names = ['first', 'second']
df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)},
                        index=idx)
z(idx, df_multi)

In [None]:
s = pd.Series(np.random.permutation(10000))
# d(s)

In [None]:
%%timeit
s.sort_values()

In [None]:
%%timeit
s.nsmallest(3), s.nlargest(3)

In [None]:
df1.columns = pd.MultiIndex.from_tuples([('a', 'one'),
                                         ('a', 'two'),
                                         ('b', 'three')])
z(df1)

In [None]:
# df1.sort_values(by=(('a', 'two'), ('a', 'one')))

In [None]:
dft = pd.DataFrame({'A': np.random.rand(3),
                   'B': 1,
                   'C': 'foo',
                   'D': pd.Timestamp('20010102'),
                   'E': pd.Series([1.0] * 3).astype('float32'),
                   'F': False,
                   'G': pd.Series([1] * 3, dtype='int8')})

In [None]:
d(dft)

In [None]:
d(dft.dtypes)

In [None]:
dft['H']=True

In [None]:
dft.dtypes.value_counts()

In [None]:
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
d(df1)

In [None]:
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
                    'B': pd.Series(np.random.randn(8)),
                    'C': pd.Series(np.array(np.random.randn(8),
                                            dtype='uint8'))})
z(df2)

In [None]:
a = df2.dtypes
a.name = 'MF Type'
z(a)

In [None]:
z(df1, df2)

In [None]:
df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
z(df3)

In [None]:
import datetime
df = pd.DataFrame([[1, 2],
                  ['a', 'b'],
                  [datetime.datetime(2016, 3, 2),
                   datetime.datetime(2016, 3, 2)]])
dft = df.T
z(df, dft)

In [None]:
dft.dtypes

In [None]:
dft.infer_objects().dtypes

In [None]:
m = ['1.1', 2, 3]

In [None]:
m

In [None]:
type(m)

In [None]:
# I/O User Guide
# https://pandas.pydata.org/docs/user_guide/io.html
# Started: 11/27/20

In [None]:
string ='This is initial string.'

In [None]:
file = StringIO(string) 
z(file)

In [None]:
a = file.read()
z(a)

In [None]:
file.write(" Welcome to geeksforgeeks.")

In [None]:
z(a, file)

In [None]:
dir(file)

In [None]:
repr(file.getvalue())

In [None]:
file.seek(0)

In [None]:
print('The string after writing is:', file.read())  

In [None]:
data = ('a,b,c,d\n'
        '1,2,3,4\n'
        '5,6,7,8\n'
        '9,10,11')
z(data)
print(data)

In [None]:
df = pd.read_csv(StringIO(data), dtype=object)
z(df)

In [None]:
df2 = pd.read_csv(StringIO(data))
z(df2)

In [None]:
df = pd.read_csv(StringIO(data),
                 dtype={'b': object, 
                        'c': np.float64, 
                        'd': 'Int64'})
z(df)

In [None]:
a = df['d'][2]

In [None]:
z(a)

In [None]:
data = ("col_1\n"
        "1\n"
        "2\n"
        "'A'\n"
        "4.22")
z(data)
print(data)

In [None]:
df = pd.read_csv(StringIO(data), converters={'col_1': str})
z(df)

In [None]:
df['col_1'].apply(type).value_counts()

In [None]:
df2 = pd.read_csv(StringIO(data))
z(df2)

In [None]:
df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')
z(df2)

In [None]:
df2['col_1'].apply(type).value_counts()

In [None]:
data = ('a,b,c\n'
        '1,2,3\n'
        '4,5,6\n'
        '7,8,9')
print(data)

In [None]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'])

In [None]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0)

In [None]:
data = ('skip this skip it\n'
        'a,b,c\n'
        '1,2,3\n'
        '4,5,6\n'
        '7,8,9')
print(data)

In [None]:
pd.read_csv(StringIO(data), header=1)

In [None]:
pd.read_csv(StringIO(data), header=2)

In [None]:
data = ('a,b,a\n'
        '0,1,2\n'
        '3,4,5')
print(data)

In [None]:
pd.read_csv(StringIO(data))

In [None]:
# a = pd.read_csv(StringIO(data), mangle_dupe_cols=False)
z(a)

In [None]:
data = ('\n'
        'a,b,c\n'
        '  \n'
        '# commented line\n'
        '1,2,3\n'
        '\n'
        '4,5,6')
print(data)

In [None]:
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')
print(data)

In [None]:
a = pd.read_csv(StringIO(data), index_col=False)
d(a)

In [None]:
text01 = """id8141    360.242940   149.910199   11950.7
id1594    444.953632   166.985655   11788.4
id1849    364.136849   183.628767   11806.2
id1230    413.836124   184.375703   11916.8
id1948    502.953953   173.237159   12468.3"""
print(text01)

In [None]:
df01 = pd.read_fwf(StringIO(text01), header=None)
df02 = pd.read_fwf(StringIO(text01))

d(df01, df02)

In [None]:
a = df01.to_string()
print(a)

In [None]:
type(a)

In [None]:
fn = 'MLBPlayerSalaries.xlsx'
df = pd.read_excel(fn, sheet_name='MLBPlayerSalaries', index_col='Player')


In [None]:
d(df)

In [None]:
xlsx = pd.ExcelFile(fn)

In [None]:
df2 = pd.read_excel(xlsx, 'data1')
z(df2)

In [None]:
df2 = pd.read_excel(fn, sheet_name=None)
z(df2)

In [None]:
df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]},
               index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']]))
z(df)

In [None]:
df.to_excel('df1_output.xlsx')

In [None]:
df

In [None]:
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
                    'B': pd.Series(np.random.randn(8)),
                    'C': pd.Series(np.array(np.random.randn(8),
                                            dtype='uint8'))})
z(df2)

In [None]:
# Selecting rows with and without an explicit "all columns" slice gives
# equal, but distinct, frames.
a = df2.loc[[0, 1]]
b = df2.loc[[0, 1], :]
print(a == b)
# The builtin all() would iterate over the column labels, not the values;
# reduce the boolean frame over both axes instead.
print((a == b).all().all())
print(a is b)
d(a)

In [None]:
df2.loc[[0, 1], ['B', 'C']]

In [None]:
df2.loc[:, ['B', 'C']]

In [None]:
indices = list('abcdefgh')
dates = pd.date_range('1/1/2000', periods=8)

df = pd.DataFrame(np.random.randn(8, 4),
                  index=dates, columns=['A', 'B', 'C', 'D'])

df2 = pd.DataFrame(np.random.randn(8, 4),
                  index=indices, columns=['A', 'B', 'C', 'D'])

z(df, df2)

In [None]:
print(df2['A'])
# df['a']  # gives an error

In [None]:
s = df['A']
s[dates[5:7]]

In [None]:
df

In [None]:
df_text = """                   A         B         C         D
2000-01-01  0.469112 -0.282863 -1.509059 -1.135632
2000-01-02  1.212112 -0.173215  0.119209 -1.044236
2000-01-03 -0.861849 -2.104569 -0.494929  1.071804
2000-01-04  0.721555 -0.706771 -1.039575  0.271860
2000-01-05 -0.424972  0.567020  0.276232 -1.087401
2000-01-06 -0.673690  0.113648 -1.478427  0.524988
2000-01-07  0.404705  0.577046 -1.715002 -1.039268
2000-01-08 -0.370647 -1.157892 -1.344312  0.844885"""

In [None]:
df = read_df(df_text)

In [None]:
d(df)

In [None]:
sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()

In [None]:
dfa.A

In [None]:
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column

In [None]:
dfa

In [None]:
s

In [None]:
s1 = s[:5]
s2 = s[5]
z(s1, s2)

In [None]:
df[:3]

In [None]:
df.iloc[3]

In [None]:
dfl = pd.DataFrame(np.random.randn(5, 4),
                   columns=list('ABCD'),
                   index=pd.date_range('20130101', periods=5))

In [None]:
d(dfl)

In [None]:
dfl['2013-01-03':'2013-01-05']

In [None]:
dfl[2:4]

In [None]:
# A bare ``start:stop`` is only valid inside square brackets; outside them
# the same thing is spelled as a slice object.
slice('2013-01-03', '2013-01-05')

In [None]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
d(s1)

In [None]:
s1['a':'c']

In [None]:
s1.loc['a':'c']

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                   index=list('abcdef'),
                   columns=list('ABCD'))

In [None]:
d(df1)

In [None]:
df2 = df1.copy()
df2.loc[['a', 'd'], 'A':'C'] = False
d(df2)

In [None]:
df1.loc['a']

In [None]:
t1 = df1.loc['a'] > 0
t2 = df1.loc[:, 'C'] > 0
d(t1, t2)

In [None]:
df2 = df1.copy()
df2.loc[t2, t1] = True
d(df2)

In [None]:
df2 = df1.copy()
df2.loc[t2, 'C':'D'] = True
d(df2)

In [None]:
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])

In [None]:
d(s)

In [None]:
s[2]

In [None]:
s.loc[2]

In [None]:
s.loc[2:4]

In [None]:
s.loc[2:]

In [None]:
s

In [None]:
s[3:]

In [None]:
s.loc[3:]

In [None]:
s.loc[:5]

In [None]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
d(s1)

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                  index=list(range(0, 12, 2)),
                  columns=list(range(0, 8, 2)))
d(df1)

In [None]:
df1.iloc[1, 1]

In [None]:
df1.iloc[1]

In [None]:
df1.iloc[1, :]

In [None]:
df1.iloc[:, 1]

In [None]:
df1.iloc[3:2]

In [None]:
s.iloc[8:10]

In [None]:
dfl = pd.DataFrame(np.random.randn(5, 2), columns=list('AB'))

In [None]:
df1

In [None]:
df1 = pd.DataFrame(np.random.randn(6, 4),
                  index=list('abcdef'),
                  columns=list('ABCD'))
d(df1)

In [None]:
df0_text = """          A         B         C         D
a -0.023688  2.410179  1.450520  0.206053
b -0.251905 -2.213588  1.063327  1.266143
c  0.299368 -0.863838  0.408204 -1.048089
d -0.025747 -0.988387  0.094055  1.262731
e  1.289997  0.082423 -0.055758  0.536580
f -0.489682  0.369374 -0.034571 -2.484478"""
df0 = read_df(df0_text)
df1 = df0.astype(float)
z(df1)

In [None]:
a = df1['A'] > 0
d(a)

In [None]:
df1.loc[a, :]

In [None]:
def a_gt_0(df):
    """Return the boolean mask of rows whose 'A' value is greater than zero.

    Written as a function so it can be passed directly to ``.loc`` as a
    callable indexer.
    """
    column_a = df['A']
    return column_a > 0

In [None]:
a_gt_0(df1)

In [None]:
df1.loc[a_gt_0]

In [None]:
def always_d(df):
    """Callable indexer that ignores ``df`` and always picks the label 'D'."""
    label = 'D'
    return label

In [None]:
df1.loc[:, always_d]

In [None]:
df1.columns

In [None]:
df1.columns[0]

In [None]:
ser = df1['A']
z(ser)

In [None]:
def print_series(s):
    """Show diagnostics for ``s`` via ``z`` and return the ``s > 0`` mask.

    Used as a callable indexer (``ser.loc[print_series]``) so the diagnostics
    reveal what object pandas actually passes to the callable.
    """
    z(s)
    return s > 0

In [None]:
ser.loc[print_series]

In [None]:
s = pd.Series([1, 2, 3])

In [None]:
# Label 3 is not in s.index (0, 1, 2). With a list indexer, .loc raises KeyError
# when a requested label is missing, which would stop a Restart & Run All here.
# The error is the point of this cell, so catch it and show the message instead.
try:
    display(s.loc[[1, 2, 3]])
except KeyError as err:
    print(f'KeyError: {err}')

In [None]:
labels = [1, 2, 3]

In [None]:
s.index.intersection(labels)

In [None]:
 s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])

In [None]:
z(s)

In [None]:
labels = ['c', 'd']

In [None]:
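# Left commented out on purpose: s has a duplicate 'a' label, and reindex raises
# ValueError when the index being reindexed contains duplicates.
# s.reindex(labels)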

In [None]:
s.index.intersection(labels)

In [None]:
s.loc[s.index.intersection(labels)]

In [None]:
s1 = s.loc[s.index.intersection(labels)].reindex(labels)
print(s1)

In [None]:
s

In [None]:
s.index

In [None]:
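# Left commented out on purpose: s still carries the duplicate 'a' label, so
# reindex raises ValueError even though 'b' and 'c' themselves are unique.
# s.reindex(['b', 'c'])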

In [None]:
s2 = pd.Series(np.arange(4), index=['a', 'z', 'b', 'c'])
s2.index

In [None]:
s2.reindex(['z', 'c'])

In [None]:
df1

In [None]:
df1.reindex(columns=['D', 'A'])

In [None]:
df1.reindex(index=['d', 'e', 'f'])

In [None]:
df1.reindex(index=['d', 'f'])

In [None]:
s = pd.Series([0, 1, 2, 3, 4, 5])

In [None]:
s

In [None]:
s = pd.Series(range(-3, 4))

In [None]:
a = (s < -1)
b = (s > 0.5)
c = a | b
z(a, b, c)

In [None]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
                    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
                    'c': np.random.randn(7)})
z(df2)

In [None]:
df = pd.DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
                 'ids2': ['a', 'n', 'c', 'n']})
d(df)

In [None]:
values = ['a', 'b', 1, 3]
df.isin(values)

In [None]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
z(s)

In [None]:
s[s > 0]

In [None]:
a = s.where(s > 0)
z(a)

In [None]:
df

In [None]:
indices = list('abcdefgh')
dates = pd.date_range('1/1/2000', periods=8)

df = pd.DataFrame(np.random.randn(8, 4),
                  index=dates, columns=['A', 'B', 'C', 'D'])

df2 = pd.DataFrame(np.random.randn(8, 4),
                  index=indices, columns=['A', 'B', 'C', 'D'])
# z(df, df2)
d(df)

In [None]:
df.where(df > 0, -99)

In [None]:
df

In [None]:
df.loc[:, 'B':'A']

In [None]:
 df2 = df.copy()

In [None]:
df2

In [None]:
df2[1:4]

In [None]:
df2[1:4] > 0

In [None]:
df2[df2[1:4] > 0]

In [None]:
df2[df2[1:4] > 0] = 3
df2

In [None]:
df2 = df.copy()
df2

In [None]:
df2.where(df2 > 0, df2['A'], axis='index')

In [None]:
df3 = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6],
                    'C': [7, 8, 9]})
d(df3)

In [None]:
df3.where(lambda x: x > 4, lambda x: x + 10)

In [None]:
df3.where(lambda x: x > 4, lambda x: x + 10)

In [None]:
t1 = """          a         b         c
0  0.438921  0.118680  0.863670
1  0.138138  0.577363  0.686602
2  0.595307  0.564592  0.520630
3  0.913052  0.926075  0.616184
4  0.078718  0.854477  0.898725
5  0.076404  0.523211  0.591538
6  0.792342  0.216974  0.564056
7  0.397890  0.454131  0.915716
8  0.074315  0.437913  0.019794
9  0.559209  0.502065  0.026437"""
# read_df returns every cell as a string; cast to float (as done for df0 earlier)
# so the comparisons and timings below operate on numbers rather than on text.
df = read_df(t1).astype(float)

In [None]:
df

In [None]:
# The same row filter written two ways: boolean-mask indexing and DataFrame.query.
a = df[(df['a'] < df['b']) & (df['b'] < df['c'])]
b = df.query('(a < b) & (b < c)')
# Built-in all() over a DataFrame iterates its column labels, so all(a == b)
# was True no matter what the cells held. DataFrame.equals compares the
# actual contents of the two results.
a.equals(b)

In [None]:
%%timeit
a = df[(df['a'] < df['b']) & (df['b'] < df['c'])]

In [None]:
%%timeit
b =  df.query('(a < b) & (b < c)')

In [None]:
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})

In [None]:
df

In [None]:
df.query('a in b')

In [None]:
df.query('a in b and c < d')

In [None]:
d(df)
df.query('b == ["a", "b", "c"]')

In [None]:
df2 = pd.DataFrame({'a': ['one', 'one', 'two', 'two', 'two', 'three', 'four'],
                    'b': ['x', 'y', 'x', 'y', 'x', 'x', 'x'],
                    'c': np.random.randn(7)})

In [None]:
df2

In [None]:
df2.duplicated('a')

In [None]:
df3 = pd.DataFrame({'a': np.arange(6),
                    'b': np.random.randn(6)},
                    index=['a', 'a', 'b', 'c', 'b', 'a']) 
df3

In [None]:
s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

In [None]:
s.get('a')  # equivalent to s['a']

In [None]:
s.get('x', default=-1)

In [None]:
dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D'])
dflookup

In [None]:
# DataFrame.lookup is deprecated and no longer available in current pandas.
# Do the same pairwise (row label, column label) pick by translating the labels
# to positions and indexing the underlying NumPy array.
lookup_rows = list(range(0, 10, 2))
lookup_cols = ['B', 'C', 'A', 'B', 'D']
dflookup.to_numpy()[dflookup.index.get_indexer(lookup_rows),
                    dflookup.columns.get_indexer(lookup_cols)]

In [None]:
dflookup.loc[list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']]

In [22]:
index = pd.Index(['e', 'd', 'a', 'b'], name='rumsticky')

Index(['e', 'd', 'a', 'b'], dtype='object', name='rumsticky')

In [None]:
'd' in index

In [None]:
ind = pd.Index([1, 2, 3])

In [None]:
ind = pd.Index([1, 2, 3])
ind

In [None]:
a = ind.rename("apple")
z(a)

In [None]:
ind.set_names(["apple"], inplace=True)
ind

In [None]:
ind.name = "bob"
ind

In [None]:
index = pd.MultiIndex.from_product(
    [range(3), ['one', 'two']], names=['first', 'second'])
index

In [None]:
index.levels[1]

In [None]:
# Public accessor for the array of index tuples; the private _tuples attribute
# is an internal cache that is not guaranteed to be populated or to exist.
index.values

In [None]:
# Pull the per-row labels of each level through the public API instead of
# unpacking the private _tuples cache.
lev0 = index.get_level_values(0).tolist()
lev1 = index.get_level_values(1).tolist()
d(lev0, lev1)

In [None]:
vars(index)

In [None]:
d(index.levels[0])
d(index.levels[1])

In [None]:
a = index.set_levels(["a", "b"], level=1)
d(a)

In [24]:
a = pd.Index(['c', 'b', 'a'])
b = pd.Index(['c', 'e', 'd'])
z(a, b)
# Use the named set methods: the |, & and ^ operators on Index are deprecated
# as set operations and act as element-wise logical operators in newer pandas.
z(a.union(b))
z(a.intersection(b))
z(a.difference(b))
z(a.symmetric_difference(b))

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 3


Index(['c', 'b', 'a'], dtype='object')

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 3


Index(['c', 'e', 'd'], dtype='object')

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 5


Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 1


Index(['c'], dtype='object')

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 2


Index(['a', 'b'], dtype='object')

----------------------------------------
Type: <class 'pandas.core.indexes.base.Index'>
Length: 4


Index(['a', 'b', 'd', 'e'], dtype='object')

In [26]:
idx1 = pd.Index([1, 2, 3, 4])
idx2 = pd.Index([2, 3, 4, 5])
# symmetric_difference keeps the labels found in exactly one of the two indexes.
# The ^ operator is deprecated for this and is element-wise XOR in newer pandas.
idx3 = idx1.symmetric_difference(idx2)
z(idx1, idx2, idx3)

----------------------------------------
Type: <class 'pandas.core.indexes.numeric.Int64Index'>
Length: 4


Int64Index([1, 2, 3, 4], dtype='int64')

----------------------------------------
Type: <class 'pandas.core.indexes.numeric.Int64Index'>
Length: 4


Int64Index([2, 3, 4, 5], dtype='int64')

----------------------------------------
Type: <class 'pandas.core.indexes.numeric.Int64Index'>
Length: 2


Int64Index([1, 5], dtype='int64')

In [27]:
idx1 = pd.Index([1, np.nan, 3, 4])

Float64Index([1.0, nan, 3.0, 4.0], dtype='float64')

In [28]:
idx1.fillna(2)

Float64Index([1.0, 2.0, 3.0, 4.0], dtype='float64')

In [29]:
idx2 = pd.DatetimeIndex([pd.Timestamp('2011-01-01'),
                         pd.NaT,
                         pd.Timestamp('2011-01-03')])

DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03'], dtype='datetime64[ns]', freq=None)

In [31]:
df_text="""     a    b  c    d
0  bar  one  z  1.0
1  bar  two  y  2.0
2  foo  one  x  3.0
3  foo  two  w  4.0"""

'     a    b  c    d\n0  bar  one  z  1.0\n1  bar  two  y  2.0\n2  foo  one  x  3.0\n3  foo  two  w  4.0'

In [44]:
# read_df keeps every parsed value as text, so column 'd' holds strings such as
# '1.0' (the info() outputs further down report dtype object for it).
data = read_df(df_text)

Unnamed: 0,a,b,c,d
0,bar,one,z,1.0
1,bar,two,y,2.0
2,foo,one,x,3.0
3,foo,two,w,4.0


In [33]:
indexed1 = data.set_index('c')

Unnamed: 0_level_0,a,b,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
z,bar,one,1.0
y,bar,two,2.0
x,foo,one,3.0
w,foo,two,4.0


In [37]:
indexed2 = data.set_index(['a', 'b'])
z(indexed2)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, ('bar', 'one') to ('foo', 'two')
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   c       4 non-null      object
 1   d       4 non-null      object
dtypes: object(2)
memory usage: 204.0+ bytes
None


Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,z,1.0
bar,two,y,2.0
foo,one,x,3.0
foo,two,w,4.0


In [38]:
frame = data.set_index('c', drop=False)
z(frame)

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, z to w
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       4 non-null      object
 1   b       4 non-null      object
 2   c       4 non-null      object
 3   d       4 non-null      object
dtypes: object(4)
memory usage: 80.0+ bytes
None


Unnamed: 0_level_0,a,b,c,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


In [39]:
frame = frame.set_index(['a', 'b'], append=True)
z(frame)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4 entries, ('z', 'bar', 'one') to ('w', 'foo', 'two')
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   c       4 non-null      object
 1   d       4 non-null      object
dtypes: object(2)
memory usage: 346.0+ bytes
None


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,c,d
c,a,b,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


In [43]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,z,1.0
bar,two,y,2.0
foo,one,x,3.0
foo,two,w,4.0


In [45]:
data.set_index('c', drop=False)

Unnamed: 0_level_0,a,b,c,d
c,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


In [46]:
# inplace=True mutates `data` itself instead of returning a new frame, which is
# why this cell produces no output. The cell is not re-runnable as-is: once it
# has executed, 'a' and 'b' are index levels rather than columns.
data.set_index(['a', 'b'], inplace=True)

In [47]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
a,b,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,z,1.0
bar,two,y,2.0
foo,one,x,3.0
foo,two,w,4.0


In [48]:
data.reset_index()

Unnamed: 0,a,b,c,d
0,bar,one,z,1.0
1,bar,two,y,2.0
2,foo,one,x,3.0
3,foo,two,w,4.0


In [49]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,c,d
c,a,b,Unnamed: 3_level_1,Unnamed: 4_level_1
z,bar,one,z,1.0
y,bar,two,y,2.0
x,foo,one,x,3.0
w,foo,two,w,4.0


In [50]:
frame.reset_index(level=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,c,d
c,b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
z,one,bar,z,1.0
y,two,bar,y,2.0
x,one,foo,x,3.0
w,two,foo,w,4.0


In [51]:
# Four rows of single letters under two-level column labels: from_product pairs
# each of 'one'/'two' with each of 'first'/'second', giving four columns.
dfmi = pd.DataFrame([list('abcd'),
           list('efgh'),
           list('ijkl'),
           list('mnop')],
          columns=pd.MultiIndex.from_product(
              [['one', 'two'],
               ['first', 'second']]))

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


In [53]:
z(dfmi['one'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   4 non-null      object
 1   second  4 non-null      object
dtypes: object(2)
memory usage: 96.0+ bytes
None


Unnamed: 0,first,second
0,a,b
1,e,f
2,i,j
3,m,n


In [54]:
z(dfmi['one']['second'])

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


0    b
1    f
2    j
3    n
Name: second, dtype: object

In [55]:
# ('one') is just the string 'one' (parentheses without a comma do not make a
# tuple), so this selects every column under the top-level label 'one'.
z(dfmi.loc[:, ('one')])
# A real 2-tuple names a single column across both levels and yields a Series.
z(dfmi.loc[:, ('one', 'second')])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   4 non-null      object
 1   second  4 non-null      object
dtypes: object(2)
memory usage: 96.0+ bytes
None


Unnamed: 0,first,second
0,a,b
1,e,f
2,i,j
3,m,n


----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


0    b
1    f
2    j
3    n
Name: (one, second), dtype: object

In [56]:
dfb = pd.DataFrame({'a': ['one', 'one', 'two',
                          'three', 'two', 'one', 'six'],
                    'c': np.arange(7)})

Unnamed: 0,a,c
0,one,0
1,one,1
2,two,2
3,three,3
4,two,4
5,one,5
6,six,6


In [57]:
# Deliberate chained assignment: dfb['c'] is selected first, then the boolean
# mask assigns into that intermediate Series. pandas emits the warning recorded
# below; in this recorded run the write still reached dfb (next output shows 42).
dfb['c'][dfb['a'].str.startswith('o')] = 42

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfb['c'][dfb['a'].str.startswith('o')] = 42


In [58]:
dfb

Unnamed: 0,a,c
0,one,42
1,one,42
2,two,2
3,three,3
4,two,4
5,one,42
6,six,6


In [59]:
# Deliberate chained assignment in the other order: the boolean mask is applied
# first and the write to 'c' goes to that intermediate result. In the recorded
# run dfb is left unchanged (the following output still shows 42, not 19).
dfb[dfb['a'].str.startswith('o')]['c'] = 19

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfb[dfb['a'].str.startswith('o')]['c'] = 19


In [60]:
dfb

Unnamed: 0,a,c
0,one,42
1,one,42
2,two,2
3,three,3
4,two,4
5,one,42
6,six,6


In [61]:
dfc = pd.DataFrame({'a': ['one', 'one', 'two',
                          'three', 'two', 'one', 'six'],
                    'c': np.arange(7)})

Unnamed: 0,a,c
0,one,0
1,one,1
2,two,2
3,three,3
4,two,4
5,one,5
6,six,6


In [62]:
dfd = dfc.copy()

Unnamed: 0,a,c
0,one,0
1,one,1
2,two,2
3,three,3
4,two,4
5,one,5
6,six,6


In [63]:
mask = dfd['a'].str.startswith('o')

0     True
1     True
2    False
3    False
4    False
5     True
6    False
Name: a, dtype: bool

In [64]:
# Recommended form: one .loc call carrying both the row mask and the column
# label, so the assignment targets dfd directly (no warning is recorded here).
dfd.loc[mask, 'c'] = 42

In [65]:
dfd

Unnamed: 0,a,c
0,one,42
1,one,42
2,two,2
3,three,3
4,two,4
5,one,42
6,six,6
