# 📘 Pandas `DataFrame` — Commands & Examples (Extensive Cheat Sheet)

**Generated:** 2025-09-02 01:28 UTC · **Target pandas:** ≥ 2.1 (the examples use `DataFrame.map`, which was added in 2.1)

This notebook mirrors the HTML cheat sheet and focuses on the `pandas.DataFrame` API: creation, selection, transformation, aggregation, reshaping, I/O, time-series helpers, and styling.

> **Deprecations you should know**  
> • `DataFrame.append` was removed in pandas 2.0 — use `pd.concat` instead.  
> • `DataFrame.applymap` was deprecated in 2.1 — use `DataFrame.map` for elementwise functions.

---

## Setup

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
pd.__version__

## 1) Constructing DataFrames

In [None]:
# Dict keys become column labels; each list supplies that column's values
df = pd.DataFrame(
    {
        "city": ["Paris", "Lyon", "Lille"],
        "pop_k": [2148, 522, 232],
        "founded": [-52, 43, 640],  # example founding years (BCE/CE)
    }
)
df

In [None]:
# Each dict is one row; its keys become the column labels
rows = [
    {"a": 1, "b": 2},
    {"a": 3, "b": 4},
]
pd.DataFrame(rows)

In [None]:
# A 2-D ndarray supplies the values; column labels are passed separately
arr = np.array([[1, 2, 3], [4, 5, 6]])
pd.DataFrame(arr, columns=["a", "b", "c"])

In [None]:
# Structured ndarray: the field names ('x', 'y') become the columns
rec = np.array(
    [(1, 2), (3, 4)],
    dtype=[("x", "i4"), ("y", "i4")],
)
pd.DataFrame.from_records(rec)

In [None]:
# Build a new frame from an existing one: keep only column 'y', as a copy
base = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
sub = pd.DataFrame(base, columns=["y"], copy=True)
sub

## 2) Inspecting structure & metadata

In [None]:
# First three rows
df.head(n=3)

In [None]:
# Last two rows
df.tail(n=2)

In [None]:
# Three randomly chosen rows; the fixed seed makes the draw repeatable
df.sample(n=3, random_state=0)

In [None]:
# (n_rows, n_cols) tuple, row index, and column index
(df.shape, df.index, df.columns)

In [None]:
# Per-column dtypes, plus byte usage (deep=True also measures object contents)
(df.dtypes, df.memory_usage(deep=True))

In [None]:
# Concise summary of the frame; info() prints to stdout and returns None
df.info()

In [None]:
# Convert to a NumPy array. With mixed column dtypes (strings + ints here)
# the result is an object-dtype array, so it is a copy rather than a view.
vals = df.to_numpy()
vals

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
# `datetime_is_numeric` was removed in pandas 2.0 (datetime columns are now
# always summarized numerically), so passing it raises TypeError.
df.describe(include='all')

## 3) Selecting & filtering

In [None]:
# A single label returns a Series; a list of labels returns a DataFrame
(df["city"], df[["city", "pop_k"]])

In [None]:
# A plain slice selects rows by position (end is exclusive)
df[:2]

In [None]:
# Label-based selection
df_loc = df.set_index('city')
(
    df_loc.loc['Paris'],                  # one row label -> Series
    df_loc.loc['Paris', 'pop_k'],         # row + column label -> scalar
    # Label slices include both endpoints and follow index order. 'Paris'
    # comes before 'Lyon' in this index, so the slice must run Paris -> Lyon;
    # the reverse ('Lyon':'Paris') silently selects nothing.
    df_loc.loc['Paris':'Lyon'],
    df_loc.loc[lambda d: d.pop_k > 500],  # callable producing a boolean mask
)

In [None]:
# Integer-position selection with iloc
(
    df.iloc[0],               # first row
    df.iloc[[0, 2], [1, 2]],  # rows 0 and 2, columns 1 and 2
    df.iloc[:, :2],           # every row, first two columns
)

In [None]:
# Single-value access: .at takes labels, .iat takes integer positions
(df.at[0, "city"], df.iat[1, 2])

In [None]:
# Keep only the rows where the boolean mask is True
df[df["pop_k"] > 500]

In [None]:
# Filter rows with a string expression evaluated against the column names
df.query(expr="pop_k > 500 and founded < 0")

In [None]:
# Membership test (isin) and range test (between, endpoints included)
(
    df[df["city"].isin(["Paris", "Lille"])],
    df[df["pop_k"].between(200, 600)],
)

## 4) Assigning & modifying

In [None]:
# Add a derived column by assignment, then derive another frame via assign()
df = df.copy()  # rebind to a copy before adding a column
df["pop_m"] = df["pop_k"] / 1000

# assign() returns a new frame and leaves `df` untouched
df2 = df.assign(density=lambda d: d.pop_k / 105.4)
df2.head()

In [None]:
# insert() mutates the frame: put a constant 'country' column at position 1
df3 = df.copy()
df3.insert(loc=1, column="country", value="FR", allow_duplicates=False)
df3.head()

In [None]:
# Rename one column, then name the row axis and the column axis
df_renamed = (
    df.rename(columns={"pop_k": "population_k"})
    .rename_axis(index="row_id", columns="field")
)
df_renamed.head()

In [None]:
# Replace values in a single column.
# DataFrame.replace has no `subset` argument (passing one raises TypeError);
# a nested dict {column: {old: new}} limits the replacement to that column.
df.replace({'city': {'Paris': 'PAR'}}).head()

In [None]:
# drop() returns a new frame; remove by column label or by row label
(
    df.drop(columns=["founded"]).head(),
    df.drop(index=[0]).head(),
)

In [None]:
# Move 'city' into the index, then turn it back into a regular column
df2 = df.set_index(keys=["city"])
df2.reset_index().head()

In [None]:
# update() modifies `a` in place, taking values from `b` wherever the
# index and column labels match
a = pd.DataFrame({"x": [1, np.nan]}, index=["r1", "r2"])
b = pd.DataFrame({"x": [9, 8]}, index=["r2", "r3"])
a.update(b)
a

## 5) Missing data

In [None]:
# Detect: boolean frames marking missing (isna) and present (notna) cells
(df.isna().head(), df.notna().head())

In [None]:
# Drop rows with any NA, columns with any NA, or rows with NA in 'city'
(
    df.dropna().head(),
    df.dropna(axis=1).head(),
    df.dropna(subset=["city"]).head(),
)

In [None]:
# Fill per column with a {column: value} dict, or fill one Series with its median
(
    df.fillna({"city": "Unknown"}).head(),
    df["pop_k"].fillna(df["pop_k"].median()),
)

In [None]:
# Fill gaps in a numeric Series by linear interpolation
df["pop_k"].interpolate(method="linear")

## 6) Dtypes & conversion

In [None]:
# Cast one column through Series.astype, another through a {column: dtype} map
conv = df.copy()
conv["pop_k"] = conv["pop_k"].astype("int64")
conv = conv.astype({"founded": "Int64"})  # capital-I 'Int64' is the nullable integer dtype
conv.dtypes

In [None]:
# Infer the best possible dtypes for each column, using the extension dtypes
# that support pd.NA; returns a new frame
df.convert_dtypes().dtypes

In [None]:
# Store a repetitive string column as a categorical (memory/speed)
cat = df.copy()
cat["city"] = cat["city"].astype("category")
cat.dtypes

In [None]:
# Parse date strings, then derive a column through the .dt accessor
dates = pd.to_datetime(["2024-01-01", "2024-01-02"])
pd.DataFrame({"d": dates}).assign(day_name=lambda d: d["d"].dt.day_name())

## 7) Arithmetic & comparisons

In [None]:
# assign() aligns the Series to the frame by index label, not by position
df_loc = df.set_index("city")
s = pd.Series({"Paris": 10, "Lyon": 5})
df_loc.assign(score=s)

In [None]:
# Method forms of the arithmetic operators; scalars are broadcast to every cell
df_num = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
(
    df_num.add(10),
    df_num.sub(df_num["a"], axis=0),   # axis=0: match the Series on the row index
    df_num.mul(df_num, fill_value=1),
    df_num.div(2),
    df_num.pow(2),
)

In [None]:
# Reverse ops (radd, rsub, ...): used when the DataFrame is the right-hand operand
10 + df_num

In [None]:
# Elementwise >= and == comparisons, then bound values to the range [2, 3]
(
    df_num.ge(2),
    df_num.eq(3),
    df_num.clip(lower=2, upper=3),
)

## 8) Descriptive statistics

In [None]:
# Column-wise reductions, extremes, quantiles, cumulative ops, and counts
(
    df_num.sum(),
    df_num.mean(),
    df_num.median(),
    df_num.std(),
    df_num.var(),
    (df_num.min(), df_num.max()),
    (df_num.idxmin(), df_num.idxmax()),
    df_num.quantile([0.25, 0.5, 0.75]),
    (df_num.cumsum(), df_num.cumprod(), df_num.cummin(), df_num.cummax()),
    df_num.nunique(),
    df_num.value_counts(),
)

In [None]:
# Pairwise correlation and covariance matrices between the columns
stocks = pd.DataFrame({"A": [1, 2, 3, 4], "B": [1, 1, 2, 3]})
(
    stocks.corr(method="pearson"),
    stocks.cov(),
)

In [None]:
# Rank values within each column; tied values share the average rank
stocks.rank(method="average")

## 9) Sorting, ranking, sampling

In [None]:
# Sort by population (descending), then by city name (ascending)
df.sort_values(
    by=["pop_k", "city"],
    ascending=[False, True],
).head()

In [None]:
# Sort rows by their index labels
df.sort_index().head()

In [None]:
# The two rows with the largest / smallest 'pop_k'
(
    df.nlargest(n=2, columns="pop_k"),
    df.nsmallest(n=2, columns="pop_k"),
)

In [None]:
# Random sample of half the rows; the fixed seed makes the draw repeatable
df.sample(frac=0.5, random_state=42)

## 10) Reshaping & pivoting

In [None]:
# Long -> wide with pivot(), then wide -> long again with melt()
sales = pd.DataFrame(
    {
        "city": ["Paris", "Paris", "Lyon", "Lyon"],
        "year": [2023, 2024, 2023, 2024],
        "rev": [10, 12, 4, 5],
    }
)

# One row per city, one column per year
wide = sales.pivot(index="city", columns="year", values="rev")
# reset_index() makes 'city' a column again so melt() can use it as the id
long = wide.reset_index().melt(id_vars="city", var_name="year", value_name="rev")
(wide, long.head())

In [None]:
# unstack() pivots an index level into the columns; stack() moves it back
mi = sales.set_index(["city", "year"])
mi_unstacked = mi.unstack("year")
mi_stacked = mi_unstacked.stack("year")
(mi_unstacked, mi_stacked)

In [None]:
# Transpose: swap the row and column axes
wide.T

## 11) Combine, join, merge, concat

In [None]:
# SQL-style joins on the shared key column 'k'
left = pd.DataFrame({"k": [1, 2, 3], "v_left": ["a", "b", "c"]})
right = pd.DataFrame({"k": [2, 3, 4], "v_right": ["x", "y", "z"]})
(
    left.merge(right, on="k", how="inner"),  # keys present in both frames
    left.merge(right, on="k", how="left"),   # every key from `left`
)

In [None]:
# join() matches on the index, so move the key 'k' into it first
l = left.set_index("k")
r = right.set_index("k")
l.join(r, how="outer")

In [None]:
# Stack rows (with a fresh 0..n-1 index) or place frames side by side
(
    pd.concat([left, left], ignore_index=True),
    pd.concat([l, r], axis=1),
)

In [None]:
# combine_first: keep A's values and fill A's missing cells from B
A = pd.DataFrame({"x": [1, None, 3]})
B = pd.DataFrame({"x": [9, 8, None]})
A.combine_first(B)

## 12) GroupBy & aggregation

In [None]:
# Group once, then aggregate: a single sum and a named-aggregation table
g = sales.groupby("city")
(
    g["rev"].sum(),
    g.agg(
        rev_sum=("rev", "sum"),
        rev_mean=("rev", "mean"),
    ),
)

In [None]:
# Group by two keys; each output column is named as (source column, function)
sales.groupby(["city", "year"]).agg(
    rev_sum=("rev", "sum"),
    rev_med=("rev", "median"),
)

In [None]:
# Per-city z-score: transform() returns group statistics aligned to the original rows
sales.assign(
    zscore=lambda d: (
        (d["rev"] - d.groupby("city")["rev"].transform("mean"))
        / d.groupby("city")["rev"].transform("std")
    )
)

In [None]:
# Count how often each 'rev' value occurs within each city
sales.groupby(by="city")["rev"].value_counts()

## 13) Rolling / Expanding / EWM

In [None]:
# Six daily observations, indexed by timestamp
ts = pd.DataFrame(
    {
        "t": pd.date_range("2024-01-01", periods=6, freq="D"),
        "x": [1, 2, 3, 4, 5, 6],
    }
).set_index("t")
# Fixed window of 3 rows vs. a 3-day time-offset window
(
    ts["x"].rolling(window=3).mean(),
    ts.rolling(window="3D").sum(),
)

In [None]:
# Running mean over all observations seen so far
running = ts["x"].expanding(min_periods=1)
running.mean()

In [None]:
# Exponentially weighted mean with span 3
ts["x"].ewm(
    span=3,
    adjust=False,
).mean()

## 14) Time series helpers

In [None]:
# Set (or convert to) a daily frequency on the index
ts = ts.asfreq("D")
# Slice rows by date-string labels
ts.loc["2024-01-02":"2024-01-04"]

In [None]:
# Time-of-day selection; both helpers require a datetime index
intraday = pd.DataFrame(
    {"v": range(5)},
    index=pd.date_range("2024-01-01 09:00", periods=5, freq="30min"),
)
(
    intraday.at_time("09:30"),
    intraday.between_time("09:30", "10:00"),
)

In [None]:
# Resampling: bucket the 30-minute rows into hourly bins and sum 'v'.
# The lowercase alias 'h' is used because uppercase 'H' is deprecated
# since pandas 2.2.
intraday.resample('1h').agg({'v': 'sum'})

## 15) I/O & serialization

> ⚠️ File operations write to your current working directory in the runtime. Uncomment to execute.

In [None]:
# CSV
# df.to_csv('out.csv', index=False)
# pd.read_csv('out.csv').head()

# Parquet / Feather (install pyarrow if needed)
# df.to_parquet('out.parquet')
# pd.read_parquet('out.parquet').head()

# df.to_feather('out.feather')
# pd.read_feather('out.feather').head()

# Excel
# df.to_excel('out.xlsx', sheet_name='Data', index=False)
# pd.read_excel('out.xlsx').head()

# JSON / HTML / SQL (requires connector)
# df.to_json('out.json', orient='records')
# df.to_html('out.html')
# pd.read_sql('SELECT 1', con)  # requires a DB connection

# Pickle (Python-specific, not cross-language)
# df.to_pickle('out.pkl')
# pd.read_pickle('out.pkl').head()

## 16) Index operations

In [None]:
# Build a sorted two-level index from 'city' and 'year', then rename its levels
df_idx = (
    sales.copy()
    .set_index(["city", "year"])
    .sort_index()
    .rename_axis(index=["City", "Year"])
)
df_idx.head()

In [None]:
# xs() selects every row matching one label on the named index level
(
    df_idx.xs("Paris", level="City"),
    df_idx.xs(2024, level="Year"),
)

In [None]:
# Reindex & align
# The year-start alias 'YS' replaces the bare 'Y' alias, which is deprecated
# since pandas 2.2 (renamed to 'YE'). 'YS' is accepted by older and newer
# releases alike; the labels are therefore 1 January of each year.
dates = pd.date_range('2024-01-01', periods=3, freq='YS')
reindexed = pd.DataFrame({'v': [1, 2, 3]}, index=dates).reindex(
    pd.date_range('2023-01-01', periods=5, freq='YS'),
    fill_value=0,  # labels absent from the original index (2023, 2027) get 0
)
reindexed

In [None]:
# Two ways to put 'Year' ahead of 'City' in the MultiIndex
(
    df_idx.swaplevel("City", "Year").sort_index().head(),
    df_idx.reorder_levels(["Year", "City"]).sort_index().head(),
)

## 17) Misc. utilities

In [None]:
# Apply a function to every cell (DataFrame.map supersedes applymap in pandas >= 2.1)
df_num.map(lambda value: value * 10)

In [None]:
# axis=0 passes each column to the function; axis=1 passes each row
(
    df_num.apply(np.sum, axis=0),
    df_num.apply(np.sum, axis=1),
)

In [None]:
# pipe() lets an arbitrary function take part in a method chain
(
    df_num
    .pipe(lambda frame: frame.assign(total=frame.sum(axis=1)))
    .query("total > 3")
    .sort_values("total")
)

In [None]:
# explode() emits one row per list element and repeats the other columns
df_list = pd.DataFrame(
    {
        "a": [1, 2],
        "tags": [["x", "y"], ["y"]],
    }
)
df_list.explode("tags", ignore_index=True)

In [None]:
# eval() with an assignment returns a new frame containing the extra column
df_eval = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df_eval.eval(expr="c = a + b")

In [None]:
# where() keeps cells where the condition holds; mask() replaces them instead
(
    df_num.where(df_num > 2, other=np.nan),
    df_num.mask(df_num % 2 == 0, other=-1),
)

## 18) Styling (quick)

> Styling is for presentation only; avoid using it inside computation chains.

In [None]:
# Build a Styler: thousands separators, highlight the maximum, hide the index
styled = (
    df.style
    .format({"pop_k": "{:,}"})
    .highlight_max(subset=["pop_k"], color="#fde68a")
    .hide(axis="index")
)
styled

---
### References
- Pandas DataFrame API: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html  
- Indexing/Selecting: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html  
- Full API Reference: https://pandas.pydata.org/pandas-docs/stable/reference/index.html