# Pandas Cheatsheet 

### Basics

In [None]:
# Imports
import pandas as pd

# File reading/writing
df = pd.read_csv("data.csv") # Load a CSV file into a DataFrame
# index=False keeps the row index out of the file; otherwise re-reading the
# CSV yields a spurious "Unnamed: 0" column.
df.to_csv("output.csv", index=False)

# Access columns and rows
df['column'] # Column as a Series
df.iloc[0] # First row, by integer position
df.loc[0, 'column'] # Single value, by row label and column name

# Get help
help(pd.read_csv) # Print the documentation of a function

### Data Inspection

In [None]:
df.index # Index (row labels) of the DataFrame
df.head() # Display first 5 rows
df.tail() # Display last 5 rows
df.shape # Returns (rows, columns)
df.columns # List column names
df.info() # Summary of the DataFrame
df.describe() # Statistical summary of numeric columns
df.dtypes # Data type of each column
df.isnull() # Boolean mask of missing values (alias of isna)
df.isna() # Boolean mask of missing values; chain .sum() for per-column counts
df["col"].any() # Check if any value in boolean column is True
df["col"].all() # Check if all values in boolean column are True
# numeric_only=True restricts the computation to numeric columns; without it,
# recent pandas versions raise when the frame contains e.g. string columns.
df.corr(numeric_only=True) # Correlation matrix of the numeric columns
df['col'].unique() # Unique values in a column
df['col'].sum() # Sum of a column
df['col'].min() # Minimum of a column
df['col'].max() # Maximum of a column
df['col'].mean() # Mean of a column
df['col'].std() # Standard deviation of a column
df['col'].idxmax() # Index label of the maximum value

### Data Selection

In [None]:
df['column'] # Select column as Series
df[['col1', 'col2']] # Select multiple columns as DataFrame
df.iloc[0, 1] # Select a single value by integer row and column position
df.loc[0, 'col'] # Select a single value by row label and column name
df.iloc[0:3] # Select the first three rows by position (end is exclusive)
df[df['col'] > 10] # Filter rows by condition (also with .loc)


# Filters

f = df['col'] > 10 # Boolean Series: True where the condition holds
df[f] # Select the rows where the filter is True

# Combined filters (the parentheses around each condition are required,
# because & and | bind more tightly than the comparison operators)
f = (df['col1'] > 10) & (df['col2'] == 'a') # logical 'and'
f = (df['col1'] > 10) | (df['col2'] == 'a') # logical 'or'

# Useful filter functions
f = df['col'].between(10, 20) # True where 10 <= value <= 20 (bounds inclusive by default)
f = df['col'].isin(list_of_values) # True where the value is contained in list_of_values

### Data Manipulation

In [None]:
# Apart from the column assignment on the next line, every call below returns
# a new object and leaves df unchanged; assign the result to keep it.
df['new_col'] = df['col1'] + df['col2'] # Add new column
df.rename(columns={'old_name': 'new_name'}) # Rename column
df.replace({1: 2, "a": "b"}) # Replace values
df.drop('col', axis=1) # Drop column
df.drop(1, axis=0) # Drop row with index label 1
df.dropna() # Drop rows with missing values
df.fillna(0) # Fill missing values with 0
df.sort_values(by='col') # Sort DataFrame by column
df.apply(foo, axis=1) # Apply function to each row (axis=0: to each column)
df['col'].apply(foo) # Apply function across values of a Series
df['col'].map(foo) # Apply function across values of a Series
df['col'].where(df['col'] > 0, 0) # Keep values where cond. is True, replace the rest with 0
df['col'].astype(int) # Convert column to integers
df['col'].astype('category') # Convert column to categorical dtype
# The .cat accessor only exists on categorical data, so convert first.
df['col'].astype('category').cat.codes # Integer code of each value's category
pd.cut(df['col'], 3) # Sort values into 3 equal-width bins

### Data Aggregation

In [None]:
df.groupby('col') # Group by column 'col' (returns a GroupBy object; follow with an aggregation)
df.agg({'col': 'sum'}) # Aggregate: apply the named function to the listed column
df.value_counts(normalize=False) # Frequency count of unique rows (normalize=True gives proportions)

### Merging and Joining

In [None]:
pd.merge(df1, df2, on='key', how='outer') # Merge dfs on a key column (outer: keep keys from both frames)
df1.join(df2) # Join DataFrames on index (left join by default)
pd.concat([df1, df2]) # Concatenate DataFrames (stacked row-wise by default)

### Dates and Time

In [None]:
# Convert values to Datetime (errors="coerce" turns values that cannot be parsed into NaT)
df["date"] = pd.to_datetime(df["date"], errors="coerce", format="%Y-%m-%d")
# Extract components from a Datetime column via the .dt accessor
df["date"].dt.dayofweek # Day of the week (Monday=0, Sunday=6)
df["date"].dt.day # Day of the month
df["date"].dt.month # Month (1-12)
df["date"].dt.year # Year
df["date"].dt.quarter # Quarter (1-4)

### Plotting

#### Pandas Plotting

In [None]:
df.plot(x='col1', y='col2') # Line plot (the default kind)
df['col'].plot(kind='hist') # Histogram of a single column
df.plot(x='col1', y=['col2', 'col3'], kind='bar') # Bar plot with one bar per y column
df.plot(x='col1', y='col2', kind='scatter') # Scatter plot

#### Matplotlib Plotting

In [None]:
import matplotlib.pyplot as plt

# Line plot (the label is what plt.legend() displays; without any labelled
# artist the legend stays empty and matplotlib emits a warning)
plt.plot(df['col1'], df['col2'], label='col2')

# Title, annotations, and prettifications
plt.title("Title")
plt.xlabel("X-axis label")
plt.ylabel("Y-axis label")
plt.grid() # Show grid lines
plt.legend() # Show a legend entry for every labelled artist

# Save plot
plt.savefig("plot.png") # Write the current figure to a file