# Pandas 🐼 overview

## Create DataFrame from scratch

In [None]:
import pandas as pd
# Column-oriented input: each key becomes a column, each list holds that column's values
data = dict(
    Bedrooms=[3, 2, 4, 3],
    Size_sqft=[2000, 1500, 2500, 1800],
    Age_years=[10, 20, 5, 15],
)

# Build the DataFrame, then show its first rows followed by its Python type
df = pd.DataFrame(data)
print(df.head(), type(df), sep='\n')

## Create Series

In [None]:
# House prices, one value per house
prices = [500_000, 400_000, 650_000, 550_000]

# Wrap the list in a Series and give it the name 'Price'
price_series = pd.Series(data=prices, name='Price')

# Show the values alongside their index labels
print(price_series)

## Importing and loading data

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the Palmer Penguins dataset
# NOTE: per the seaborn docs, load_dataset fetches example data from seaborn's online
# repository (cached locally afterwards), so the first run needs an internet connection
penguins = sns.load_dataset('penguins')

## Data inspection and summarisation

In [None]:
# Display the first few rows of the dataset
# (head() with no argument returns the first 5 rows)
penguins.head()

In [None]:
# Display the last 10 rows
penguins.tail(10)

### Check data types

In [None]:
# Check the data types of each column
# (returns a Series mapping each column name to its dtype)
penguins.dtypes

### Get info of the DataFrame

In [None]:
# Get a concise summary of the DataFrame
# (prints column names, non-null counts, dtypes and memory usage)
penguins.info()

### Get quick summary statistics

In [None]:
# Get summary statistics for numerical columns
# (count, mean, std, min, quartiles and max per column)
penguins.describe()

### List column names

In [None]:
# List all the column names
# (returned as a pandas Index object)
penguins.columns

### Check the shape of the data

In [None]:
# Check the number of rows and columns in the DataFrame
# (shape is a tuple: (number_of_rows, number_of_columns))
penguins.shape

## Selecting, Filtering and Subsetting Data

### Select a single column

In [None]:
# Select a single column (species)
# (indexing with one column label returns a Series)
penguins['species']

### Select multiple columns

In [None]:
# Select multiple columns
# (indexing with a list of labels returns a DataFrame, hence the double brackets)
penguins[['species', 'island', 'bill_length_mm']]

### Filtering rows based on a condition

In [None]:
# Keep only the rows whose bill_length_mm exceeds 45;
# .gt(45) builds the same boolean mask as `> 45`
filtered_penguins = penguins[penguins['bill_length_mm'].gt(45)]
filtered_penguins

### Filter rows based on multiple conditions with `loc`





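Operators:
- Use `&` to combine multiple conditions with a logical AND.
- Use `|` to combine multiple conditions with a logical OR.
- Use `~` to negate a condition (logical NOT).

Wrap each condition in parentheses: `&`, `|` and `~` bind more tightly than comparisons such as `==` and `>`.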

#### AND filtering

In [None]:
# Keep rows that satisfy all three conditions at once:
# species is "Adelie" AND island is "Dream" AND bill_length_mm > 40
filtered_penguins = penguins.loc[
    penguins['species'].eq('Adelie')
    & penguins['island'].eq('Dream')
    & penguins['bill_length_mm'].gt(40)
]

filtered_penguins

In [None]:
from matplotlib import pyplot as plt
# Histogram (20 bins) of bill lengths for the filtered penguins
filtered_penguins['bill_length_mm'].plot.hist(bins=20, title='bill_length_mm')
# Hide the top and right borders of the current axes
plt.gca().spines[['top', 'right']].set_visible(False)

#### OR filtering

In [None]:
# Keep rows that satisfy at least one condition:
# species is "Adelie" OR bill_length_mm > 45
filtered_penguins_or = penguins.loc[
    penguins['species'].eq('Adelie') | penguins['bill_length_mm'].gt(45)
]

filtered_penguins_or


#### NOT condition

In [None]:
# Keep every row whose species is NOT "Gentoo":
# build the "is Gentoo" mask, then invert it with ~
filtered_penguins_not = penguins.loc[~penguins['species'].eq('Gentoo')]

filtered_penguins_not

### Filter rows with `iloc`

In [None]:
# Select rows from index position 20 to 30, and columns 0, 1, and 2 using iloc
# (iloc slices exclude the end position, so 20:31 is needed to include position 30)
subset_penguins = penguins.iloc[20:31, [0, 1, 2]]
subset_penguins

### Filtering rows with `numpy.where()`

In [None]:
import numpy as np

# Find the positions of rows where species is "Adelie" OR bill_length_mm > 45.
# np.where(condition) returns a tuple with one array of positions per dimension;
# the mask is 1-D, so [0] extracts the plain array of matching row positions
# instead of handing the whole tuple to iloc.
indices = np.where(
    (penguins['species'] == 'Adelie') |
    (penguins['bill_length_mm'] > 45)
)[0]
# Use these integer positions to select the matching rows
filtered_penguins_or_numpy = penguins.iloc[indices]
filtered_penguins_or_numpy

## Sorting and Ranking

### Sort by a single column



In [None]:
# Order rows from longest to shortest bill;
# sort_values returns a new DataFrame and leaves `penguins` untouched
sorted_penguins = penguins.sort_values('bill_length_mm', ascending=False)
sorted_penguins

### Sort by multiple columns

In [None]:
# Sort by species (ascending), then by bill_length_mm (descending) within each species;
# the `ascending` list lines up one-to-one with the `by` list
sorted_penguins = penguins.sort_values(
    by=['species', 'bill_length_mm'],
    ascending=[True, False],
)
sorted_penguins


## Handling Missing Data

### Check for missing data

In [None]:
# Count the missing values in each column
# (isna() marks missing cells as True; sum() adds them up per column)
penguins.isna().sum()

### Drop rows with missing values

In [None]:
# Remove every row that has at least one missing value;
# the result is a new DataFrame, `penguins` itself is not modified
penguins_dropped = penguins.dropna(axis=0, how='any')
penguins_dropped

### Imputing missing values (filling them in)

In [None]:
# Replace missing bill lengths with the column mean.
# NOTE: this overwrites the column on `penguins` itself, so later cells see the imputed values.
penguins['bill_length_mm'] = penguins['bill_length_mm'].fillna(
    value=penguins['bill_length_mm'].mean()
)
penguins

## Renaming columns

In [None]:
# Map old column names to new ones: drop the '_mm' suffix from both bill columns
new_names = dict(bill_length_mm='bill_length', bill_depth_mm='bill_depth')

# rename() returns a new DataFrame; `penguins` keeps its original column names
penguins_renamed = penguins.rename(columns=new_names)
print(penguins_renamed.head())

In [None]:
# Confirm that the rename mapping is a plain Python dict
if isinstance(new_names, dict):
    print("New names is a `dictionary` structure type")

## Grouping and aggregating data

### Group and aggregate by single column

In [None]:
# Group by species and calculate the mean for each numeric column
# (numeric_only=True leaves non-numeric columns out of the mean)
species_grouped = penguins.groupby('species').mean(numeric_only=True)
print(species_grouped)

### Group by multiple columns and aggregate

In [None]:
# Group by species and island and calculate the mean
# (one result row per (species, island) combination; numeric_only=True skips non-numeric columns)
grouped = penguins.groupby(['species', 'island']).mean(numeric_only=True)
print(grouped)

## Merging and joining DataFrames

In [None]:
# Lookup table with one row per species, used as the right-hand side of the merge
extra_data = pd.DataFrame(
    [
        {'species': 'Adelie', 'endangered_status': 'Least Concern'},
        {'species': 'Chinstrap', 'endangered_status': 'Near Threatened'},
        {'species': 'Gentoo', 'endangered_status': 'Least Concern'},
    ]
)

# Left join on 'species': every penguin row is kept and gains its species' status
merged_penguins = penguins.merge(extra_data, on='species', how='left')
merged_penguins


## Applying functions to DataFrame with `apply()`

### Applying a Lambda function

In [None]:
# Label each penguin 'Long' when its bill is over 45 mm, otherwise 'Short',
# and store the labels as a new 'bill_category' column on `penguins`
penguins['bill_category'] = penguins['bill_length_mm'].apply(
    lambda length: 'Long' if length > 45 else 'Short'
)
penguins


### Applying a custom function

In [None]:
# Define a custom function
def mass_flag(mass):
    """Classify a body mass in grams as 'Heavy' or 'Light'.

    Args:
        mass: Body mass in grams; may be missing (NaN or None).

    Returns:
        'Heavy' if mass is above 4000, 'Light' otherwise, or None when the
        mass is missing.
    """
    # A missing mass fails `mass > 4000` and would be silently labelled 'Light';
    # keep it missing instead so the flag does not invent information.
    if pd.isna(mass):
        return None
    return 'Heavy' if mass > 4000 else 'Light'

# Apply the custom function to the 'body_mass_g' column
# (apply calls mass_flag once per value; the results are stored as a new column on `penguins`)
penguins['mass_category'] = penguins['body_mass_g'].apply(mass_flag)
print(penguins.head())


## Adding and removing columns

### Add a new column

In [None]:
# Add a new column for flipper length in inches (1 mm = 0.03937 inches)
# (the multiplication is applied to every row; the result is stored as a new column on `penguins`)
penguins['flipper_length_inch'] = penguins['flipper_length_mm'] * 0.03937
print(penguins.head())

### Drop a column

In [None]:
# Remove 'bill_category'; drop() returns a new DataFrame and leaves `penguins` unchanged
penguins_dropped_col = penguins.drop(columns='bill_category')
print(penguins_dropped_col.head())

## Saving and Loading data

### Saving to CSV

In [None]:
# Save the penguins DataFrame to a CSV file
# (index=False leaves the row index out of the file)
penguins.to_csv('penguins_data.csv', index=False)

### Loading Saved CSV

In [None]:
# Read the penguins data from a CSV file
# (reads the file written by the to_csv cell; column dtypes are inferred again on load)
penguins_from_csv = pd.read_csv('penguins_data.csv')
print(penguins_from_csv.head())