In [None]:
1. Introduction to Pandas

What is Pandas?
Pandas is an open-source data manipulation and analysis library for Python. It provides data 
structures and functions needed to work with structured data seamlessly and intuitively.

Installation
You can install Pandas using pip:

pip install pandas

Basic Data Structures

Series: A one-dimensional labeled array capable of holding any data type.
DataFrame: A two-dimensional labeled data structure with columns of potentially different types.



In [None]:
2. Data Structures
Series
A Series is similar to a column in a table or an Excel spreadsheet. It has both a data array and an 
index array.

import pandas as pd

# Build a Series from a plain Python list; pandas assigns the labels 0..4
data = [1, 2, 3, 4, 5]
s = pd.Series(data)

# Show values together with their index labels
print(s)

# Look up a single element by its index label
print(s[0])  # Output: 1

# Same values, but labelled with letters instead of integers
s = pd.Series(data, index=list('abcde'))
print(s['a'])  # Output: 1


In [None]:
DataFrame
A DataFrame is a 2D structure with labeled axes (rows and columns). It's similar to a table in a 
database or a spreadsheet.

# Assemble the table column by column: each key becomes a column name
data = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'San Francisco', 'Los Angeles'],
)
df = pd.DataFrame(data)

# Show the whole table
print(df)

# A single column comes back as a Series
print(df['name'])  # Output: Series with names

# A single row, selected by its index label
print(df.loc[0])  # Output: First row of the DataFrame


In [None]:
3. DataFrame Basics

Creating DataFrames

You can create DataFrames from dictionaries, lists, numpy arrays, CSV files, Excel files, and more.

# Column-oriented input: a mapping of column name -> list of values
data = dict(
    name=['Alice', 'Bob', 'Charlie'],
    age=[25, 30, 35],
    city=['New York', 'San Francisco', 'Los Angeles'],
)
df = pd.DataFrame(data)

# Row-oriented input: one inner list per row, column names given separately
data = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'San Francisco'],
    ['Charlie', 35, 'Los Angeles'],
]
df = pd.DataFrame(data, columns=['name', 'age', 'city'])

# From a CSV file
# NOTE(review): the path is relative to the working directory, and the result
# rebinds df, discarding the frame built above. Confirm 'data.csv' ships with
# the notebook, otherwise this line raises FileNotFoundError.
df = pd.read_csv('data.csv')

# From an Excel file
# NOTE(review): df is rebound again, so the following cells operate on the
# contents of 'data.xlsx'. Its columns are not shown here; the later cells
# assume name/age/city exist — TODO confirm.
df = pd.read_excel('data.xlsx')



In [None]:
Indexing and Selecting Data
Pandas provides several ways to index and select data, such as .loc, .iloc, .at, and .iat.

# Using loc (label-based)
# 0 is treated as an index *label*; it is the first row only while the index
# is the default 0..n-1 range — assumed here, TODO confirm for the loaded data.
print(df.loc[0])  # First row

# Using iloc (integer-based)
# 0 is a *position*, so this is the first row whatever the index labels are.
print(df.iloc[0])  # First row

# Using at (label-based, scalar)
# Returns a single value addressed by row label and column label.
print(df.at[0, 'name'])  # First row, 'name' column

# Using iat (integer-based, scalar)
# Returns a single value addressed by row position and column position.
print(df.iat[0, 0])  # First row, first column


In [None]:
Adding and Modifying Columns
You can easily add new columns or modify existing ones.

# Adding a new column
# Assigning a list creates the column; the list length must match the number
# of rows — assumes df has exactly three rows, TODO confirm.
df['salary'] = [50000, 60000, 70000]

# Modifying an existing column
# Element-wise addition over the whole column; the result replaces 'age'.
df['age'] = df['age'] + 1


In [None]:
Dropping Data
You can drop rows or columns using the drop method.

# Dropping a column
# drop returns a new frame, which is rebound to df.
df = df.drop(columns=['salary'])

# Dropping a row
# Removes the row whose index label is 0 (raises KeyError if no such label).
df = df.drop(index=[0])


In [None]:
Renaming Columns and Index
You can rename columns and index using the rename method.

# Rename the 'name' column to 'full_name'.
# NOTE(review): the crosstab and melt cells further down still reference
# df['name']; run top to bottom, they will not find that column.
df = df.rename(columns={'name': 'full_name'})
# Relabel index 0 as 'first_row'.
# NOTE(review): the row labelled 0 was dropped in the previous cell, and rename
# ignores labels that are absent by default, so this is presumably a no-op.
df = df.rename(index={0: 'first_row'})


In [None]:
4. Data Inspection
Viewing Data
Pandas provides convenient methods to view your data.

# Display the first few rows (head() defaults to 5 rows)
print(df.head())

# Display the last few rows (tail() defaults to 5 rows)
print(df.tail())


In [None]:
Info and Description
You can get an overview of your DataFrame using info and describe.

# Info
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
df.info()

# Description
# describe() returns a DataFrame of summary statistics, so it is printed.
print(df.describe())


In [None]:
Shape and Dimensions
You can get the shape and dimensions of your DataFrame.

# Shape: a (number_of_rows, number_of_columns) tuple
print(df.shape)

# Dimensions: number of axes (2 for a DataFrame)
print(df.ndim)

# Size: total number of cells, i.e. rows * columns
print(df.size)


In [None]:
Checking for Missing Values
You can check for missing values in your DataFrame.

# Check for missing values
# isna() gives a same-shaped Boolean frame: True where a value is missing.
print(df.isna())
# Summing the Boolean frame counts the missing values in each column.
print(df.isna().sum())


In [None]:
Value Counts
You can get the count of unique values in a column.

# Value counts
# Number of rows per distinct 'city', most frequent first; missing values are
# excluded by default.
print(df['city'].value_counts())


In [None]:
5. Data Manipulation
Handling Missing Data
Detecting Missing Data
You can detect missing data using isna or its alias isnull; both return the same Boolean mask.

# Check for missing values
# isnull is an alias of isna, so both lines print the same Boolean frame.
print(df.isna())
print(df.isnull())


In [None]:
Filling Missing Data
You can fill missing values using fillna.

# Fill missing values with a specific value
# Only the 'age' column is touched: its missing entries become 0.
df['age'] = df['age'].fillna(0)


In [None]:
Dropping Missing Data
You can drop rows or columns with missing values using dropna.

# Drop rows with missing values
# With the defaults, any row holding at least one missing value is removed;
# the filtered frame is rebound to df.
df = df.dropna()


In [None]:
Replacing Values
You can replace specific values using replace.

# Replace specific values
# Whole-value match: cells equal to 'New York' become 'NYC'; other cells are
# left unchanged.
df['city'] = df['city'].replace('New York', 'NYC')


In [None]:
Filtering and Sorting
Filtering Data
You can filter data based on conditions.

# Filtering rows
# df['age'] > 30 is a Boolean Series; indexing with it keeps the True rows.
# df itself is not modified.
filtered_df = df[df['age'] > 30]
print(filtered_df)


In [None]:
Sorting Data
You can sort your DataFrame by columns using sort_values and by index using sort_index.

# Sort by age (ascending by default)
df = df.sort_values(by='age')

# Sort by index
# Reorders the rows by index label, which replaces the age ordering above.
df = df.sort_index()


In [None]:
Conditional Selection
You can use Boolean indexing to select data conditionally.

# Conditional selection
# Boolean mask again: keeps only the rows whose 'age' is below 30.
young_people = df[df['age'] < 30]


In [None]:
Grouping and Aggregation
Group By
You can group data by columns using groupby.

# Group by city
# Returns a GroupBy object; nothing is aggregated until an aggregation is
# requested on it (see the following cells).
grouped = df.groupby('city')


In [None]:
Aggregating Data
You can aggregate data using functions like mean, sum, min, max, etc.

# Calculate mean age for each city
# Uses the `grouped` object built in the previous cell; the result is a Series
# indexed by city.
print(grouped['age'].mean())


In [None]:
Applying Functions
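You can apply custom functions to groups using apply and agg. The example below passes a list of built-in aggregation names to agg; a custom function can be passed in the same way.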

# Custom aggregation
# Passing a list of function names yields one output column per function
# (mean, min, max), with one row per city.
print(grouped['age'].agg(['mean', 'min', 'max']))


In [None]:
6. Advanced Data Operations
Merging and Joining
Concatenation
You can concatenate DataFrames using concat.

# Two frames with the same columns ...
df1 = pd.DataFrame(dict(A=[1, 2], B=[3, 4]))
df2 = pd.DataFrame(dict(A=[5, 6], B=[7, 8]))
# ... stacked row-wise; each frame keeps its own index labels (0, 1, 0, 1)
result = pd.concat((df1, df2), axis=0)


In [None]:
Merging DataFrames
You can merge DataFrames using merge.

# Inner join on the shared 'key' column: only keys present in both frames
# ('A' and 'B') survive
df1 = pd.DataFrame(dict(key=['A', 'B', 'C'], value1=[1, 2, 3]))
df2 = pd.DataFrame(dict(key=['A', 'B', 'D'], value2=[4, 5, 6]))
merged_df = df1.merge(df2, on='key')
print(merged_df)


In [None]:
Joining DataFrames
You can join DataFrames using join.

# Joining DataFrames using the index
# Reuses df1/df2 from the merge cell and moves 'key' into the index of each.
df1 = df1.set_index('key')
df2 = df2.set_index('key')
# join defaults to a left join on the index: every key of df1 is kept, and
# key 'C' has no partner in df2, so its value2 is NaN. The suffixes only apply
# to overlapping column names; value1/value2 do not overlap.
joined_df = df1.join(df2, lsuffix='_left', rsuffix='_right')
print(joined_df)


In [None]:
Pivoting and Reshaping
Pivot Tables
You can create pivot tables using pivot_table.

# Pivot table
# One row per distinct 'city', holding the mean of 'age' for that city.
pivot = df.pivot_table(values='age', index='city', aggfunc='mean')
print(pivot)


In [None]:
Crosstabulation
You can create crosstabulations using crosstab.

# Crosstabulation
# Frequency table: rows are distinct names, columns are distinct cities.
# NOTE(review): an earlier cell renamed 'name' to 'full_name'; when the
# notebook is run top to bottom, df['name'] will presumably raise KeyError.
# The binding `crosstab` is a plain variable; pd.crosstab is unaffected.
crosstab = pd.crosstab(df['name'], df['city'])
print(crosstab)


In [None]:
Melt and Stack
You can reshape your DataFrame using melt, stack, and unstack.

# Melting DataFrame
# Wide -> long: 'name' is kept as the identifier, while 'age' and 'city' are
# folded into variable/value pairs.
# NOTE(review): same 'name' vs 'full_name' concern as the crosstab cell.
melted = pd.melt(df, id_vars=['name'], value_vars=['age', 'city'])
print(melted)

# Stacking and Unstacking
# stack() moves the column labels into an inner index level;
# unstack() pivots that inner level back into columns.
stacked = df.stack()
unstacked = stacked.unstack()
print(stacked)
print(unstacked)


In [None]:
Time Series
Date Range
You can create a range of dates using date_range.

# Ten consecutive calendar days; both end points are included
date_rng = pd.date_range('2020-01-01', '2020-01-10', freq='D')
print(date_rng)


In [None]:
Indexing with Dates
You can convert strings to datetime objects and use them as index.

# Converting to datetime
# NOTE(review): no earlier cell creates a 'date' column; this assumes the
# loaded data already has one — TODO confirm.
df['date'] = pd.to_datetime(df['date'])
# Make the parsed dates the index; the resample cells below rely on this.
df = df.set_index('date')



In [None]:
Resampling
You can resample time series data.

# Resampling data by week
# Requires the DatetimeIndex set in the previous cell. numeric_only=True
# restricts the mean to numeric columns: since pandas 2.0, averaging text
# columns such as 'city' raises TypeError instead of silently dropping them.
weekly_data = df.resample('W').mean(numeric_only=True)
print(weekly_data)


In [None]:
Time-based Grouping
You can group data by time periods.

# Grouping by month
# Requires a DatetimeIndex on df. numeric_only=True restricts the mean to
# numeric columns: since pandas 2.0, averaging text columns raises TypeError.
# NOTE(review): pandas >= 2.2 prefers the alias 'ME' (month end) over 'M';
# 'M' is kept so the cell still runs on older versions.
monthly_data = df.resample('M').mean(numeric_only=True)
print(monthly_data)


In [None]:
7. Input/Output Operations
Reading Data
CSV Files
You can read CSV files using read_csv.


# Reading a CSV file
# Path is relative to the working directory; rebinds df to the file contents.
df = pd.read_csv('data.csv')


In [None]:
Excel Files
You can read Excel files using read_excel.

# Reading an Excel file
# Path is relative to the working directory; rebinds df to the file contents.
# NOTE(review): reading .xlsx needs an Excel engine such as openpyxl installed.
df = pd.read_excel('data.xlsx')


In [None]:
JSON Files
You can read JSON files using read_json.

# Reading a JSON file
# Path is relative to the working directory; rebinds df to the parsed contents.
df = pd.read_json('data.json')


In [None]:
SQL Databases
You can read data from SQL databases using read_sql.

import sqlite3

# Reading from an SQL database
# sqlite3.connect creates 'database.db' if it does not exist yet.
# NOTE(review): the connection is left open on purpose; the to_sql cell
# further down reuses `conn`. Nothing in the notebook closes it.
conn = sqlite3.connect('database.db')
# 'table_name' is a placeholder; the query fails if no such table exists.
df = pd.read_sql('SELECT * FROM table_name', conn)


In [None]:
HTML Tables
You can read HTML tables using read_html.

# Reading an HTML table
# NOTE(review): example.com is a placeholder URL; this needs network access
# and an HTML parser library installed.
url = 'https://example.com/table.html'
# read_html returns a *list* of DataFrames, one per table found in the page.
dfs = pd.read_html(url)
print(dfs[0])


In [None]:
Writing Data
CSV Files
You can write data to CSV files using to_csv.

# Writing to a CSV file
# index=False leaves the row index out of the file; an existing 'output.csv'
# is overwritten.
df.to_csv('output.csv', index=False)


In [None]:
Excel Files
You can write data to Excel files using to_excel.

# Writing to an Excel file
# index=False leaves the row index out of the sheet.
# NOTE(review): writing .xlsx needs an Excel engine such as openpyxl installed.
df.to_excel('output.xlsx', index=False)


In [None]:
JSON Files
You can write data to JSON files using to_json.

# Writing to a JSON file
# Uses the default orientation for a DataFrame (column -> {index -> value}).
df.to_json('output.json')


In [None]:
SQL Databases
You can write data to SQL databases using to_sql.

# Writing to an SQL database
# Reuses the `conn` opened in the SQL reading cell. if_exists='replace' drops
# an existing 'table_name' before writing; index=False omits the row index.
df.to_sql('table_name', conn, if_exists='replace', index=False)


In [None]:
8. Data Cleaning and Preparation
String Operations
You can use the .str accessor for vectorized string operations.

# String operations
# .str applies the string method to every element; missing values stay missing.
df['city'] = df['city'].str.upper()


In [None]:
Handling Duplicates
You can detect and remove duplicates using duplicated and drop_duplicates.

# Detecting duplicates
# Boolean Series: True for a row that repeats an earlier row in every column
# (the first occurrence is marked False).
print(df.duplicated())

# Removing duplicates
# Keeps the first occurrence of each fully identical row.
df = df.drop_duplicates()


In [None]:
Replacing Data
You can replace values in your DataFrame using replace.

# Replacing values
# The match is on 'NEW YORK' in capitals because the String Operations cell
# upper-cased the 'city' column first.
df['city'] = df['city'].replace('NEW YORK', 'NYC')


In [None]:
Cutting and Binning
You can use cut and qcut to bin data into intervals.

# Binning data
# Three right-closed intervals: (0, 18] -> 'child', (18, 35] -> 'young_adult',
# (35, 60] -> 'adult'. Ages outside (0, 60] become NaN.
df['age_bin'] = pd.cut(df['age'], bins=[0, 18, 35, 60], labels=['child', 'young_adult', 'adult'])


In [None]:
Scaling Data
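You can normalize and standardize your data. The example below standardizes a column (zero mean, unit variance) with scikit-learn's StandardScaler.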

from sklearn.preprocessing import StandardScaler

# Standardizing data
# StandardScaler rescales a column to zero mean and unit variance, which is
# standardization rather than min-max normalization.
scaler = StandardScaler()
# Double brackets pass a 2-D single-column frame, the shape the scaler expects.
# 'age' is overwritten with the scaled floats.
df[['age']] = scaler.fit_transform(df[['age']])



In [None]:
9. Data Visualization with Pandas
Plotting
You can create various plots using the plot method.

# Line plot
df['age'].plot(kind='line')

# Bar plot
# Keep the returned Axes so the figure can be saved without pyplot: `plt` is
# only imported in the last cell of the notebook, so plt.savefig here would
# raise NameError on a fresh Restart & Run All.
ax = df['age'].plot(kind='bar')

# Save the plot
# Both plots above are drawn on the same current figure, which is what is saved.
ax.get_figure().savefig('plot.png')


In [None]:
Histograms
You can create histograms to show the distribution of data.
# Histogram
# Distribution of the 'age' values using the default number of bins.
df['age'].plot(kind='hist')


In [None]:
Box Plots
You can create box plots to show the distribution of data.
# Box plot
# Median, quartiles and outliers of the 'age' column.
df.boxplot(column=['age'])


In [None]:
Scatter Plots
You can create scatter plots to show the relationship between two variables.

# Scatter plot
# NOTE(review): assumes df has numeric 'age' and 'salary' columns. The
# 'salary' column added earlier was dropped again, and df has since been
# reloaded from files — TODO confirm the loaded table has it.
df.plot(kind='scatter', x='age', y='salary')


In [None]:
Visualizing Time Series Data
You can visualize time series data.

# Time series plot
# NOTE(review): no cell in this notebook creates a 'data' column. This
# presumably stands for whichever value column of a date-indexed frame is to
# be plotted — TODO confirm the intended column name.
df['data'].plot(kind='line')


In [None]:
10. Performance Optimization
Optimizing Memory Usage
You can reduce memory usage by optimizing data types.

# Optimizing data types
# int8 stores -128..127 in one byte per value.
# NOTE(review): the Scaling Data cell replaced 'age' with standardized floats;
# if that frame is still in use, this cast truncates the fractional part.
# The cast also fails if 'age' contains missing values.
df['age'] = df['age'].astype('int8')
# 'category' stores each distinct city once plus small integer codes, which
# saves memory when the values repeat.
df['city'] = df['city'].astype('category')


In [None]:
Efficient Operations
You can operate on whole columns at once with vectorized operations instead of looping over rows in Python.

# Vectorized operations
# The addition is applied to the whole column in one call rather than in a
# Python-level loop over rows.
df['age'] = df['age'] + 1


In [None]:
Parallel Processing
You can use Dask with Pandas for large datasets.

import dask.dataframe as dd

# Using Dask
# Split the pandas frame into 3 partitions that Dask can process separately.
ddf = dd.from_pandas(df, npartitions=3)
# compute() runs the (here empty) task graph and returns a pandas DataFrame.
result = ddf.compute()


In [None]:
11. Pandas Extensions
Styling DataFrames
You can style DataFrames for better visual representation.

# Styling DataFrame
# Colour only the numeric 'age' column. Without `subset` the function is
# applied to every cell, and comparing a text cell (e.g. a city) with 30
# raises TypeError when the Styler is rendered.
_styler = df.style
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever
# this pandas version provides.
_cellwise = getattr(_styler, 'map', None) or _styler.applymap
styled_df = _cellwise(
    lambda x: 'color: red' if x < 30 else 'color: green',
    subset=['age'],
)


In [None]:
Pandas Extension Types
You can create custom data types by subclassing ExtensionDtype and registering the subclass with pandas.

import pandas as pd
import pandas.api.extensions

# Register the custom data type
# register_extension_dtype is a class decorator; it adds the class to pandas'
# dtype registry and returns the class unchanged.
@pd.api.extensions.register_extension_dtype
class CustomDtype(pandas.api.extensions.ExtensionDtype):
    """Minimal custom dtype that is safe to register with pandas.

    A registered dtype must expose a string ``name``: pandas consults the
    registered dtypes when it resolves a dtype string, so a registered class
    without one breaks unrelated lookups such as ``astype('int8')``.
    """

    # String alias under which pandas resolves this dtype, e.g. dtype='custom'.
    name = 'custom'
    # Scalar type of the individual elements held by arrays of this dtype.
    type = object

    @classmethod
    def construct_array_type(cls):
        """Return the ExtensionArray class paired with this dtype.

        Raises:
            NotImplementedError: always; this tutorial defines the dtype only.
        """
        raise NotImplementedError(
            'CustomDtype is a dtype-only example; no ExtensionArray is provided'
        )


In [None]:
Integration with Other Libraries
You can integrate Pandas with NumPy, Matplotlib, and Seaborn for advanced data analysis and visualization.

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Using NumPy with Pandas
# One standard-normal draw per row. No seed is set, so the values differ on
# every run.
df['random'] = np.random.randn(len(df))

# Using Seaborn with Pandas
# Column names are resolved against `data`; barplot aggregates 'age' per
# 'city' (the mean by default).
sns.barplot(x='city', y='age', data=df)
plt.show()
