# Pandas 
Pandas is an essential Python library for data manipulation and analysis. It provides powerful data structures, such as DataFrame and Series, for working efficiently with structured data.

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Synthetic sales data for a retail store: one entry per product, held in a
# plain dictionary whose keys become the DataFrame's column names.
data = dict(
    Product=['Apples', 'Oranges', 'Bananas', 'Strawberries', 'Grapes', 'Blueberries'],
    Sales=[100, 150, 200, 140, 120, 150],
    Price=[1.00, 1.50, 0.80, 2.00, 1.75, 2.50],
)

# Turn the dictionary into a DataFrame (this is our dataset)
df = pd.DataFrame(data)

# Print the whole table
print(df)

        Product  Sales  Price
0        Apples    100   1.00
1       Oranges    150   1.50
2       Bananas    200   0.80
3  Strawberries    140   2.00
4        Grapes    120   1.75
5   Blueberries    150   2.50


# Loading and Saving Data
Pandas can load and save data in various formats. Although we created our dataset from scratch, in practice we would usually load it from a file. Here we first save the DataFrame to a CSV file and then read it back:

In [6]:
# Write the dataset to a CSV file, leaving the row index out of the file
df.to_csv(path_or_buf='sales_data.csv', index=False)

In [7]:
# Read the dataset back from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='sales_data.csv')

Pandas also supports many other formats, such as Excel (read_excel, to_excel), JSON (read_json, to_json), and SQL (read_sql, to_sql).

# Basic DataFrame Operations

## Viewing and inspecting data

In [22]:
# Get the column labels of the DataFrame (returned as an Index object)
df.columns

Index(['Product', 'Sales', 'Price'], dtype='object')

In [10]:
# Preview the first five rows of the DataFrame
df.head(n=5)

Unnamed: 0,Product,Sales,Price
0,Apples,100,1.0
1,Oranges,150,1.5
2,Bananas,200,0.8
3,Strawberries,140,2.0
4,Grapes,120,1.75


In [11]:
# Preview the last five rows of the DataFrame
df.tail(n=5)

Unnamed: 0,Product,Sales,Price
1,Oranges,150,1.5
2,Bananas,200,0.8
3,Strawberries,140,2.0
4,Grapes,120,1.75
5,Blueberries,150,2.5


In [16]:
# Get summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns
df.describe()

Unnamed: 0,Sales,Price
count,6.0,6.0
mean,143.333333,1.591667
std,33.862467,0.632785
min,100.0,0.8
25%,125.0,1.125
50%,145.0,1.625
75%,150.0,1.9375
max,200.0,2.5


In [18]:
# Print general information about the DataFrame: index range, column names,
# non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Product  6 non-null      object 
 1   Sales    6 non-null      int64  
 2   Price    6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes


## Selection, Filtering, and Sorting


In [24]:
# Pick out one column by its label; the result is a Series
sale_data = df.loc[:, 'Sales']
sale_data

0    100
1    150
2    200
3    140
4    120
5    150
Name: Sales, dtype: int64

In [27]:
# Pick out several columns at once by passing a list of labels; the result
# is a DataFrame
product_price = df.loc[:, ['Product', 'Price']]
product_price

Unnamed: 0,Product,Price
0,Apples,1.0
1,Oranges,1.5
2,Bananas,0.8
3,Strawberries,2.0
4,Grapes,1.75
5,Blueberries,2.5


In [36]:
# Keep only the rows whose price is strictly greater than 1.5, using a
# boolean mask
high_price = df[df['Price'].gt(1.5)]
high_price

Unnamed: 0,Product,Sales,Price
3,Strawberries,140,2.0
4,Grapes,120,1.75
5,Blueberries,150,2.5


In [39]:
# Order the rows by price, most expensive first (returns a new DataFrame)
sorted_data = df.sort_values('Price', ascending=False)
sorted_data

Unnamed: 0,Product,Sales,Price
5,Blueberries,150,2.5
3,Strawberries,140,2.0
4,Grapes,120,1.75
1,Oranges,150,1.5
0,Apples,100,1.0
2,Bananas,200,0.8


# Data Manipulation

## Add a new column

In [41]:
# Derive a new column as the element-wise product of two existing columns
df['Revenue'] = df['Sales'].mul(df['Price'])
df.head(n=3)

Unnamed: 0,Product,Sales,Price,Revenue
0,Apples,100,1.0,100.0
1,Oranges,150,1.5,225.0
2,Bananas,200,0.8,160.0


## Apply a function to a column

In [42]:
# Store the price rounded to zero decimal places in a new column
df['Price_rounded'] = df['Price'].round(decimals=0)
df.head(n=3)

Unnamed: 0,Product,Sales,Price,Revenue,Price_rounded
0,Apples,100,1.0,100.0,1.0
1,Oranges,150,1.5,225.0,2.0
2,Bananas,200,0.8,160.0,1.0


## Deleting a column

In [43]:
# Remove the helper column from the DataFrame in place
df.drop(columns='Price_rounded', inplace=True)
df.head(n=3)

Unnamed: 0,Product,Sales,Price,Revenue
0,Apples,100,1.0,100.0
1,Oranges,150,1.5,225.0
2,Bananas,200,0.8,160.0


## Renaming a column

In [46]:
# Relabel the 'Product' column as 'Item', modifying the DataFrame in place
df.rename({'Product': 'Item'}, axis='columns', inplace=True)
df.head(n=3)

Unnamed: 0,Item,Sales,Price,Revenue
0,Apples,100,1.0,100.0
1,Oranges,150,1.5,225.0
2,Bananas,200,0.8,160.0
