In [1]:
import pandas as pd
print(pd.__version__)

2.2.3


In [2]:
import pandas as pd
import numpy as np

# Create a Series from a list
data = [10, 20, 30, 40]
series = pd.Series(data, index=['a', 'b', 'c', 'd'])
print(series)
# Output:
# a    10
# b    20
# c    30
# d    40
# dtype: int64

a    10
b    20
c    30
d    40
dtype: int64


In [3]:
print(series['b'])  # Output: 20

20


In [4]:
# Create a DataFrame from a dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'London', 'Tokyo']
}
df = pd.DataFrame(data)
print(df)
# Output:
#       Name  Age     City
# 0    Alice   25  New York
# 1      Bob   30   London
# 2  Charlie   35    Tokyo

      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   35     Tokyo


In [5]:
print(df['Name'])  # Output: Series with names
print(df.loc[1])   # Output: Row for Bob
print(df['Age'][2])  # Output: 35

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object
Name       Bob
Age         30
City    London
Name: 1, dtype: object
35


In [7]:
# Example: Loading a CSV file
# Assuming 'data.csv' contains: Name,Age,City
# Alice,25,New York
# Bob,30,London
# Charlie,35,Tokyo

df = pd.read_csv('data.csv')
print(df.head())  # Display first 5 rows

data = {
    'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop'],
    'Price': [1000, 500, 300, 1200],
    'Quantity': [5, 10, 8, 3],
    'Category': ['Electronics', 'Electronics', 'Electronics', 'Electronics']
}
df = pd.DataFrame(data)
print(df)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop   1000         5  Electronics
# 1   Phone    500        10  Electronics
# 2  Tablet    300         8  Electronics
# 3  Laptop   1200         3  Electronics

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [8]:
# Add missing values to our DataFrame
df.loc[2, 'Price'] = np.nan
print(df)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop  1000.0        5  Electronics
# 1   Phone   500.0       10  Electronics
# 2  Tablet     NaN        8  Electronics
# 3  Laptop  1200.0        3  Electronics

# Fill missing values with the mean
df['Price'].fillna(df['Price'].mean(), inplace=True)
print(df)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop  1000.0        5  Electronics
# 1   Phone   500.0       10  Electronics
# 2  Tablet   900.0        8  Electronics
# 3  Laptop  1200.0        3  Electronics

      Name  Age      City  Price
0    Alice   25  New York    NaN
1      Bob   30    London    NaN
2  Charlie   35     Tokyo    NaN
      Name  Age      City  Price
0    Alice   25  New York    NaN
1      Bob   30    London    NaN
2  Charlie   35     Tokyo    NaN


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price'].fillna(df['Price'].mean(), inplace=True)


In [9]:
# Check for duplicate rows
print(df.duplicated().sum())  # Output: 0

# Add a duplicate row
df = pd.concat([df, df.loc[[0]]], ignore_index=True)
print(df)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop  1000.0        5  Electronics
# 1   Phone   500.0       10  Electronics
# 2  Tablet   900.0        8  Electronics
# 3  Laptop  1200.0        3  Electronics
# 4  Laptop  1000.0        5  Electronics

# Remove duplicates
df = df.drop_duplicates()
print(df)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop  1000.0        5  Electronics
# 1   Phone   500.0       10  Electronics
# 2  Tablet   900.0        8  Electronics
# 3  Laptop  1200.0        3  Electronics

0
      Name  Age      City  Price
0    Alice   25  New York    NaN
1      Bob   30    London    NaN
2  Charlie   35     Tokyo    NaN
3    Alice   25  New York    NaN
      Name  Age      City  Price
0    Alice   25  New York    NaN
1      Bob   30    London    NaN
2  Charlie   35     Tokyo    NaN


In [10]:
# Filter rows where Price > 600
high_price = df[df['Price'] > 600]
print(high_price)
# Output:
#   Product  Price  Quantity   Category
# 0  Laptop  1000.0        5  Electronics
# 2  Tablet   900.0        8  Electronics
# 3  Laptop  1200.0        3  Electronics

Empty DataFrame
Columns: [Name, Age, City, Price]
Index: []


In [11]:
# Sort by Price (descending)
sorted_df = df.sort_values(by='Price', ascending=False)
print(sorted_df)
# Output:
#   Product  Price  Quantity   Category
# 3  Laptop  1200.0        3  Electronics
# 0  Laptop  1000.0        5  Electronics
# 2  Tablet   900.0        8  Electronics
# 1   Phone   500.0       10  Electronics

      Name  Age      City  Price
0    Alice   25  New York    NaN
1      Bob   30    London    NaN
2  Charlie   35     Tokyo    NaN


In [12]:
# Group by Product and calculate mean Price
grouped = df.groupby('Product')['Price'].mean()
print(grouped)
# Output:
# Product
# Laptop    1100.0
# Phone      500.0
# Tablet     900.0
# Name: Price, dtype: float64

KeyError: 'Product'

In [13]:
# Create another DataFrame
data2 = pd.DataFrame({
    'Product': ['Laptop', 'Phone'],
    'Rating': [4.5, 4.0]
})

# Merge with original DataFrame
merged_df = pd.merge(df, data2, on='Product', how='left')
print(merged_df)
# Output:
#   Product  Price  Quantity   Category  Rating
# 0  Laptop  1000.0        5  Electronics     4.5
# 1   Phone   500.0       10  Electronics     4.0
# 2  Tablet   900.0        8  Electronics     NaN
# 3  Laptop  1200.0        3  Electronics     4.5

KeyError: 'Product'

In [14]:
# Create a sales DataFrame
sales_data = pd.DataFrame({
    'Date': ['2025-01-01', '2025-01-01', '2025-01-02', '2025-01-02'],
    'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop'],
    'Price': [1000, 500, 300, 1200],
    'Quantity': [2, 5, 3, 1]
})

# Calculate total revenue per sale
sales_data['Revenue'] = sales_data['Price'] * sales_data['Quantity']
print(sales_data)
# Output:
#         Date Product  Price  Quantity  Revenue
# 0  2025-01-01  Laptop   1000         2     2000
# 1  2025-01-01   Phone    500         5     2500
# 2  2025-01-02  Tablet    300         3      900
# 3  2025-01-02  Laptop   1200         1     1200

# Group by Date to find total revenue
daily_revenue = sales_data.groupby('Date')['Revenue'].sum()
print(daily_revenue)
# Output:
# Date
# 2025-01-01    4500
# 2025-01-02    2100
# Name: Revenue, dtype: int64

# Filter high-revenue sales (>1000)
high_revenue = sales_data[sales_data['Revenue'] > 1000]
print(high_revenue)
# Output:
#         Date Product  Price  Quantity  Revenue
# 0  2025-01-01  Laptop   1000         2     2000
# 1  2025-01-01   Phone    500         5     2500

         Date Product  Price  Quantity  Revenue
0  2025-01-01  Laptop   1000         2     2000
1  2025-01-01   Phone    500         5     2500
2  2025-01-02  Tablet    300         3      900
3  2025-01-02  Laptop   1200         1     1200
Date
2025-01-01    4500
2025-01-02    2100
Name: Revenue, dtype: int64
         Date Product  Price  Quantity  Revenue
0  2025-01-01  Laptop   1000         2     2000
1  2025-01-01   Phone    500         5     2500
3  2025-01-02  Laptop   1200         1     1200
