# Module: Pandas Assignments
## Lesson: Pandas



In [1]:
# Assignment 1: DataFrame Creation and Indexing

#* 1. Create a Pandas DataFrame with 4 columns and 6 rows filled with random integers. Set the index to be the first column.

import pandas as pd
import numpy as np

data = np.random.randint(1, 10, (6,4))
index = data[:, 0:1]
df = pd.DataFrame(data, index=index.flatten())
df

Unnamed: 0,0,1,2,3
7,7,6,1,8
6,6,9,8,7
4,4,1,6,2
2,2,7,9,4
2,2,1,7,3
6,6,9,2,2


In [2]:
#* 2. Create a Pandas DataFrame with columns 'A', 'B', 'C' and index 'X', 'Y', 'Z'. Fill the DataFrame with random integers and access the element at row 'Y' and column 'B'

data = np.random.randint(1, 10, (3,3))
df = pd.DataFrame(data, columns=['A', 'B', 'C'], index=['X', 'Y', 'Z'])
print(df)
df.loc['Y', 'B']

   A  B  C
X  7  5  5
Y  7  4  9
Z  4  4  9


np.int32(4)

In [3]:
# Assignment 2: DataFrame Operations

#* 1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Add a new column that is the product of the first two columns.

np.random.seed(42)
data = np.random.randint(1, 15, (5,3))
df = pd.DataFrame(data)
df['product'] = (df[0] * df[1])
print(df)

    0  1   2  product
0   7  4  13       28
1  11  8  13       88
2   5  7  10       35
3   3  7  11       21
4  11  8   5       88


In [4]:
#* Create a Pandas DataFrame with 3 columns and 4 rows filled with random integers. Compute the column wise and row-wise sum.

df = pd.DataFrame(np.random.randint(1,9, (3,3)), columns=['A', 'B', 'C'])

print(df)

print(f'Row-wise sum:\n', np.sum(df, axis=0))
print(f'Column-wise sum:\n' , np.sum(df, axis=1))


   A  B  C
0  4  8  8
1  3  6  5
2  2  8  4
Row-wise sum:
 A     9
B    22
C    17
dtype: int64
Column-wise sum:
 0    20
1    14
2    14
dtype: int64


In [5]:
# Assignment 3

#* 1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Introduce some NaN values. Fill the NaN values with the mean of the respective columns

np.random.seed(42)
df = pd.DataFrame(np.random.randint(1, 10, (5, 3)), columns=['A', 'B', 'C'])
df.loc[2, 'A'] = None
df.loc[4, 'B'] = None
df.loc[3, 'C'] = None
for column in df:
    df[column].fillna(df[column].mean(), inplace=True)

print(df['A'].mean(),
df['B'].mean(),
df['C'].mean())

df


4.75 6.75 5.25


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


Unnamed: 0,A,B,C
0,7.0,4.0,8.0
1,5.0,7.0,3.0
2,4.75,8.0,5.0
3,4.0,8.0,5.25
4,3.0,6.75,5.0


In [6]:
#* 2. Create a Pandas DataFrame with 4 columns and 6 rows filled with random integers. Introduce some NaN values. Drop the rows with any NaN values.

df = pd.DataFrame(np.random.randint(1, 9, (6, 4)), columns=['A', 'B', 'C', 'D'])

df.loc[2, 'A'] = None
df.loc[4, 'B'] = None
df.loc[3, 'C'] = None
df.loc[6, 'D'] = None

df.dropna(inplace=True, axis=0)

df

Unnamed: 0,A,B,C,D
0,2.0,8.0,4.0,6.0
1,6.0,2.0,8.0,4.0
5,2.0,8.0,4.0,4.0


In [7]:
# Assignment 4: Data Aggregation

# 1. Create a Pandas DataFrame with 2 columns: 'Category' and 'Value'. Fill the 'Category' column with random categories ('A', 'B', 'C') and the 'Value' column with random integers. Group the DataFrae by 'Category' and compute the sum and mean of 'Value' for each column.

df = pd.DataFrame({
    'Category': ['A', 'B', 'C', 'D', 'E'],
    'Value': [32, 43, 23, 41, 54]
})

mean = df.groupby('Category')['Value'].mean()
sum = df.groupby('Category')['Value'].sum()

print(mean, sum)

df

Category
A    32.0
B    43.0
C    23.0
D    41.0
E    54.0
Name: Value, dtype: float64 Category
A    32
B    43
C    23
D    41
E    54
Name: Value, dtype: int64


Unnamed: 0,Category,Value
0,A,32
1,B,43
2,C,23
3,D,41
4,E,54


In [17]:
#* 2. Create a Pandas DataFrame with 3 columns. 'product', 'category', and 'sales'. Fill the DataFrame with random data. Group the DataFrame by Category and compute the total sales for each category.

#! sample list for random selection
products = ["Laptop", "Phone", "Tablet", "Headphones", "Monitor", "Keyboard", "Mouse"]
categories = ["Electronics", "Accessories", "Groceries", "Fitness"]

# Create a DataFrame
df = pd.DataFrame({
    'products': np.random.choice(products, size=10),
    'category': np.random.choice(categories, size=10),
    'sales': np.random.randint(100, 5000, size=10)
})

df.groupby('category')['sales'].sum()

category
Accessories    8716
Electronics    8557
Fitness        8470
Groceries      3891
Name: sales, dtype: int32

In [18]:
# Assignment 5: Merging DataFrame

#* 1. Create two Pandas DataFrames with a common column. Merge the DataFrames using common column

# First DataFrame
df1 = pd.DataFrame({
    "product_id": [1, 2, 3, 4],
    "product": ["Laptop", "Phone", "Tablet", "Monitor"]
})

# Second DataFrame
df2 = pd.DataFrame({
    "product_id": [1, 2, 3, 5],
    "sales": [1200, 850, 600, 300]
})

merged_df = pd.merge(df1, df2, on="product_id")

print(merged_df)

   product_id product  sales
0           1  Laptop   1200
1           2   Phone    850
2           3  Tablet    600


In [None]:
#* 2. Create two pandas DataFrame with different columns. Concatenate the DataFrames along the rows and along the columns

# First DataFrame (products info)
df1 = pd.DataFrame({
    "product": ["Laptop", "Phone", "Tablet"],
    "price": [80000, 40000, 25000]
})

# Second DataFrame (customers info)
df2 = pd.DataFrame({
    "customer": ["Amit", "Riya", "John"],
    "city": ["Delhi", "Mumbai", "Pune"]
})

row_concat = pd.concat([df1, df2], axis=0)
print(row_concat)

col_concat = pd.concat([df1, df2], axis=1)
print(col_concat)

  product    price customer    city
0  Laptop  80000.0      NaN     NaN
1   Phone  40000.0      NaN     NaN
2  Tablet  25000.0      NaN     NaN
0     NaN      NaN     Amit   Delhi
1     NaN      NaN     Riya  Mumbai
2     NaN      NaN     John    Pune
  product  price customer    city
0  Laptop  80000     Amit   Delhi
1   Phone  40000     Riya  Mumbai
2  Tablet  25000     John    Pune


In [2]:
# Assignment 6

#* 1. Create a Pandas DataFrame with a datetie index and one column filled with random integers. Resample the DataFrame to compute the monthly mean of the values.

import pandas as pd
import numpy as np

# Create a Daily DateTimeIndex for 6 months (approx 180 days)
start_date = "2025-01-01"
periods = 180
index = pd.date_range(start=start_date, periods=periods, freq='D')

# Generate the DataFrame
random_data = np.random.randint(10, 100, size=periods)

df = pd.DataFrame(data=random_data, index=index, columns=['Value'])

print(df.head())

# Resample the DataFrame to compute monthly Mean

# The .resample() method is used for frequency conversion/aggregation.
# 'M' stands for Month-End frequency
# .mean() is the aggregation function applied to each month's data/

monthly_mean_df = df.resample('M').mean()

print(f'\n ### Resampled Monthly Mean DataFrame ###')
print(monthly_mean_df)

            Value
2025-01-01     86
2025-01-02     59
2025-01-03     12
2025-01-04     61
2025-01-05     26

 ### Resampled Monthly Mean DataFrame ###
                Value
2025-01-31  48.645161
2025-02-28  57.285714
2025-03-31  51.000000
2025-04-30  51.133333
2025-05-31  49.193548
2025-06-30  66.551724


  monthly_mean_df = df.resample('M').mean()


In [8]:
#* 2. Create a Pandas DataFrame with a datetimeIndex and ranging from '2021-01-01' to '2021-12-31' and one column filled with random integers. Compute the rolling mea with window of 7 days.

# Create a DatatimeIndex with range 
start_date = '2021-01-01'
end_date = '2021-12-31'

index = pd.date_range(start=start_date, end=end_date, freq='D')

# Create a DataFrame
random_data = np.random.randint(1, 365, size=len(index))
df = pd.DataFrame(data=random_data, index=index, columns=['Value'])
print(df.head())

# Compute the 7 Day rolling mean

df['rolling_mean'] = df['Value'].rolling(window='7D', min_periods=7).mean()
df.head(20)

            Value
2021-01-01     83
2021-01-02    135
2021-01-03    305
2021-01-04     45
2021-01-05    165


Unnamed: 0,Value,rolling_mean
2021-01-01,83,
2021-01-02,135,
2021-01-03,305,
2021-01-04,45,
2021-01-05,165,
2021-01-06,299,
2021-01-07,284,188.0
2021-01-08,294,218.142857
2021-01-09,181,224.714286
2021-01-10,105,196.142857


In [13]:
# Assignemt 7

#* 1. Create a Pandas DataFrame with a MultiIndex (hierarchical index), Peform some basic indexing and slicing opeations one the MultiIndex DataFrame

# Define the labels for the two index levels
level1 = ['North', 'South', 'East', 'West']
level2 = [2021, 2022, 2023, 2024]

# Create the MultiIndex object 
index = pd.MultiIndex.from_product([level1, level2], names=['Region', 'Year'])

# Create a DataFrame
df = pd.DataFrame(np.random.randint(10, 100, size=(16, 2)),  index=index, columns=['Sales', 'Profit'])
print(df)

# Indexing 
df.loc['North']
df.loc['East', 2023]


             Sales  Profit
Region Year               
North  2021     72      61
       2022     51      81
       2023     57      56
       2024     34      30
South  2021     60      61
       2022     45      64
       2023     58      73
       2024     92      47
East   2021     64      27
       2022     95      47
       2023     39      74
       2024     97      88
West   2021     32      73
       2022     90      40
       2023     61      78
       2024     63      26


Sales     39
Profit    74
Name: (East, 2023), dtype: int32

In [25]:
#* 2. Create a Pandas DataFrame with MultiIndex consisting 'Category' and 'SubCategory'. Fill the DataFrame with random data and compute the sum of values for each 'Category' and 'SubCategory'ArithmeticError

# Create levels with labels
level1 = ['Electronics', 'Fitness']
levek2 = ['Gadgets', 'Home_appliances']

# Create index
index = pd.MultiIndex.from_product([level1, levek2], names=['Category', 'Subcategory'])

# CREATE A DATAFRAME
df = pd.DataFrame(np.random.randint(1, 500, size=(4,2)), index=index, columns=['Price', 'Availability_count'])
print(df)

print(f'Sum of Electronics Gadgets: {df.loc['Electronics', 'Gadgets']['Price'].sum()}')

print(f'Sum of Electronics Home appliances: {df.loc['Electronics', 'Home_appliances'].sum()}')

print(f'Sum of Fitness Gadgets: {df.loc['Fitness', 'Gadgets'].sum()}')

print(f'Sum of Home appliances Gadgets: {df.loc['Fitness', 'Home_appliances'].sum()}')

                             Price  Availability_count
Category    Subcategory                               
Electronics Gadgets            139                 187
            Home_appliances    145                 302
Fitness     Gadgets            294                 421
            Home_appliances    439                  33
Sum of Electronics Gadgets: 139
Sum of Electronics Home appliances: 447
Sum of Fitness Gadgets: 715
Sum of Home appliances Gadgets: 472


In [27]:
# Assignment 8: Pivot Tables

#* 1. Create a Pandas DataFrame with columns 'Date', 'Category', and 'Value'. Create a pivot table to compute the sum of 'Value' for each 'Category' by 'Date'.

import pandas as pd
import numpy as np

# 1. Prepare Data
# Create a date range for the 'Date' column
dates = pd.date_range('2025-01-01', periods=10)

# Create a list of categorical values (repeating for 10 entries)
categories = ['A', 'B'] * 5 

# Create random integer data for the 'Value' column
values = np.random.randint(10, 100, size=10)

# 2. Create the DataFrame using a dictionary
data = {
    'Date': dates,
    'Category': categories,
    'Value': values
}

df = pd.DataFrame(data)

print("### Created Pandas DataFrame ###")
print(df)

# Pivot
pivot_table = pd.pivot_table(
    df, 
    values='Value',
    index='Date',
    columns='Category',
    aggfunc='sum'
)

print(pivot_table)

### Created Pandas DataFrame ###
        Date Category  Value
0 2025-01-01        A     62
1 2025-01-02        B     64
2 2025-01-03        A     70
3 2025-01-04        B     51
4 2025-01-05        A     82
5 2025-01-06        B     25
6 2025-01-07        A     75
7 2025-01-08        B     57
8 2025-01-09        A     27
9 2025-01-10        B     61
Category       A     B
Date                  
2025-01-01  62.0   NaN
2025-01-02   NaN  64.0
2025-01-03  70.0   NaN
2025-01-04   NaN  51.0
2025-01-05  82.0   NaN
2025-01-06   NaN  25.0
2025-01-07  75.0   NaN
2025-01-08   NaN  57.0
2025-01-09  27.0   NaN
2025-01-10   NaN  61.0


In [30]:
#* 2. Create a Pandas DataFrame with columns 'Year', 'Quarter', and 'Revenue'. Create a pivot table to compute the mean 'Revenue' for each 'Quarter' by 'Year'.

import pandas as pd
import numpy as np

# 1. Prepare Data for Hierarchical Structure
# Create lists for Year and Quarter that repeat in the desired pattern
years = [2023] * 4 + [2024] * 4  # Two years
quarters = [1, 2, 3, 4] * 2       # Four quarters per year

# Generate random revenue data (e.g., between 500 and 1500 million)
# Total length should match the length of years/quarters (8 periods)
revenue = np.random.randint(500, 1500, size=8)

# 2. Create the DataFrame using a dictionary
data = {
    'Year': years,
    'Quarter': quarters,
    'Revenue': revenue
}

df_revenue = pd.DataFrame(data)

print("### Quarterly Revenue DataFrame ###")
print(df_revenue)

pivot_table = pd.pivot_table(
    df_revenue, 
    values='Revenue',
    index='Year',
    columns='Quarter',
    aggfunc='mean'
)

print(pivot_table)

### Quarterly Revenue DataFrame ###
   Year  Quarter  Revenue
0  2023        1      985
1  2023        2     1059
2  2023        3     1369
3  2023        4      970
4  2024        1      990
5  2024        2     1336
6  2024        3     1337
7  2024        4     1176
Quarter      1       2       3       4
Year                                  
2023     985.0  1059.0  1369.0   970.0
2024     990.0  1336.0  1337.0  1176.0


In [45]:
# Assignment 9: Applying Functions

#* 1. Create a Pandas DataFrame with 3 columns and 5 rows filled with random integers. Apply a function that doubles the values of the DataFrame.

np.random.seed(42)
df = pd.DataFrame(np.random.randint(1, 10, (5, 3)), columns=['A', 'B', 'C'])

print(df)

for column in df:
    df[column] = df[column].apply((lambda x: x * x for x in column), axis=0)

df

   A  B  C
0  7  4  8
1  5  7  3
2  7  8  5
3  4  8  8
4  3  6  5


Unnamed: 0,A,B,C
0,49,16,64
1,25,49,9
2,49,64,25
3,16,64,64
4,9,36,25


In [65]:
#* 2. Create a Pandas DataFrame with 3 columns and 6 rows filled with random integers. Apply the lambda function to create a new column that is the sum of the existing columns.

np.random.seed(42)
df = pd.DataFrame(np.random.randint(1, 100, (6, 3)), columns=['A', 'B', 'C'])

print(df)

df['sum_of_columns'] = df.apply(lambda col: col.sum(), axis=1)
df

    A   B   C
0  52  93  15
1  72  61  21
2  83  87  75
3  75  88  24
4   3  22  53
5   2  88  30


Unnamed: 0,A,B,C,sum_of_columns
0,52,93,15,160
1,72,61,21,154
2,83,87,75,245
3,75,88,24,187
4,3,22,53,78
5,2,88,30,120


In [67]:
# Assignment 10: Working with Text Data

#* 1. Create a Pandas Series with 5 random text strings. convert all the strings to uppercase.

data = ['Hello', 'World', 'I', 'Am', 'Prince']
df = pd.Series(data)
df.str.upper()

0     HELLO
1     WORLD
2         I
3        AM
4    PRINCE
dtype: object

In [77]:
#* 2. Create a Pandas Series with 5 random text strings. Extract the first three characters of each string

df.str[:3]

0    Hel
1    Wor
2      I
3     Am
4    Pri
dtype: object