### Q1. List any five functions of the pandas library with execution.

In [None]:
### read_csv(): reads a CSV file into a DataFrame
# Requires a file named 'data.csv' reachable from the notebook's working directory;
# the path below is relative, and the file is not created anywhere in this notebook.

import pandas as pd
df = pd.read_csv('data.csv')
# Prints the whole frame as plain text.
print(df)

In [4]:
### head(): returns the first few rows of a DataFrame
import pandas as pd
# Two-column example frame. It has only three rows, so head() shows every row here.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.head())

   A  B
0  1  4
1  2  5
2  3  6


In [6]:
### info(): prints a concise summary of a DataFrame

import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# info() writes the summary itself (index range, per-column non-null counts and
# dtypes, memory usage), which is why the call is not wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
dtypes: int64(2)
memory usage: 176.0 bytes


In [7]:
### groupby(): groups a DataFrame by one or more columns and performs aggregation operations

import pandas as pd
# The key 2 appears twice in 'A', so its two 'B' values (5 and 5) fall into one group.
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 5, 5, 6]})
grouped = df.groupby('A')
# Sum 'B' within each group; the distinct 'A' values become the index of the result.
print(grouped.sum())

    B
A    
1   4
2  10
3   6


In [8]:
### pivot_table(): creates a pivot table from a DataFrame
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [4, 5, 5, 6], 'C': [7, 8, 9, 10]})
# Rows come from 'A', columns from 'B', cell values from 'C'. The pair (A=2, B=5)
# occurs twice (C = 8 and 9) and appears as 8.5 in the output, i.e. duplicates are
# averaged; (A, B) combinations that never occur are shown as NaN.
pivot = pd.pivot_table(df, values='C', index='A', columns='B')
print(pivot)



B    4    5     6
A                
1  7.0  NaN   NaN
2  NaN  8.5   NaN
3  NaN  NaN  10.0


### Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [9]:
import pandas as pd

def reindex_df(df, start=1, step=2):
    """Replace the index of ``df`` with evenly spaced integer labels.

    With the default arguments the rows are labelled 1, 3, 5, ...

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to relabel. Its index is overwritten in place.
    start : int, default 1
        Label given to the first row.
    step : int, default 2
        Difference between consecutive labels. Must be non-zero
        (``range`` raises ``ValueError`` for a zero step).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the new index.
    """
    # One label per row: start, start + step, ..., start + step * (len(df) - 1).
    df.index = range(start, start + step * len(df), step)
    return df

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
print(reindex_df(df))

   A  B  C
1  1  4  7
3  2  5  8
5  3  6  9


### Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column.

In [10]:
import pandas as pd

def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    The rows are visited one at a time with ``iterrows()`` because the exercise
    asks for explicit iteration. If ``df`` has fewer than three rows, only the
    rows that exist are added; an empty frame prints 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'Values' column.

    Returns
    -------
    None
        The total is printed, not returned.
    """
    # Named `total` rather than `sum` so the built-in sum() is not shadowed.
    total = 0
    # The row label is not needed, only the row contents.
    for _, row in df.head(3).iterrows():
        total += row['Values']
    print(total)

df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
sum_first_three_values(df)

60


### Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [11]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' entry.

    A word is a whitespace-separated token, i.e. what ``str.split()`` with no
    arguments produces. Entries that are not strings (for example missing
    values) count as 0 words instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Text' column. The new column is added in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Word_Count' added.
    """
    def _count_words(text):
        # NaN/None have no .split(); anything that is not a string has no words.
        return len(text.split()) if isinstance(text, str) else 0

    df['Word_Count'] = df['Text'].apply(_count_words)
    return df

df = pd.DataFrame({'Text': ['hello world', 'this is a test', 'foo bar baz']})
print(add_word_count_column(df))

             Text  Word_Count
0     hello world           2
1  this is a test           4
2     foo bar baz           3


### Q5. How are DataFrame.size() and DataFrame.shape() different?

`DataFrame.size` returns the total number of elements in the DataFrame (rows × columns), while `DataFrame.shape` returns a tuple giving its dimensions as (number of rows, number of columns). Both are attributes rather than methods, so they are accessed without parentheses, as in the cell below.

In [12]:
import pandas as pd
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.size)  # Output: 6
print(df.shape)  # Output: (3, 2)

6
(3, 2)


### Q6. Which function of pandas do we use to read an excel file?

We use the read_excel() function to read an Excel file into a Pandas DataFrame.

In [None]:
import pandas as pd
df = pd.read_excel('example.xlsx')
print(df)

### Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.

In [13]:
import pandas as pd

def extract_username(df):
    """Add a 'Username' column holding the part of each 'Email' before the '@'.

    An address without an '@' is copied unchanged. Missing entries stay missing
    in 'Username' instead of raising ``AttributeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'Email' column of strings. The new column is added
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'Username' added.
    """
    # The vectorised .str accessor propagates missing values, whereas calling
    # str.split() on each element fails on NaN/None. n=1 stops splitting after
    # the first '@' because only the leading piece is used.
    df['Username'] = df['Email'].str.split('@', n=1).str[0]
    return df

df = pd.DataFrame({'Email': ['john.doe@example.com', 'jane.doe@example.com']})
print(extract_username(df))

                  Email  Username
0  john.doe@example.com  john.doe
1  jane.doe@example.com  jane.doe


### Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10.

In [14]:
import pandas as pd

def select_rows(df):
    """Return the rows where 'A' is greater than 5 and 'B' is less than 10.

    Both comparisons are strict. ``df`` itself is not modified; the matching
    rows are returned with their original index labels.
    """
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    return df[a_above_five & b_below_ten]

df = pd.DataFrame({'A': [3, 8, 6, 2], 'B': [5, 2, 9, 3], 'C': [1, 7, 4, 5]})
print(select_rows(df))

   A  B  C
1  8  2  7
2  6  9  4


### Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [15]:
import pandas as pd

def calculate_stats(df):
    """Print the mean, median and standard deviation of the 'Values' column.

    The standard deviation is the value returned by ``Series.std()`` with its
    default arguments. Nothing is returned; the three figures are written on a
    single line.
    """
    values = df['Values']
    mean, median, std = values.mean(), values.median(), values.std()
    print(f'Mean: {mean}, Median: {median}, Standard Deviation: {std}')

df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
calculate_stats(df)

Mean: 30.0, Median: 30.0, Standard Deviation: 15.811388300841896


### Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame.

In [16]:
import pandas as pd

def calculate_moving_average(df, window=7):
    """Add a 'MovingAverage' column with the trailing mean of 'Sales'.

    The window counts rows, not calendar time: each value is the mean of the
    current row and the ``window - 1`` rows before it. This equals "the past
    7 days" only when ``df`` holds exactly one row per day in chronological
    order; the 'Date' column itself is not consulted. Rows that do not yet have
    a full window behind them are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'Sales' column. The new column is added in
        place.
    window : int, default 7
        Number of consecutive rows averaged for each value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'MovingAverage' added.
    """
    df['MovingAverage'] = df['Sales'].rolling(window=window).mean()
    return df

df = pd.DataFrame({'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08'], 
                   'Sales': [10, 20, 30, 40, 50, 60, 70, 80]})
print(calculate_moving_average(df))

         Date  Sales  MovingAverage
0  2023-01-01     10            NaN
1  2023-01-02     20            NaN
2  2023-01-03     30            NaN
3  2023-01-04     40            NaN
4  2023-01-05     50            NaN
5  2023-01-06     60            NaN
6  2023-01-07     70           40.0
7  2023-01-08     80           50.0


### Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g. Monday, Tuesday) corresponding to each date in the 'Date' column.

In [17]:
import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the day name of each entry in 'Date'.

    'Date' is parsed with ``pd.to_datetime`` and the parsed values are written
    back to the column, so ``df`` is modified in two ways: 'Date' becomes a
    datetime column and 'Weekday' is added. The same ``df`` object is returned.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    df['Weekday'] = parsed_dates.dt.day_name()
    return df

df = pd.DataFrame({'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']})
print(add_weekday_column(df))

        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


### Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [18]:
import pandas as pd

def select_date_range(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows whose 'Date' falls on a day from start_date to end_date.

    Both bounds are inclusive calendar days. A timestamp anywhere within the
    end day (for example 2023-01-31 18:30) is selected, not only midnight.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Date' column of dates or timestamps. The column is
        converted to datetime in place, so ``df`` is modified by this call.
    start_date : str, default '2023-01-01'
        First day of the range (date only, no time component).
    end_date : str, default '2023-01-31'
        Last day of the range (date only, no time component).

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    # Compare on the calendar day: checking the raw timestamps against the end
    # date would stop at midnight and drop every later row on the last day.
    day = df['Date'].dt.normalize()
    return df[(day >= start_date) & (day <= end_date)]

df = pd.DataFrame({'Date': ['2022-12-31', '2023-01-01', '2023-01-02', '2023-01-31', '2023-02-01']})
print(select_date_range(df))

        Date
1 2023-01-01
2 2023-01-02
3 2023-01-31


### Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

The first and foremost necessary library that needs to be imported is pandas. You can import it using the following code:

In [19]:
import pandas as pd

This imports the pandas library and assigns it the alias pd, which is commonly used in Pandas code.