# Q1. List any five functions of the pandas library with execution.

__Ans. :__<br>

1. __dropna():__ This function is used to remove missing or __NaN__ values from a DataFrame.

In [1]:
import pandas as pd

# Build a frame in which each column has one missing entry, then keep only
# the rows where every column holds a value.
df = pd.DataFrame(
    {
        'A': [1, 2, None, 4],
        'B': [None, 6, 7, 8],
    }
).dropna(axis=0, how='any')

# Show the rows that survived the filter.
print(df)

     A    B
1  2.0  6.0
3  4.0  8.0


2. __groupby():__ This function is used to group rows in a DataFrame by one or more columns, and apply an aggregation function to each group.

In [2]:
import pandas as pd

# Five rows split across two categories.
df = pd.DataFrame(
    {
        'Category': ['A', 'A', 'B', 'B', 'B'],
        'Value': [1, 2, 3, 4, 5],
    }
)

# One row per category, holding the average of that category's values.
grouped = df.groupby(by='Category').agg('mean')

# Show the per-category averages.
print(grouped)

          Value
Category       
A           1.5
B           4.0


3. __merge():__ This function is used to merge two DataFrames based on a common column.

In [3]:
import pandas as pd

# Two frames that share the key column 'A' (keys 2 and 3 appear in both).
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [2, 3, 4], 'C': [7, 8, 9]})

# Inner join on 'A': only keys present in both frames are kept.
merged = df1.merge(df2, how='inner', on='A')

# Show the joined rows.
print(merged)

   A  B  C
0  2  5  7
1  3  6  8


4. __pivot_table():__ This function is used to create a pivot table from a DataFrame.

In [4]:
import pandas as pd

# Five rows split across two categories.
df = pd.DataFrame(
    {
        'Category': ['A', 'A', 'B', 'B', 'B'],
        'Value': [1, 2, 3, 4, 5],
    }
)

# Pivot on 'Category', summarising 'Value' by its mean within each category.
pivot = pd.pivot_table(df, values='Value', index='Category', aggfunc='mean')

# Show the resulting table.
print(pivot)

          Value
Category       
A           1.5
B           4.0


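5. __DataFrame():__ This function (the class constructor, called as __pd.DataFrame()__) is used to create a DataFrame, for example from a dictionary that maps column names to lists of values.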

In [6]:
import pandas as pd

# Column name -> list of values; every list has one entry per person.
data = {
    'Name': ['John', 'Sara', 'Bob', 'Jenny'],
    'Age': [25, 28, 22, 30],
    'City': ['New York', 'London', 'Paris', 'Sydney'],
}

# Each dictionary key becomes a column of the frame.
df = pd.DataFrame.from_dict(data)
print(df)

    Name  Age      City
0   John   25  New York
1   Sara   28    London
2    Bob   22     Paris
3  Jenny   30    Sydney


# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [7]:
import pandas as pd

def reindex_dataframe(df, start=1, step=2):
    """Return a copy of ``df`` with an evenly spaced integer index.

    The new index is ``start, start + step, start + 2 * step, ...`` with one
    label per row. With the defaults this is ``1, 3, 5, ...``.

    Args:
        df: The DataFrame to re-index. It is not modified.
        start: Label given to the first row.
        step: Difference between consecutive labels (must be non-zero).

    Returns:
        A new DataFrame with the same rows and columns and the new index.
    """
    new_index = pd.RangeIndex(start=start, stop=start + step * len(df), step=step)
    # Work on a copy so the caller's frame keeps its original index.
    result = df.copy()
    result.index = new_index
    return result

# Sample frame with the three columns named in the exercise.
data = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
df = pd.DataFrame.from_dict(data)

# Print the input first, then the re-indexed result.
new_df = reindex_dataframe(df)
print(df, new_df, sep='\n')

   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9
   A  B  C
1  1  4  7
3  2  5  8
5  3  6  9


# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.
For example, if the 'Values' column of df contains the values __[10, 20, 30, 40, 50]__, your function should
calculate and print the sum of the first three values, which is 60.

In [8]:
import pandas as pd

def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    Rows are taken by position, so the result does not depend on the index
    labels of ``df``. If ``df`` has fewer than three rows, the rows that
    exist are summed (an empty frame prints 0).

    Args:
        df: DataFrame with a numeric 'Values' column.

    Returns:
        None. The sum is written to standard output.
    """
    total = 0  # not named ``sum``, to avoid shadowing the builtin
    # Positional slice: label lookups such as df.loc[0] fail on any index
    # that is not 0, 1, 2, ...
    for value in df['Values'].iloc[:3]:
        total += value
    print('Sum of the first three values:', total)

# Sample column 10, 20, ..., 50 from the exercise statement.
data = {'Values': list(range(10, 60, 10))}
df = pd.DataFrame(data)

# Prints the total of the first three entries.
sum_first_three_values(df)

Sum of the first three values: 60


# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [9]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column holding the number of words in 'Text'.

    Words are the pieces produced by splitting on runs of whitespace. A
    missing text (NaN/None) counts as zero words instead of raising.

    Args:
        df: DataFrame with a string 'Text' column. It is modified in place.

    Returns:
        The same DataFrame object, now with an integer 'Word_Count' column.
    """
    words_per_row = df['Text'].str.split()
    # .str.len() yields NaN for missing text where a plain len() would raise;
    # treat those rows as containing no words.
    df['Word_Count'] = words_per_row.str.len().fillna(0).astype('int64')
    return df

# Three short sentences to count words in.
data = {'Text': ['This is a sentence', 'Another sentence', 'And a third sentence']}

# Build the frame and attach the word counts in one step.
df = add_word_count_column(pd.DataFrame(data))

print(df)

                   Text  Word_Count
0    This is a sentence           4
1      Another sentence           2
2  And a third sentence           4


# Q5. How are DataFrame.size() and DataFrame.shape() different?

__Ans. :__<br>

Both __DataFrame.size__ and __DataFrame.shape__ are attributes of a Pandas DataFrame (not methods, so they are accessed without parentheses) that describe how large the DataFrame is. However, they report different things.

__DataFrame.size__ returns the number of elements in the DataFrame, which is equal to the number of rows multiplied by the number of columns. It does not provide any information about the shape or dimensions of the DataFrame.

On the other hand, __DataFrame.shape__ returns a tuple of two integers, which represent the number of rows and columns in the DataFrame, respectively. So, __DataFrame.shape__ provides information about the shape or dimensions of the DataFrame.

# Q6. Which function of pandas do we use to read an excel file?

__Ans. :__<br>

We can use the __read_excel()__ function of pandas to read an Excel file into a pandas DataFrame. This function allows us to read both XLS and XLSX files.

# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.

The username is the part of the email address that appears before the __'@'__ symbol. For example, if the
email address is __'john.doe@example.com'__, the __'Username'__ column should contain __'john.doe'__. Your
function should extract the username from each email address and store it in the new __'Username'__
column.

In [10]:
import pandas as pd

def extract_username(df):
    """Add a 'Username' column holding the part of 'Email' before the '@'.

    The frame is modified in place and also returned.
    """
    # Split at the first '@'; the leading piece is the username.
    pieces = df['Email'].str.split('@', n=1)
    df['Username'] = pieces.str.get(0)
    return df

# Sample addresses, all in the 'username@domain.com' form.
data = {'Email': ['john.doe@example.com', 'jane.smith@example.com', 'bob.johnson@example.com']}

# Build the frame and add the 'Username' column in one step.
df = extract_username(pd.DataFrame(data))

# Show both the original addresses and the extracted usernames.
print(df)

                     Email     Username
0     john.doe@example.com     john.doe
1   jane.smith@example.com   jane.smith
2  bob.johnson@example.com  bob.johnson


# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The function should return a new DataFrame that contains only the selected rows.

In [None]:
For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2

Your function should select the following rows: A B C
1 8 2 7
2 6 9 4
4 9 1 2
The function should return a new DataFrame that contains only the selected rows.

In [11]:
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is above 5 and 'B' is below 10."""
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    # A row is kept only when both conditions hold.
    return df.loc[a_above_five & b_below_ten]

# Sample values from the exercise statement.
data = {'A': [3, 8, 6, 2, 9], 'B': [5, 2, 9, 3, 1], 'C': [1, 7, 4, 5, 2]}
df = pd.DataFrame.from_dict(data)

# Keep only the rows that satisfy both conditions.
selected_rows = select_rows(df)

# Show the matching rows (original index labels are preserved).
print(selected_rows)

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [12]:
import pandas as pd

def calculate_statistics(df):
    """Print the mean, median and standard deviation of the 'Values' column.

    Nothing is returned; each statistic is printed on its own line.
    """
    values = df['Values']

    # Compute every statistic first, then report them in a fixed order.
    summary = (
        ("Mean:", values.mean()),
        ("Median:", values.median()),
        ("Standard Deviation:", values.std()),
    )
    for label, figure in summary:
        print(label, figure)

# Five consecutive integers make the expected statistics easy to check.
data = {'Values': [1, 2, 3, 4, 5]}
df = pd.DataFrame.from_dict(data)

# Prints the three statistics, one per line.
calculate_statistics(df)

Mean: 3.0
Median: 3.0
Standard Deviation: 1.5811388300841898


# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [13]:
import pandas as pd

def add_moving_average(df):
    """Add a 'MovingAverage' column with the 7-row rolling mean of 'Sales'.

    Each window ends at the current row. The first six rows average over
    however many rows are available so far (min_periods=1). The frame is
    modified in place and also returned.
    """
    # Window of up to seven consecutive rows, ending at the current row.
    trailing_week = df['Sales'].rolling(window=7, min_periods=1)
    df['MovingAverage'] = trailing_week.mean()
    return df

# Ten consecutive days of sales, rising by 5 each day.
data = {
    'Date': pd.date_range(start='2022-01-01', end='2022-01-10'),
    'Sales': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55],
}

# Build the frame and attach the rolling average in one step.
df = add_moving_average(pd.DataFrame(data))

# Show dates, sales and the computed moving average.
print(df)

        Date  Sales  MovingAverage
0 2022-01-01     10           10.0
1 2022-01-02     15           12.5
2 2022-01-03     20           15.0
3 2022-01-04     25           17.5
4 2022-01-05     30           20.0
5 2022-01-06     35           22.5
6 2022-01-07     40           25.0
7 2022-01-08     45           30.0
8 2022-01-09     50           35.0
9 2022-01-10     55           40.0


# Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g. Monday, Tuesday) corresponding to each date in the 'Date' column.

In [None]:
For example, if df contains the following values:
Date
0 2023-01-01
1 2023-01-02
2 2023-01-03
3 2023-01-04
4 2023-01-05
Your function should create the following DataFrame:

Date Weekday
0 2023-01-01 Sunday
1 2023-01-02 Monday
2 2023-01-03 Tuesday
3 2023-01-04 Wednesday
4 2023-01-05 Thursday
The function should return the modified DataFrame.

In [27]:
import pandas as pd

def add_weekday(df):
    """Add a 'Weekday' column with the day name of each entry in 'Date'.

    'Date' may already be a datetime column or may hold date strings such as
    '2023-01-01'. Strings are parsed only to compute the day name; the 'Date'
    column itself is left unchanged.

    Args:
        df: DataFrame with a 'Date' column. It is modified in place.

    Returns:
        The same DataFrame object, now with a 'Weekday' column
        (e.g. 'Sunday', 'Monday').
    """
    # The .dt accessor only exists on datetime-like data, so parse first;
    # this is a no-op for columns that are already datetimes.
    dates = pd.to_datetime(df['Date'])
    df['Weekday'] = dates.dt.day_name()
    return df

# Parse the sample dates up front so 'Date' is a datetime column.
df = pd.DataFrame(
    {'Date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])}
)
df = add_weekday(df)
print(df)

        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [25]:
import pandas as pd

def select_rows_between_dates(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows whose 'Date' falls within ``start_date``..``end_date``.

    Both bounds are inclusive calendar days: a timestamp at any time of day
    on ``end_date`` is selected. 'Date' may be a datetime column or a column
    of date strings. Entries that cannot be parsed as dates are not selected.

    Args:
        df: DataFrame with a 'Date' column. It is not modified.
        start_date: First day of the range (date string or timestamp).
        end_date: Last day of the range (date string or timestamp).

    Returns:
        A DataFrame with only the matching rows, keeping their original
        index labels and the original 'Date' values.
    """
    # Compare real datetimes rather than raw strings; unparsable entries
    # become NaT, which fails both comparisons below.
    dates = pd.to_datetime(df['Date'], errors='coerce')
    tz = dates.dt.tz  # bounds must share the column's timezone (None if naive)
    range_start = pd.Timestamp(start_date, tz=tz).normalize()
    # Use an exclusive bound at the start of the following day, so timestamps
    # later than midnight on end_date are still included.
    range_stop = pd.Timestamp(end_date, tz=tz).normalize() + pd.Timedelta(days=1)
    mask = (dates >= range_start) & (dates < range_stop)
    return df.loc[mask]

# Two January dates and three May dates; only the January ones should match.
df = pd.DataFrame(
    {'Date': ['2023-01-01', '2023-01-02', '2023-05-02', '2023-05-19', '2023-05-21']}
)
selected_rows = select_rows_between_dates(df)

# Print the full input first, then the filtered rows.
print(df, selected_rows, sep='\n')

         Date
0  2023-01-01
1  2023-01-02
2  2023-05-02
3  2023-05-19
4  2023-05-21
         Date
0  2023-01-01
1  2023-01-02


# Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

__Ans. :__<br>

To use the basic functions of pandas, the first and foremost necessary library that needs to be imported is pandas itself. You can import it using the following command:

In [28]:
import pandas as pd