# Q1. List any five functions of the pandas library with execution.

In [None]:
import pandas as pd

# Sample DataFrame: three integer columns with three rows each.
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9],
}
df = pd.DataFrame(data)

# Five functions/attributes of the pandas library
print(df.head())        # Display the first rows of the DataFrame (up to 5)
# info() writes its summary straight to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()               # Display concise summary information of the DataFrame
print(df.describe())    # Generate descriptive statistics of the DataFrame
print(df.shape)         # Return the dimensions of the DataFrame (rows, columns)
print(df.columns)       # Return the column labels of the DataFrame


   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       3 non-null      int64
 1   B       3 non-null      int64
 2   C       3 non-null      int64
dtypes: int64(3)
memory usage: 200.0 bytes
None
         A    B    C
count  3.0  3.0  3.0
mean   2.0  5.0  8.0
std    1.0  1.0  1.0
min    1.0  4.0  7.0
25%    1.5  4.5  7.5
50%    2.0  5.0  8.0
75%    2.5  5.5  8.5
max    3.0  6.0  9.0
(3, 3)
Index(['A', 'B', 'C'], dtype='object')


# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row

In [2]:
def reindex_dataframe(df):
    """Give ``df`` an odd-numbered index: 1, 3, 5, ...

    The index is replaced in place on the DataFrame that was passed in,
    and that same object is returned.
    """
    row_count = len(df)
    # Odd numbers starting at 1, one label per row.
    df.index = list(range(1, 2 * row_count, 2))
    return df


# Example usage: a three-row frame ends up labelled 1, 3, 5.
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
df = reindex_dataframe(df)
print(df)


   A  B  C
1  1  4  7
3  2  5  8
5  3  6  9


# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.

In [3]:
def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    When the column holds fewer than three rows, every available row is
    summed. Nothing is returned; the result only goes to the console.
    """
    leading = df['Values'].iloc[:3]
    print("Sum of first three values:", leading.sum())


# Example usage: 10 + 20 + 30 is printed, the remaining rows are ignored.
df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
sum_first_three_values(df)


Sum of first three values: 60


# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [4]:
def count_words(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' row.

    Words are whitespace-separated tokens. Non-string values are converted
    to their string form before counting. Missing values (None/NaN) count
    as 0 words rather than being stringified and counted as one word.

    The column is added to ``df`` in place and the same DataFrame is
    returned.
    """
    text = df['Text']
    token_counts = text.astype(str).str.split().str.len()
    # A missing entry has no words; do not let its string form count as one.
    df['Word_Count'] = token_counts.where(text.notna(), 0).astype('int64')
    return df


# Example usage:
df = pd.DataFrame({'Text': ['This is a sample sentence', 'Another sentence', 'One more']})
df = count_words(df)
print(df)


                        Text  Word_Count
0  This is a sample sentence           5
1           Another sentence           2
2                   One more           2


# Q5. How are DataFrame.size() and DataFrame.shape() different?

DataFrame.size: An attribute (accessed without parentheses, not called as a method) that returns the number of elements in the DataFrame, which is equal to the product of the number of rows and the number of columns.

DataFrame.shape: An attribute (accessed without parentheses, not called as a method) that returns a tuple representing the dimensions of the DataFrame, where the first element is the number of rows and the second element is the number of columns.

# Q6. Which function of pandas do we use to read an excel file?

The function used to read an Excel file in pandas is pd.read_excel().

# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email addresses in the format 'username@domain.com'. Write a Python function that creates a new column 'Username' in df that contains only the username part of each email address.

In [5]:
def extract_username(df):
    """Add a 'Username' column holding the part of 'Email' before the '@'.

    The split happens at the first '@'; an address without any '@' is kept
    whole. Missing emails (None/NaN) produce a missing username instead of
    raising, because the vectorised ``.str`` accessor propagates missing
    values.

    The column is added to ``df`` in place and the same DataFrame is
    returned.
    """
    # n=1: only the first '@' matters, so stop splitting after it.
    df['Username'] = df['Email'].str.split('@', n=1).str[0]
    return df


# Example usage:
df = pd.DataFrame({'Email': ['john.doe@example.com', 'alice.smith@example.com']})
df = extract_username(df)
print(df)


                     Email     Username
0     john.doe@example.com     john.doe
1  alice.smith@example.com  alice.smith


# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The function should return a new DataFrame that contains only the selected rows.

In [6]:
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where column 'A' > 5 and column 'B' < 10.

    The input DataFrame is left untouched; the matching rows come back as a
    new DataFrame that keeps their original index labels.
    """
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    return df[a_above_five & b_below_ten]


# Example usage: rows 1, 2 and 4 satisfy both conditions.
data = {
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2],
}
df = pd.DataFrame(data)
selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [7]:
import pandas as pd

def calculate_statistics(df):
    """Return ``(mean, median, standard deviation)`` of the 'Values' column.

    The standard deviation is whatever ``Series.std()`` yields with its
    default arguments.
    """
    values = df['Values']
    return values.mean(), values.median(), values.std()


# Example usage:
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
mean, median, std_dev = calculate_statistics(df)
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [8]:
import pandas as pd

def calculate_moving_average(df):
    """Add a 'MovingAverage' column: trailing mean of 'Sales' over 7 rows.

    Each window is the current row plus up to six rows before it. Near the
    start, where fewer than seven rows exist, the mean of the rows seen so
    far is used (``min_periods=1``). The 'Date' column is not consulted:
    rows are taken in their existing order, one row per step.

    The column is added to ``df`` in place and the same DataFrame is
    returned.
    """
    trailing_week = df['Sales'].rolling(window=7, min_periods=1)
    df['MovingAverage'] = trailing_week.mean()
    return df


# Example usage: ten consecutive days of sales.
data = {
    'Date': pd.date_range(start='2022-01-01', periods=10),
    'Sales': [100, 120, 130, 110, 150, 140, 160, 180, 170, 190],
}
df = pd.DataFrame(data)
df = calculate_moving_average(df)
print(df)


        Date  Sales  MovingAverage
0 2022-01-01    100     100.000000
1 2022-01-02    120     110.000000
2 2022-01-03    130     116.666667
3 2022-01-04    110     115.000000
4 2022-01-05    150     122.000000
5 2022-01-06    140     125.000000
6 2022-01-07    160     130.000000
7 2022-01-08    180     141.428571
8 2022-01-09    170     148.571429
9 2022-01-10    190     157.142857


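# Q11. Given a Pandas DataFrame df with a column 'Date', write a Python function to create a new column 'Weekday' that contains the weekday name (e.g. Monday, Tuesday) of each date.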

In [9]:
import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the full weekday name of each 'Date'.

    'Date' must be a datetime column, since the ``.dt`` accessor is used.
    The column is added to ``df`` in place and the same DataFrame is
    returned.
    """
    dates = df['Date']
    # '%A' is the strftime directive for the full weekday name.
    df['Weekday'] = dates.dt.strftime('%A')
    return df


# Example usage: 2023-01-01 was a Sunday.
data = {'Date': pd.date_range(start='2023-01-01', periods=5)}
df = pd.DataFrame(data)
df = add_weekday_column(df)
print(df)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


# Q12. Here's a Python function that selects all rows from a Pandas DataFrame where the date is between '2023-01-01' and '2023-01-31':

In [10]:
import pandas as pd

def select_rows_between_dates(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows of ``df`` whose 'Date' lies in ``[start_date, end_date]``.

    Both bounds are inclusive. The defaults select January 2023, so calling
    the function with only ``df`` behaves as before; pass other bounds to
    reuse it for any range.

    Args:
        df: DataFrame with a datetime 'Date' column.
        start_date: Lower bound (date string or timestamp), inclusive.
        end_date: Upper bound (date string or timestamp), inclusive. A bare
            date string is compared as midnight of that day, so rows later
            on the end date itself are not selected.

    Returns:
        A new DataFrame containing only the matching rows, with their
        original index labels.
    """
    in_range = df['Date'].between(start_date, end_date)
    return df[in_range]


# Example usage: two months of daily dates, of which only January is kept.
data = {'Date': pd.date_range(start='2023-01-01', end='2023-02-28')}
df = pd.DataFrame(data)
selected_df = select_rows_between_dates(df)
print(selected_df)


         Date
0  2023-01-01
1  2023-01-02
2  2023-01-03
3  2023-01-04
4  2023-01-05
5  2023-01-06
6  2023-01-07
7  2023-01-08
8  2023-01-09
9  2023-01-10
10 2023-01-11
11 2023-01-12
12 2023-01-13
13 2023-01-14
14 2023-01-15
15 2023-01-16
16 2023-01-17
17 2023-01-18
18 2023-01-19
19 2023-01-20
20 2023-01-21
21 2023-01-22
22 2023-01-23
23 2023-01-24
24 2023-01-25
25 2023-01-26
26 2023-01-27
27 2023-01-28
28 2023-01-29
29 2023-01-30
30 2023-01-31


# Q13. To use the basic functions of pandas, the first and foremost necessary library that needs to be imported is pandas itself. You can import it using the following statement: