### Q1. List any five functions of the pandas library with execution.

In [1]:
import pandas as pd

# Small roster shared by all five demonstrations below.
data = {
    "Name": ["Alice", "Bob", "Claire", "David", "Eva"],
    "Age": [25, 30, 27, 22, 35],
    "Gender": ["Female", "Male", "Female", "Male", "Female"],
    "Score": [85, 90, 88, 76, 95]
}
df = pd.DataFrame(data)

# 1. head() - Get the first 3 rows
print("1. head():")
print(df.head(3))

# 2. describe() - Summary statistics for the numeric columns (Age, Score)
print("\n2. describe():")
print(df.describe())

# 3. dropna() - Drop rows with any missing values.
# A missing value is injected first for the demonstration. "Age" is cast to
# float beforehand because writing NaN/None into an int64 column depends on
# implicit upcasting, which pandas has deprecated. astype() returns a new
# frame, so `df` itself keeps its integer ages.
df_with_nan = df.astype({"Age": "float64"})
df_with_nan.loc[2, "Age"] = float("nan")
print("\n3. dropna():")
print(df_with_nan.dropna())

# 4. groupby() - Group by Gender and get mean age
print("\n4. groupby():")
print(df.groupby("Gender")["Age"].mean())

# 5. merge() - Join with a second DataFrame on the shared "Name" column.
# The join is an inner join (spelled out explicitly here), so people without
# a city entry in df2 (David, Eva) do not appear in the result.
data2 = {
    "Name": ["Alice", "Bob", "Claire"],
    "City": ["New York", "Los Angeles", "Chicago"]
}
df2 = pd.DataFrame(data2)
print("\n5. merge():")
print(pd.merge(df, df2, on="Name", how="inner"))


1. head():
     Name  Age  Gender  Score
0   Alice   25  Female     85
1     Bob   30    Male     90
2  Claire   27  Female     88

2. describe():
             Age      Score
count   5.000000   5.000000
mean   27.800000  86.800000
std     4.969909   7.049823
min    22.000000  76.000000
25%    25.000000  85.000000
50%    27.000000  88.000000
75%    30.000000  90.000000
max    35.000000  95.000000

3. dropna():
    Name   Age  Gender  Score
0  Alice  25.0  Female     85
1    Bob  30.0    Male     90
3  David  22.0    Male     76
4    Eva  35.0  Female     95

4. groupby():
Gender
Female    29.0
Male      26.0
Name: Age, dtype: float64

5. merge():
     Name  Age  Gender  Score         City
0   Alice   25  Female     85     New York
1     Bob   30    Male     90  Los Angeles
2  Claire   27  Female     88      Chicago


### Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [2]:
import pandas as pd

def reindex_dataframe(df):
    """Relabel the rows of ``df`` with the odd numbers 1, 3, 5, ...

    The new index is assigned on ``df`` itself, so the caller's object is
    modified, and that same object is returned.
    """
    row_count = len(df)
    # The exclusive stop of 2 * row_count + 1 yields exactly one label per row.
    df.index = pd.RangeIndex(start=1, stop=2 * row_count + 1, step=2)
    return df

# Three-row demo frame with the columns named in the question
data = dict(
    A=[10, 20, 30],
    B=[40, 50, 60],
    C=[70, 80, 90],
)
df = pd.DataFrame(data)

# Relabel the rows as 1, 3, 5 and show the result
reindexed_df = reindex_dataframe(df)
print(reindexed_df)


    A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


### Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The function should print the sum to the console.

In [3]:
import pandas as pd

def sum_first_three_values(df):
    """Print the sum of the first three entries of the 'Values' column.

    If ``df`` has no 'Values' column, a message saying so is printed instead.
    Nothing is returned in either case.
    """
    if 'Values' in df.columns:
        # Positional slice: at most the first three rows, whatever the index labels are
        leading_total = df['Values'].iloc[:3].sum()
        print(f"The sum of the first three values in the 'Values' column is: {leading_total}")
    else:
        print("The DataFrame does not have a 'Values' column.")

# Five sample values; only the first three (10, 20, 30) contribute to the sum
data = dict(Values=[10, 20, 30, 40, 50])
df = pd.DataFrame(data)

# Prints the total to the console
sum_first_three_values(df)


The sum of the first three values in the 'Values' column is: 60


### Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column 'Word_Count' that contains the number of words in each row of the 'Text' column.

In [4]:
import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' entry.

    Words are the whitespace-separated pieces of the text. Missing entries
    (None/NaN) count as 0 words. The column is added to ``df`` in place and
    nothing is returned.
    """
    df['Word_Count'] = df['Text'].apply(
        # Missing text has no words; otherwise count the whitespace-separated tokens
        lambda text: 0 if pd.isna(text) else len(text.split())
    )

# Example usage: three sentences plus a missing value and an empty string
data = {
    'Text': [
        'This is a test',
        'Another test case',
        'Pandas is great',
        None,
        '',
    ]
}
df = pd.DataFrame(data)
add_word_count_column(df)  # adds 'Word_Count' to df in place
print(df)


                Text  Word_Count
0     This is a test           4
1  Another test case           3
2    Pandas is great           3
3               None           0
4                              0


### Q5. How are DataFrame.size() and DataFrame.shape() different?

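Both `shape` and `size` are attributes (properties), not methods, so they are accessed without parentheses: `df.shape` and `df.size`.

- `DataFrame.shape` provides the structure of the DataFrame as a tuple `(number of rows, number of columns)`.
- `DataFrame.size` gives the total number of data points in the DataFrame, i.e. rows × columns.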

Comparison table:

| Attribute       | Description                               | Output             | Example Output      |
|-----------------|-------------------------------------------|--------------------|---------------------|
| `DataFrame.shape` | Number of rows and columns (dimensionality) | Tuple              | `(3, 2)`            |
| `DataFrame.size`  | Total number of elements                  | Integer            | `6`                 |



In [5]:
import pandas as pd

# Sample frame: 3 rows, 2 columns
data = dict(A=[1, 2, 3], B=[4, 5, 6])
df = pd.DataFrame(data)

# shape -> (rows, columns) tuple
print(f"Shape: {df.shape}")  # Output: Shape: (3, 2)

# size -> total number of cells (rows * columns)
print(f"Size: {df.size}")    # Output: Size: 6

Shape: (3, 2)
Size: 6


### Q6. Which function of pandas do we use to read an excel file?

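We use the `pandas.read_excel()` function, for example `df = pd.read_excel("file.xlsx", sheet_name=0)`. Its most commonly used parameters are listed below.

### Parameters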

- **`io`**: str, bytes, ExcelFile, xlrd.Book, path object, or file-like object. The path to the Excel file.
- **`sheet_name`**: str, int, list, or None, default 0. Specifies which sheet or sheets to read.
  - `0` or sheet name: read the first sheet or the specified sheet.
  - `None`: read all sheets into a dictionary of DataFrames.
  - List of sheet names or sheet indices: read specified sheets into a dictionary of DataFrames.
- **`header`**: int, list of int, default 0. Row (0-indexed) to use for column labels.
- **`names`**: array-like, default None. List of column names to use.
- **`index_col`**: int, list of int, default None. Column (0-indexed) to use as the row labels.
- **`usecols`**: str, list-like, or callable, default None. Return a subset of the columns.
- **`dtype`**: Type name or dict of column -> type, default None. Data type for data or columns.
- **`engine`**: str, default None. If io is not a buffer or path, this must be set to the engine you want to use (e.g., 'xlrd', 'openpyxl', or 'odf').


### Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email 
addresses in the format 'username@domain.com'. Write a Python function that creates a new column 
'Username' in df that contains only the username part of each email address.
The username is the part of the email address that appears before the '@' symbol. For example, if the
email address is 'john.doe@example.com', the 'Username' column should contain 'john.doe'. Your
function should extract the username from each email address and store it in the new 'Username'
column.

In [7]:
import pandas as pd

def add_username_column(df):
    """Add a 'Username' column holding the part of each 'Email' before the '@'.

    The column is added to ``df`` in place and nothing is returned.
    Missing addresses (None/NaN) are carried over as missing values instead
    of raising AttributeError.
    """
    def extract_username(email):
        # Missing values have no .split(); pass them through unchanged
        if pd.isna(email):
            return email
        # Only the text before the first '@' is needed
        return email.split('@', 1)[0]

    # Apply the extractor to every entry of the 'Email' column
    df['Username'] = df['Email'].apply(extract_username)

# Example usage: three well-formed addresses
data = {
    'Email': [
        'john.doe@example.com',
        'jane.smith@domain.com',
        'alice.brown@another.com',
    ]
}
df = pd.DataFrame(data)
add_username_column(df)  # adds 'Username' to df in place
print(df)


                     Email     Username
0     john.doe@example.com     john.doe
1    jane.smith@domain.com   jane.smith
2  alice.brown@another.com  alice.brown


### Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
function should return a new DataFrame that contains only the selected rows.
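For example, if df contains the following values:

```
   A  B  C
0  3  5  1
1  8  2  7
2  6  9  4
3  2  3  5
4  9  1  2
```

Your function should select the following rows:

```
   A  B  C
1  8  2  7
4  9  1  2
```

(Note: row 2, with A = 6 and B = 9, also satisfies both stated conditions, so a function that implements them exactly returns it as well.)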
The function should return a new DataFrame that contains only the selected rows.

In [8]:
import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is greater than 5 and 'B' is less than 10."""
    a_is_large = df['A'] > 5
    b_is_small = df['B'] < 10
    # Both conditions must hold for a row to be kept
    return df[a_is_large & b_is_small]

# Example usage: the five-row table from the question
data = dict(
    A=[3, 8, 6, 2, 9],
    B=[5, 2, 9, 3, 1],
    C=[1, 7, 4, 5, 2],
)

df = pd.DataFrame(data)
selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


### Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean, median, and standard deviation of the values in the 'Values' column.

In [9]:
import pandas as pd

def calculate_statistics(df):
    """Summarise the 'Values' column of ``df``.

    Returns a dict with the keys 'Mean', 'Median' and 'Standard Deviation',
    each computed with the corresponding pandas Series method.
    """
    values = df['Values']
    return {
        'Mean': values.mean(),
        'Median': values.median(),
        'Standard Deviation': values.std(),
    }

# Example usage: five evenly spaced values
data = dict(Values=[10, 20, 30, 40, 50])
df = pd.DataFrame(data)
statistics = calculate_statistics(df)
print(statistics)


{'Mean': 30.0, 'Median': 30.0, 'Standard Deviation': 15.811388300841896}


### Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days for each row in the DataFrame. The moving average should be calculated using a window of size 7 and should include the current day.

In [10]:
import pandas as pd

def add_moving_average(df):
    """Return a date-sorted copy of ``df`` with a 'MovingAverage' column.

    'MovingAverage' is the mean of 'Sales' over the current row and up to
    six preceding rows in date order; the first rows average over however
    many rows are available. The window counts rows, so it corresponds to
    "the past 7 days" only when there is one row per calendar day.

    The input frame is left untouched: the 'Date' conversion, the sorting
    and the new column are all applied to a derived frame, which is returned
    with 'Date' as datetime values.
    """
    # assign() builds a new frame, so the caller's 'Date' column is not overwritten
    ordered = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(by='Date')

    # 7-row window including the current row; min_periods=1 avoids NaN at the start
    ordered['MovingAverage'] = ordered['Sales'].rolling(window=7, min_periods=1).mean()

    return ordered

# Example usage: ten consecutive days, 2024-07-01 through 2024-07-10
data = {
    'Date': [f"2024-07-{day:02d}" for day in range(1, 11)],
    'Sales': [200, 220, 250, 270, 300, 280, 310, 330, 350, 400],
}

df = pd.DataFrame(data)
df_with_moving_avg = add_moving_average(df)
print(df_with_moving_avg)


        Date  Sales  MovingAverage
0 2024-07-01    200     200.000000
1 2024-07-02    220     210.000000
2 2024-07-03    250     223.333333
3 2024-07-04    270     235.000000
4 2024-07-05    300     248.000000
5 2024-07-06    280     253.333333
6 2024-07-07    310     261.428571
7 2024-07-08    330     280.000000
8 2024-07-09    350     298.571429
9 2024-07-10    400     320.000000


### Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g. Monday, Tuesday) corresponding to each date in the 'Date' column. For example, if df contains the following values:
Date <br>
0 2023-01-01 <br>
1 2023-01-02 <br>
2 2023-01-03 <br>
3 2023-01-04 <br>
4 2023-01-05 <br>
Your function should create the following DataFrame: <br>
<br>
Date Weekday <br>
0 2023-01-01 Sunday <br>
1 2023-01-02 Monday <br>
2 2023-01-03 Tuesday <br>
3 2023-01-04 Wednesday <br>
4 2023-01-05 Thursday <br>
The function should return the modified DataFrame.

In [11]:
import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the day name of each entry in 'Date'.

    'Date' is converted to datetime values and both columns are written to
    ``df`` itself; the same (modified) DataFrame is returned.
    """
    parsed_dates = pd.to_datetime(df['Date'])
    df['Date'] = parsed_dates
    # day_name() gives names such as 'Sunday', 'Monday'
    df['Weekday'] = parsed_dates.dt.day_name()
    return df

# Example usage: the five dates from the question, 2023-01-01 through 2023-01-05
data = {
    'Date': [f"2023-01-{day:02d}" for day in range(1, 6)]
}

df = pd.DataFrame(data)
df_with_weekday = add_weekday_column(df)
print(df_with_weekday)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


### Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [12]:
import pandas as pd

def filter_dates(df, start_date='2023-01-01', end_date='2023-01-31'):
    """Return the rows of ``df`` whose 'Date' falls between two calendar days.

    Parameters
    ----------
    df : DataFrame with a 'Date' column that ``pd.to_datetime`` can parse.
    start_date, end_date : first and last day of the range, both inclusive.
        The defaults select January 2023.

    Returns
    -------
    A new DataFrame containing the matching rows, with 'Date' converted to
    datetime values. The input frame is not modified.

    The bounds are compared against the calendar day of each timestamp, so
    a value such as '2023-01-31 15:30' still counts as being on the end day
    rather than falling after midnight of ``end_date``.
    """
    dates = pd.to_datetime(df['Date'])

    # Drop the time-of-day part so the whole end day is inside the range
    days = dates.dt.normalize()
    in_range = (days >= start_date) & (days <= end_date)

    # assign() creates a new frame, leaving the caller's 'Date' column as it was
    return df.assign(Date=dates)[in_range]

# Example usage: four January dates and one February date, in no particular order
data = {
    'Date': [
        '2023-01-01',
        '2023-01-15',
        '2023-02-01',
        '2023-01-30',
        '2023-01-05',
    ]
}

df = pd.DataFrame(data)
filtered_df = filter_dates(df)
print(filtered_df)


        Date
0 2023-01-01
1 2023-01-15
3 2023-01-30
4 2023-01-05


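### Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

The `pandas` library itself must be imported first, conventionally under the alias `pd` (`import pandas as pd`).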

In [13]:
import pandas as pd

# Build a small two-column frame to confirm that pandas is importable and usable
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))

# Show the frame
print(df)


   A  B
0  1  4
1  2  5
2  3  6
