In [1]:
'''Q1. List any five functions of the pandas library with execution.'''

import pandas as pd
from io import StringIO

# Sample CSV data
csv_data = """Name,Age,Fare
Alice,25,72.5
Bob,30,80.0
Charlie,35,50.0"""

# Using StringIO to simulate a file-like object
csv_file_like = StringIO(csv_data)

# 1. Reading the CSV data into a DataFrame
df = pd.read_csv(csv_file_like)
print("DataFrame created using read_csv():")
print(df)

# 2. Displaying the first 2 rows of the DataFrame
print("\nFirst 2 rows using head():")
print(df.head(2))

# 3. Generating descriptive statistics
print("\nDescriptive statistics using describe():")
print(df.describe())

# 4. Grouping by 'Age' and calculating the mean fare
grouped = df.groupby('Age')['Fare'].mean()
print("\nGrouped by 'Age' and mean 'Fare' using groupby():")
print(grouped)

# Sample data for another DataFrame
data2 = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df2 = pd.DataFrame(data2)

# 5. Merging the two DataFrames on the 'Name' column
merged_df = pd.merge(df, df2, on='Name')
print("\nMerged DataFrame using merge():")
print(merged_df)


DataFrame created using read_csv():
      Name  Age  Fare
0    Alice   25  72.5
1      Bob   30  80.0
2  Charlie   35  50.0

First 2 rows using head():
    Name  Age  Fare
0  Alice   25  72.5
1    Bob   30  80.0

Descriptive statistics using describe():
        Age       Fare
count   3.0   3.000000
mean   30.0  67.500000
std     5.0  15.612495
min    25.0  50.000000
25%    27.5  61.250000
50%    30.0  72.500000
75%    32.5  76.250000
max    35.0  80.000000

Grouped by 'Age' and mean 'Fare' using groupby():
Age
25    72.5
30    80.0
35    50.0
Name: Fare, dtype: float64

Merged DataFrame using merge():
      Name  Age  Fare         City
0    Alice   25  72.5     New York
1      Bob   30  80.0  Los Angeles
2  Charlie   35  50.0      Chicago


In [3]:
'''Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
DataFrame with a new index that starts from 1 and increments by 2 for each row.'''


#  Sample DataFrame
data = {
    'A': [10, 20, 30, 40, 50],
    'B': [60, 70, 80, 90, 100],
    'C': [110, 120, 130, 140, 150]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


# new_index = range(1, 2 * len(df)+1 ,2)

# df.index = new_index
# print("after reindexing : " , df)


def reindex_dataframe(df) :
    new_index = range(1, 2 * len(df)+1 ,2)
    df.index = new_index
    return df

reindex = reindex_dataframe(df)
print(reindex)

print(df)




Original DataFrame:
    A    B    C
0  10   60  110
1  20   70  120
2  30   80  130
3  40   90  140
4  50  100  150
    A    B    C
1  10   60  110
3  20   70  120
5  30   80  130
7  40   90  140
9  50  100  150
    A    B    C
1  10   60  110
3  20   70  120
5  30   80  130
7  40   90  140
9  50  100  150


In [17]:
'''Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
function should print the sum to the console.'''

data = {
    "values" : [10,20,30,40,50]
}
df = pd.DataFrame(data)
print(df)

def sumOfFirstThree(df):
    sum = 0
    for i in range(0,3):
        x = df["values"].values[i]  #you can use iloc insted of values
        sum = x + sum
    return sum

return_sum = sumOfFirstThree(df)
print("\n the sum of first three numbers  :",return_sum)


   values
0      10
1      20
2      30
3      40
4      50
<function sumOfFirstThree at 0x00000277CE755A80>


In [15]:
data = {
    "values" : ["pratish","yash,pratish","john,nancy"]
}
df = pd.DataFrame(data)
print(df)


         values
0       pratish
1  yash,pratish
2    john,nancy


In [18]:
'''Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
'Word_Count' that contains the number of words in each row of the 'Text' column.'''

import pandas as pd

# Sample DataFrame
data = {
    'Text': ['Hello world', 'This is a test', 'Pandas is great', 'Data Science is fun']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Define a function to count the number of words in a string
def count_words(text):
    return len(text.split())

# Apply the function to the 'Text' column
df['Word_Count'] = df['Text'].apply(count_words)

print("\nDataFrame with 'Word_Count' column:")
print(df)


# other method using lambda

data1 = {
    'Text': ['Hello world', 'This is a test', 'Pandas is great', 'Data Science is fun']
}
df1 = pd.DataFrame(data1)

def count_words1(df):
    df1["new_column"] = df1["Text"].apply(lambda x : len(x.split()))
    return df

print(count_words1(df1))


Original DataFrame:
                  Text
0          Hello world
1       This is a test
2      Pandas is great
3  Data Science is fun

DataFrame with 'Word_Count' column:
                  Text  Word_Count
0          Hello world           2
1       This is a test           4
2      Pandas is great           3
3  Data Science is fun           4
                  Text  new_column
0          Hello world           2
1       This is a test           4
2      Pandas is great           3
3  Data Science is fun           4


In [20]:
'''Q5. How are DataFrame.size() and DataFrame.shape() different?'''

import pandas as pd

# Sample DataFrame
data = {
    'A': [1, 2, 3],
    'B': [4, 5, 6],
    'C': [7, 8, 9]
}

df = pd.DataFrame(data)

# Using DataFrame.size
total_elements = df.size
print("DataFrame.size:", total_elements)

# Using DataFrame.shape
shape = df.shape
print("DataFrame.shape:", shape)


DataFrame.size: 9
DataFrame.shape: (3, 3)


### Understanding DataFrame.size and DataFrame.shape

#### <small>DataFrame.size</small>

- **Definition**: Returns the total number of elements in the DataFrame.
- **Type**: Integer.
- **Calculation**: This is computed as the number of rows multiplied by the number of columns.

#### <small>DataFrame.shape</small>

- **Definition**: Returns a tuple representing the dimensionality of the DataFrame.
- **Type**: Tuple.
- **Calculation**: The first element of the tuple is the number of rows, and the second element is the number of columns.


In [None]:
'''Q6. Which function of pandas do we use to read an excel file?'''

pd.read_excel(
    io, 
    sheet_name=0, 
    header=0, 
    names=None, 
    index_col=None, 
    usecols=None, 
    engine=None, 
    converters=None, 
    dtype=None, 
    skiprows=None, 
    nrows=None, 
    na_values=None, 
    keep_default_na=True, 
    verbose=False, 
    parse_dates=False, 
    date_parser=None, 
    thousands=None, 
    convert_float=True, 
    sheet_name='Sheet1'
)
'

### Key Parameters of `pd.read_excel()`

The `pd.read_excel()` function is used to read data from an Excel file into a Pandas DataFrame. Here are some key parameters you can use:

- **`io`**: 
  - **Definition**: The file path or buffer object to read the Excel file from.
  - **Type**: String (path to the file) or file-like object.

- **`sheet_name`**: 
  - **Definition**: The name or index of the sheet to read. You can specify the sheet by name or index (0 for the first sheet). You can also pass a list of names or indexes to read multiple sheets.
  - **Type**: String (sheet name), integer (sheet index), or list of strings/integers.

- **`header`**: 
  - **Definition**: The row number(s) to use as the column names. Default is `0`, which means the first row is used.
  - **Type**: Integer or list of integers.

- **`index_col`**: 
  - **Definition**: Column(s) to set as the index.
  - **Type**: Integer, string, or list of integers/strings.

- **`usecols`**: 
  - **Definition**: Columns to read. Can be specified by column names or indices.
  - **Type**: String, list of strings, integer, or list of integers.

- **`dtype`**: 
  - **Definition**: Data type for the resulting DataFrame.
  - **Type**: Dictionary with column names as keys and data types as values.


In [4]:
'''Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
addresses in the format 'username@domain.com'. Write a Python function that creates a new column
'Username' in df that contains only the username part of each email address. '''

import pandas as pd

# Sample DataFrame
data = {
    'email': ['alice@example.com', 'bob@example.org', 'charlie@domain.net']
}

df = pd.DataFrame(data)
print(df)

def extact_name(email):
    return email.split("@")[0]  #split() return function , so the 0th value is before @ 

df["username"] = df["email"].apply(extact_name)
print(df)

                email
0   alice@example.com
1     bob@example.org
2  charlie@domain.net
                email username
0   alice@example.com    alice
1     bob@example.org      bob
2  charlie@domain.net  charlie


In [11]:
'''Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
function should return a new DataFrame that contains only the selected rows.
For example, if df contains the following values:
A B C
0 3 5 1
1 8 2 7
2 6 9 4
3 2 3 5
4 9 1 2 '''


# Create the DataFrame
data = {
    'A': [3, 8, 6, 2, 9],
    'B': [5, 2, 9, 3, 1],
    'C': [1, 7, 4, 5, 2]
}

df = pd.DataFrame(data)
print(df)



   A  B  C
0  3  5  1
1  8  2  7
2  6  9  4
3  2  3  5
4  9  1  2


In [12]:
df[(df['A'] > 5) & (df['B'] > 5)]

Unnamed: 0,A,B,C
2,6,9,4


6.0

In [16]:
'''Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
median, and standard deviation of the values in the 'Values' column.'''



data = {
    'values': [3, 8, 6, 2, 9]
}

df = pd.DataFrame(data)

def central_tendencies(df) :
    mean = df["values"].mean()
    median = df["values"].median()
    std = df["values"].std()
    return mean,median,std

mean,median,std = central_tendencies(df)
print("Mean :",mean)
print("median : ", median)
print("std : ",std)

Mean : 5.6
median :  6.0
std :  3.0495901363953815


In [17]:
pd.date_range(start='2023-01-01', periods=10, freq='D')

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10'],
              dtype='datetime64[ns]', freq='D')

In [18]:
'''Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
should include the current day.'''

import pandas as pd

# Sample DataFrame
data = {
    'Date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
    'Sales': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
}

df = pd.DataFrame(data)

def add_moving_average(df, column_name, window_size=7):
    """
    Add a column to the DataFrame with the moving average of the specified column.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    column_name (str): Name of the column for which to calculate the moving average.
    window_size (int): Size of the moving average window. Default is 7.

    Returns:
    pd.DataFrame: DataFrame with the new 'MovingAverage' column.
    """
    df['MovingAverage'] = df[column_name].rolling(window=window_size, min_periods=1).mean()
    return df

# Add the moving average column
df = add_moving_average(df, 'Sales')

print(df)


        Date  Sales  MovingAverage
0 2023-01-01    100          100.0
1 2023-01-02    150          125.0
2 2023-01-03    200          150.0
3 2023-01-04    250          175.0
4 2023-01-05    300          200.0
5 2023-01-06    350          225.0
6 2023-01-07    400          250.0
7 2023-01-08    450          300.0
8 2023-01-09    500          350.0
9 2023-01-10    550          400.0


df[column_name].rolling(window=window_size, min_periods=1).mean() calculates the rolling mean (moving average) of the specified column over the given window size. The min_periods=1 argument ensures that the function calculates the mean even if there are fewer than 7 values at the start of the series.

In [19]:
'''Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
Monday, Tuesday) corresponding to each date in the 'Date' column.
For example, if df contains the following values:
Date
0 2023-01-01
1 2023-01-02
2 2023-01-03
3 2023-01-04
4 2023-01-05
Your function should create the following DataFrame:

Date Weekday
0 2023-01-01 Sunday
1 2023-01-02 Monday
2 2023-01-03 Tuesday
3 2023-01-04 Wednesday
4 2023-01-05 Thursday
The function should return the modified DataFrame.'''

import pandas as pd

# Sample DataFrame
data = {
    'Date': pd.date_range(start='2023-01-01', periods=5, freq='D')
}

df = pd.DataFrame(data)

def add_weekday_column(df, date_column):
    """
    Add a column to the DataFrame with the weekday name for each date.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    date_column (str): Name of the column containing dates.

    Returns:
    pd.DataFrame: DataFrame with the new 'Weekday' column.
    """
    df['Weekday'] = df[date_column].dt.day_name()
    return df

# Add the weekday column
df = add_weekday_column(df, 'Date')

print(df)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


In [20]:
'''Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
function to select all rows where the date is between '2023-01-01' and '2023-01-31'.'''

import pandas as pd

# Sample DataFrame with timestamp data
data = {
    'Date': pd.date_range(start='2022-12-25', periods=10, freq='D'),
    'Values': range(10)
}

df = pd.DataFrame(data)

def select_dates_between(df, date_column, start_date, end_date):
    """
    Select all rows where the date is between the specified start and end dates.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data.
    date_column (str): Name of the column containing dates.
    start_date (str): The start date in 'YYYY-MM-DD' format.
    end_date (str): The end date in 'YYYY-MM-DD' format.

    Returns:
    pd.DataFrame: DataFrame with rows where the date is between the start and end dates.
    """
    mask = (df[date_column] >= start_date) & (df[date_column] <= end_date)
    return df[mask]

# Select rows where the date is between '2023-01-01' and '2023-01-31'
filtered_df = select_dates_between(df, 'Date', '2023-01-01', '2023-01-31')

print(filtered_df)


        Date  Values
7 2023-01-01       7
8 2023-01-02       8
9 2023-01-03       9


'''Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to
be imported?'''

The first and foremost necessary library that needs to be imported to use the basic functions of pandas is the `pandas` library itself. Typically, it is imported with the alias `pd`. Additionally, if you are working with numerical data or performing numerical operations, it's common to also import the `numpy` library with the alias `np`.

Here’s how you typically start a pandas script:

### Example

```python
import pandas as pd
import numpy as np  # Optional, but commonly used with pandas

# Now you can use pandas functions with the pd alias
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df)
```

### Explanation

- `import pandas as pd`: This imports the pandas library and allows you to use the alias `pd` to refer to it. This is a standard convention in the Python data science community.
- `import numpy as np`: While not strictly necessary for basic pandas functionality, `numpy` is often used in conjunction with pandas for numerical operations and array manipulations.

By importing pandas with the `pd` alias, you can easily access and utilize all of the powerful data manipulation and analysis tools provided by the pandas library.