In [25]:
# Getting Started with Pandas
# Objective: Introduce students to using Pandas for data analysis by loading data into Pandas
# DataFrames.

# Question 1: Importing Pandas and Loading a CSV File
# 1. Open your Jupyter Notebook or a Python environment.
# 2. Import the pandas library.
# 3. Load a CSV file into a DataFrame.

import pandas as pd
# Keep the path in a single place so the read call and the "not found" message
# can never name two different files.
CSV_PATH = 'data.csv'

try:
    df = pd.read_csv(CSV_PATH)
    print("CSV file loaded successfully into DataFrame 'df'.")
except FileNotFoundError:
    # Report the file that was actually requested.
    print(f"Error: '{CSV_PATH}' not found. Please make sure the file is in the correct directory or provide the full path.")
    df = None  # Sentinel: the checks below test `df is not None` before touching the frame
except Exception as e:
    # Any other failure is reported rather than raised so the remaining steps
    # can still run and print their "not loaded" messages.
    print(f"An error occurred while loading the CSV file: {e}")
    df = None




# Question 2: Displaying the First Few Rows
# 4. Use the head() method to display the first five rows of the DataFrame.

if df is not None:
    print("\nFirst five rows of the DataFrame:")
    print(df.head())
else:
    print("\nDataFrame 'df' is not loaded, cannot display head().")



# Question 3: Basic Data Information
# 5. Use the info() method to get a concise summary of the DataFrame.

if df is not None:
    print("\nConcise summary of the DataFrame:")
    df.info()  # info() writes its summary directly to stdout, so no print() is needed
else:
    print("\nDataFrame 'df' is not loaded, cannot display info().")

CSV file loaded successfully into DataFrame 'df'.

First five rows of the DataFrame:
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   28     Paris
3    David   35    Berlin
4      Eve   22     Tokyo

Concise summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes


In [26]:
# Data Inspection & Selection
# Objective: Learn how to inspect data and select specific data points.

# Question 1: Inspecting Column Data Types
# 6. Use the dtypes attribute to inspect the data types of each column.

# Column dtypes: handle the "nothing loaded" case first, then show the dtype table.
if df is None:
    print("\nDataFrame 'df' is not loaded, cannot display dtypes.")
else:
    print("\nData types of each column:", df.dtypes, sep="\n")


# Question 2: Selecting Columns
# 7. Select a single column from the DataFrame.

# Indexing the frame with a single column label yields that column
# (its printed repr ends in "Name: ..., dtype: ...").
if df is None:
    print("\nDataFrame 'df' is not loaded, cannot select columns.")
elif 'Name' not in df.columns:
    print("\nError: Column 'Name' not found in the DataFrame.")
else:
    name_column = df['Name']
    print("\nFirst 5 values of the 'Name' column:", name_column.head(), sep="\n")
    print("\nData type of the 'Name' column:", name_column.dtype, sep="\n")





# Question 3: Slicing Rows
# 8. Select specific rows using slicing.

# A slice inside df[...] picks rows: start is inclusive, stop is exclusive.
if df is None:
    print("\nDataFrame 'df' is not loaded, cannot slice rows.")
else:
    first_three = df[0:3]          # rows at positions 0, 1 and 2
    from_second = df[1:]           # everything except the first row
    alternating = df[::2].head()   # step of 2, capped at five rows

    print("\nFirst 3 rows of the DataFrame:", first_three, sep="\n")
    print("\nRows starting from the 2nd row (index 1):", from_second, sep="\n")
    print("\nEvery other row (first 5):", alternating, sep="\n")





Data types of each column:
Name    object
Age      int64
City    object
dtype: object

First 5 values of the 'Name' column:
0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: Name, dtype: object

Data type of the 'Name' column:
object

First 3 rows of the DataFrame:
      Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   28     Paris

Rows starting from the 2nd row (index 1):
      Name  Age    City
1      Bob   30  London
2  Charlie   28   Paris
3    David   35  Berlin
4      Eve   22   Tokyo

Every other row (first 5):
      Name  Age      City
0    Alice   25  New York
2  Charlie   28     Paris
4      Eve   22     Tokyo


In [30]:
# Data Cleaning & Manipulation
# Objective: Practice cleaning data and manipulating DataFrames.

# Assuming you have a DataFrame 'df' loaded from a CSV file.

# Let's introduce some missing values for demonstration purposes
import numpy as np
def introduce_missing_value(frame, row_label, column):
    """Overwrite one cell of ``frame`` with NaN, in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to modify.
    row_label
        Index label of the row to blank out. Nothing happens when the label
        is absent, because ``.loc`` assignment to an unknown label would
        append a brand-new row instead of editing an existing one.
    column : str
        Column to blank out. A column that does not exist yet is created by
        the assignment.
    """
    if row_label not in frame.index:
        return
    # NaN cannot be stored in an integer column. Recent pandas releases
    # deprecate the silent int -> float upcast, so do the cast explicitly.
    if column in frame.columns and pd.api.types.is_integer_dtype(frame[column]):
        frame[column] = frame[column].astype('float64')
    frame.loc[row_label, column] = np.nan


# `df` may exist but be None when loading failed, so check both conditions.
if 'df' in locals() and df is not None:
    # NOTE: these edits change `df` itself on purpose. The aggregation step
    # further down reads the 'Salary' column that is added here.
    introduce_missing_value(df, 1, 'Age')
    introduce_missing_value(df, 3, 'City')
    if 'Salary' in df.columns:
        introduce_missing_value(df, 4, 'Salary')
    else:
        df['Salary'] = np.nan # Add Salary column if it doesn't exist

    print("\nOriginal DataFrame (with introduced missing values):")
    print(df.head())

    # Question 1: Handling Missing Values
    # 9. Use the fillna() method to fill missing values with a specific value.

    # Fill all NaN values with 0 (this also puts a 0 into text columns such as 'City')
    df_filled_zero = df.fillna(0)
    print("\nDataFrame with NaN values filled with 0:")
    print(df_filled_zero.head())

    # Fill NaN values in specific columns with different values
    df_filled_specific = df.copy() # Work on a copy so `df` keeps its NaNs for the later steps
    if 'Age' in df_filled_specific.columns and df_filled_specific['Age'].notna().any():
        df_filled_specific['Age'] = df_filled_specific['Age'].fillna(df_filled_specific['Age'].mean()) # Fill missing 'Age' with the mean
    else:
        print("Warning: Cannot fill missing 'Age' with mean as there are no non-missing values.")

    if 'City' in df_filled_specific.columns:
        df_filled_specific['City'] = df_filled_specific['City'].fillna('Unknown') # Fill missing 'City' with 'Unknown'

    # A column that is entirely NaN has no median, so it is left untouched.
    if 'Salary' in df_filled_specific.columns and df_filled_specific['Salary'].notna().any():
        df_filled_specific['Salary'] = df_filled_specific['Salary'].fillna(df_filled_specific['Salary'].median()) # Fill missing 'Salary' with the median
    else:
        print("Warning: Cannot fill missing 'Salary' with median as there are no non-missing values.")

    print("\nDataFrame with NaN values filled with specific values (mean, 'Unknown', median):")
    print(df_filled_specific.head())

    # Question 2: Renaming Columns
    # 10. Change the names of specific columns using rename().

    print("\nOriginal column names:")
    print(df.columns)

    # rename() returns a new frame; `df` keeps its original column names.
    df_renamed = df.rename(columns={'Name': 'FullName', 'Age': 'Years'})
    print("\nDataFrame with columns 'Name' renamed to 'FullName' and 'Age' to 'Years':")
    print(df_renamed.head())
    print("\nNew column names:")
    print(df_renamed.columns)

    # Question 3: Dropping Duplicates
    # 11. Remove duplicate rows from the DataFrame.

    # Let's introduce a duplicate row for demonstration
    df_with_duplicates = df.copy()
    if not df_with_duplicates.empty:
        # Append a copy of the first row; ignore_index renumbers the rows 0..n.
        df_with_duplicates = pd.concat([df_with_duplicates, df_with_duplicates.iloc[[0]]], ignore_index=True)
        print("\nDataFrame with a duplicate row:")
        print(df_with_duplicates)

        df_dropped_duplicates = df_with_duplicates.drop_duplicates()
        print("\nDataFrame with duplicate rows dropped:")
        print(df_dropped_duplicates)
    else:
        print("\nDataFrame is empty, cannot demonstrate dropping duplicates.")

else:
    print("\nCannot perform data cleaning and manipulation as DataFrame 'df' is not loaded.")


Original DataFrame (with introduced missing values):
      Name   Age      City  Salary
0    Alice  25.0  New York     NaN
1      Bob   NaN    London     NaN
2  Charlie  28.0     Paris     NaN
3    David  35.0       NaN     NaN
4      Eve  22.0     Tokyo     NaN

DataFrame with NaN values filled with 0:
      Name   Age      City  Salary
0    Alice  25.0  New York     0.0
1      Bob   0.0    London     0.0
2  Charlie  28.0     Paris     0.0
3    David  35.0         0     0.0
4      Eve  22.0     Tokyo     0.0

DataFrame with NaN values filled with specific values (mean, 'Unknown', median):
      Name   Age      City  Salary
0    Alice  25.0  New York     NaN
1      Bob  27.5    London     NaN
2  Charlie  28.0     Paris     NaN
3    David  35.0   Unknown     NaN
4      Eve  22.0     Tokyo     NaN

Original column names:
Index(['Name', 'Age', 'City', 'Salary'], dtype='object')

DataFrame with columns 'Name' renamed to 'FullName' and 'Age' to 'Years':
  FullName  Years      City  Salary


In [32]:
# Data Aggregation & Exporting
# Objective: Aggregate data and export the results.



def aggregate_by_city(frame):
    """Compute per-City age and salary statistics.

    Only statistics whose source column ('Age' or 'Salary') is present in
    ``frame`` are computed, so a frame without 'Salary' does not raise.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    pandas.DataFrame or None
        One row per City with the available statistics as columns, or None
        when 'City' is missing, holds no non-missing value, or neither 'Age'
        nor 'Salary' exists.
    """
    requested = {
        'mean_age': ('Age', 'mean'),
        'median_age': ('Age', 'median'),
        'max_age': ('Age', 'max'),
        'min_age': ('Age', 'min'),
        'mean_salary': ('Salary', 'mean'),
        'total_salary': ('Salary', 'sum'),
    }
    available = {
        label: spec for label, spec in requested.items() if spec[0] in frame.columns
    }
    if 'City' not in frame.columns or not frame['City'].notna().any() or not available:
        return None
    return frame.groupby('City').agg(**available)


# `df` may exist but be None when loading failed, so rule that out before
# asking whether the frame is empty.
if 'df' in locals() and df is not None and not df.empty:
    print("\nOriginal DataFrame (first 5 rows):")
    print(df.head())

    # Grouping is only meaningful when 'City' exists and has at least one
    # non-missing value.
    can_group_by_city = 'City' in df.columns and df['City'].notna().any()

    # Question 1: Grouping and Aggregating Data
    # 12. Group data by a specific column and calculate the mean for each group.
    #     Replace 'City' with the column you want to group by and 'Age' (or another numerical column)
    #     with the column you want to calculate the mean of.

    if not can_group_by_city:
        print("\nCannot group by 'City' as the column does not exist or has no unique values.")
    elif 'Age' not in df.columns:
        print("\nCannot calculate the mean of 'Age' as the column does not exist.")
    else:
        grouped_mean = df.groupby('City')['Age'].mean()
        print("\nMean Age per City:")
        print(grouped_mean)

    # Question 2: Exporting Data to CSV
    # 13. Export the DataFrame to a new CSV file.
    #     Replace 'output.csv' with the desired name for your output file.
    try:
        df.to_csv('output.csv', index=False)  # index=False prevents writing the DataFrame index to the CSV
        print("\nDataFrame exported to 'output.csv' successfully.")
    except Exception as e:
        print(f"\nError exporting DataFrame to CSV: {e}")

    # Question 3: Aggregating with Multiple Functions
    # 14. Apply several aggregate functions to the grouped data.
    #     Again, replace 'City' with your grouping column and 'Age' (and potentially 'Salary')
    #     with the numerical columns you want to aggregate.

    if not can_group_by_city:
        print("\nCannot perform multi-function aggregation by 'City' as the column does not exist or has no unique values.")
    else:
        missing_columns = [col for col in ('Age', 'Salary') if col not in df.columns]
        if missing_columns:
            print(f"\nSkipping aggregation of missing column(s): {', '.join(missing_columns)}")

        city_summary = aggregate_by_city(df)
        if city_summary is None:
            print("\nCannot perform multi-function aggregation as neither 'Age' nor 'Salary' exists in the DataFrame.")
        else:
            grouped_agg = city_summary
            print("\nAggregated data per City (mean, median, max, min age; mean, total salary):")
            print(grouped_agg)

else:
    print("\nCannot perform data aggregation and exporting as DataFrame 'df' is not loaded or is empty.")







Original DataFrame (first 5 rows):
      Name   Age      City  Salary
0    Alice  25.0  New York     NaN
1      Bob   NaN    London     NaN
2  Charlie  28.0     Paris     NaN
3    David  35.0       NaN     NaN
4      Eve  22.0     Tokyo     NaN

Mean Age per City:
City
London       NaN
New York    25.0
Paris       28.0
Tokyo       22.0
Name: Age, dtype: float64

DataFrame exported to 'output.csv' successfully.

Aggregated data per City (mean, median, max, min age; mean, total salary):
          mean_age  median_age  max_age  min_age  mean_salary  total_salary
City                                                                       
London         NaN         NaN      NaN      NaN          NaN           0.0
New York      25.0        25.0     25.0     25.0          NaN           0.0
Paris         28.0        28.0     28.0     28.0          NaN           0.0
Tokyo         22.0        22.0     22.0     22.0          NaN           0.0
