<img src="../img/UP Data Science Society Logo 2.png" alt="UP Data Science Society logo" width=700 />

# [3.1] Introduction to Pandas

**Prepared by**:
- Yuan Labuguen

**Weekly Objectives**:
- Learn to load and manipulate data using Pandas
- Understand basic Pandas data structures and operations
- Master data analysis and transformation techniques with Pandas

## 1. Loading and Reading Data with Pandas

Let's start by importing Pandas and learning different ways to load data.

In [None]:
# Import pandas
import pandas as pd
import numpy as np

# Read the stress dataset from its CSV file into a DataFrame
df = pd.read_csv("./Data/Stress_Dataset.csv")

# Preview the top of the table; head() with no argument returns the first 5 rows
print("First few rows of the dataset:")
print(df.head())

# A DataFrame can also be built from a dict that maps each column name
# to the list of values for that column
data_dict = {
    "Name": ["John", "Anna", "Peter"],
    "Age": [25, 28, 30],
    "City": ["New York", "Paris", "London"],
}
df_dict = pd.DataFrame(data=data_dict)
print("\nDataFrame from dictionary:")
print(df_dict)

## 2. Understanding DataFrame Basics

Let's explore the basic properties and methods of a Pandas DataFrame.

In [None]:
# Basic DataFrame information
# NOTE: info() prints its summary directly and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line.
print("DataFrame Info:")
df.info()

# Basic statistics of numeric columns
print("\nBasic Statistics:")
print(df.describe())

# DataFrame dimensions
print("\nDataFrame Shape (rows, columns):", df.shape)

# Column names
print("\nColumn Names:", df.columns.tolist())

# Data types of columns
print("\nData Types:")
print(df.dtypes)

## 3. Data Selection and Indexing

Learn different ways to select and index data in a DataFrame.

In [None]:
# iloc selects by integer position: [rows, columns]

# One column: every row (:) of the column at position 0
print("Single column selection:")
print(df.iloc[:, 0])  # First column

# Several columns: pass a list of column positions
print("\nMultiple column selection:")
print(df.iloc[:, [0, 1]])  # First two columns

# Rows only: a slice of row positions (the end position is excluded)
print("\nRow selection by index:")
print(df.iloc[:3])  # First three rows

# Rows and columns together: row slice first, column slice second
print("\nSpecific rows and columns:")
print(df.iloc[:3, :2])  # First three rows and first two columns

# loc selects by label instead of position; here the labels are the
# names of the first two columns
print("\nLabel-based indexing with column names:")
print(df.loc[:, df.columns[:2]])  # First two columns by name

## 4. Data Filtering and Boolean Indexing

Learn how to filter data using boolean conditions.

In [None]:
# One condition: build a True/False mask and use it to keep matching rows
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) >= 1:
    first_numeric = numeric_cols[0]
    condition = df[first_numeric].gt(df[first_numeric].mean())
    print(f"Rows where {first_numeric} is above mean:")
    print(df.loc[condition].head())

# Two conditions: combine masks element-wise with & (both must hold)
if len(numeric_cols) >= 2:
    second_numeric = numeric_cols[1]
    condition1 = df[first_numeric].gt(df[first_numeric].mean())
    condition2 = df[second_numeric].lt(df[second_numeric].median())
    both_conditions = condition1 & condition2
    print(f"\nRows where {first_numeric} is above mean AND {second_numeric} is below median:")
    print(df.loc[both_conditions].head())

# Membership test: isin() marks rows whose value appears in a given list
if len(df) > 5:
    values_to_check = df.iloc[:5, 0].tolist()
    in_first_five = df.iloc[:, 0].isin(values_to_check)
    print("\nRows where first column values are in the first 5 values:")
    print(df.loc[in_first_five].head())

## 5. Basic Data Analysis Operations

Explore basic statistical operations and data analysis techniques.

In [None]:
# Summary statistics, computed column by column over the numeric columns
numeric_cols = df.select_dtypes(include=[np.number])
if not numeric_cols.empty:
    print("Basic Statistics for Numeric Columns:")
    # Each entry pairs a heading with the method that computes the statistic;
    # the method is only called when its turn to be printed comes up
    statistics = (
        ("\nMean:", numeric_cols.mean),
        ("\nMedian:", numeric_cols.median),
        ("\nStandard Deviation:", numeric_cols.std),
    )
    for heading, compute in statistics:
        print(heading)
        print(compute())

# Frequency of each distinct value in the first non-numeric column
categorical_cols = df.select_dtypes(exclude=[np.number])
if not categorical_cols.empty:
    first_cat_col = categorical_cols.columns[0]
    print(f"\nValue counts for {first_cat_col}:")
    print(df[first_cat_col].value_counts())

# Order the rows by the first numeric column, largest values first
if numeric_cols.shape[1] > 0:
    sort_col = numeric_cols.columns[0]
    print(f"\nTop 5 rows sorted by {sort_col}:")
    sorted_df = df.sort_values(by=sort_col, ascending=False)
    print(sorted_df.head())

## 6. Handling Missing Data

Learn how to identify and handle missing data in your DataFrame.

In [None]:
# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())

# Create a sample DataFrame with missing values
# (work on a copy so the original `df` is left untouched)
df_missing = df.copy()
if len(df) > 0 and len(df.columns) > 0:
    first_col = df_missing.columns[0]
    # An integer column cannot store NaN. Assigning NaN into it forces an
    # implicit upcast, which recent pandas versions deprecate, so convert the
    # column to float explicitly before blanking out the first three rows.
    if pd.api.types.is_integer_dtype(df_missing[first_col]):
        df_missing[first_col] = df_missing[first_col].astype("float64")
    df_missing.iloc[0:3, 0] = np.nan

# Different ways to handle missing values
print("\nDropping rows with missing values:")
print(df_missing.dropna().head())

print("\nFilling missing values with mean (numeric columns):")
numeric_cols = df_missing.select_dtypes(include=[np.number])
if not numeric_cols.empty:
    df_filled = df_missing.copy()
    # Replace the NaNs of each numeric column with that column's own mean
    # (mean() skips NaN values by default)
    for col in numeric_cols.columns:
        df_filled[col] = df_filled[col].fillna(df_filled[col].mean())
    print(df_filled.head())

## 7. Grouping and Aggregation

Learn how to group data and perform aggregation operations.

In [None]:
# Grouping needs one column to group by and one column to aggregate
if len(df.columns) >= 2:
    # Non-numeric columns are candidates for grouping keys,
    # numeric columns are candidates for aggregation
    cat_cols = df.select_dtypes(exclude=[np.number]).columns
    num_cols = df.select_dtypes(include=[np.number]).columns

    if len(cat_cols) > 0 and len(num_cols) > 0:
        group_col = cat_cols[0]
        agg_col = num_cols[0]

        # One aggregation: the mean of agg_col within each group
        print(f"Grouping by {group_col} and calculating mean of {agg_col}:")
        per_group_mean = df.groupby(group_col)[agg_col].mean()
        print(per_group_mean)

        # Several aggregations at once: agg() takes a list of function names
        print(f"\nMultiple aggregations for {agg_col}:")
        per_group_summary = df.groupby(group_col)[agg_col].agg(['mean', 'count', 'std'])
        print(per_group_summary)

        # Two grouping keys: one result row per combination of their values
        if len(cat_cols) >= 2:
            print(f"\nGrouping by multiple columns ({cat_cols[0]} and {cat_cols[1]}):")
            group_keys = [cat_cols[0], cat_cols[1]]
            print(df.groupby(group_keys)[agg_col].mean())

## 8. Data Manipulation and Transformation

Learn how to manipulate and transform your data using Pandas.

In [None]:
# Helper for the "applying a function to a column" step below
def custom_function(x):
    """Return ``x`` multiplied by 10."""
    return x * 10


# Work out the numeric columns once, before either section runs. The
# "multiple columns" section at the bottom reads `numeric_cols` as well, so it
# must always be defined here and always describe the current `df`.
numeric_cols = df.select_dtypes(include=[np.number])

# Adding a new column
# (`not numeric_cols.empty` already implies df has at least one row and column)
if not numeric_cols.empty:
    col = numeric_cols.columns[0]
    df['new_column'] = df[col] * 2
    print("DataFrame with new column:")
    print(df.head())

    # Applying a function to a column: apply() calls it once per value
    df['custom_transformation'] = df[col].apply(custom_function)
    print("\nDataFrame with custom transformation:")
    print(df.head())

    # Removing columns: drop() returns a new DataFrame, so rebind `df`
    df = df.drop(['new_column', 'custom_transformation'], axis=1)
    print("\nDataFrame after removing new columns:")
    print(df.head())

# Applying operations to multiple columns
if len(numeric_cols.columns) >= 2:
    print("\nApplying operations to multiple columns:")
    col1, col2 = numeric_cols.columns[:2]
    df['sum_columns'] = df[col1] + df[col2]
    print(df.head())
    df = df.drop('sum_columns', axis=1)  # Clean up