# Testing Workbook for AI/ML Course


In [None]:
# Import statements
import pandas as pd

# Pandas Notes

## Data Types: Series
- Series: 1 dimension of data

In [None]:
# Build a small example Series.
# A Series is pandas' one-dimensional, array-like container: it can hold any
# data type (integers, strings, floats, ...) and every value has an index label.
series = pd.Series(range(1, 6))
print(series)

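## Data Types: DataFrame
    - DataFrame = 2-dimensional labeled table (rows and columns); each column is a Series
    - Vectorized: operations are applied to whole columns at once instead of looping over rows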

- Anatomy of a DataFrame
        Name    Age         City
    0    Alice   25     New York
    1      Bob   30  Los Angeles
    2  Charlie   35      Chicago
    3    David   40      Houston
    4      Eve   45      Phoenix
- Index: (0-4), leftmost column
    - The index is written out like a column when exporting. Use the argument `index=False` to leave it out; otherwise re-reading the file adds a second index column
- Headers:
- Data:

In [None]:
# Build the example DataFrame used throughout this workbook:
# 5 rows and three columns ('Name', 'Age', 'City').
sampleDf = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        Age=[25, 30, 35, 40, 45],
        City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    )
)
print(sampleDf)

In [None]:
# Load the car-sales CSV into a DataFrame and preview its first rows.
# NOTE: display() is not defined in this file; presumably the IPython/Jupyter built-in.
csv_path = './data/car-sales.csv'
car_data = pd.read_csv(csv_path)
display(car_data.head())

## Describe data

In [None]:
# Every result is printed explicitly: a notebook cell only echoes the value of
# its LAST expression, so bare expressions in the middle of a cell show nothing.

# describe(): statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
print(sampleDf.describe())

# dtypes: the data type of each column
print(sampleDf.dtypes)

# columns: the column labels, returned as a pandas Index (not a plain list)
columnNames = sampleDf.columns
print(columnNames)

# index: the row labels of the DataFrame, a sequence of integers by default
print(sampleDf.index)

# info(): concise summary with non-null counts, data types, and memory usage.
# It prints its report itself and returns None, so it is not wrapped in print().
sampleDf.info()

# shape: tuple of (number of rows, number of columns)
print(sampleDf.shape)

# len(): number of rows in the DataFrame
print(len(sampleDf))

# count(): number of non-null entries in each column
print(sampleDf.count())

In [None]:
# Statistical analysis methods: mean, median, mode, min, max, std, var, quantile
# These need numeric data. Called on the whole DataFrame they also hit the text
# columns ('Name', 'City'), and mean/median/std/var/quantile raise a TypeError on
# strings in pandas 2.0 and later, so the numeric 'Age' column is selected first.
age = sampleDf['Age']

print(age.mean())    # Mean of the 'Age' column
print(age.median())  # Median of the 'Age' column
print(age.mode())    # Mode of the 'Age' column (a Series: there can be several values if there are ties)
print(age.min())     # Minimum value of the 'Age' column
print(age.max())     # Maximum value of the 'Age' column
print(age.std())     # Standard deviation of the 'Age' column
print(age.var())     # Variance of the 'Age' column
print(age.quantile(0.25))  # 25th percentile
print(age.quantile(0.5))   # 50th percentile (median)
print(age.quantile(0.75))  # 75th percentile

In [None]:
# Mathematical operations: sum, cumsum, abs, round
# These need numeric data. On the full DataFrame, sum()/cumsum() would
# concatenate the text columns and abs() raises a TypeError on strings,
# so only the numeric columns are kept.
numericDf = sampleDf.select_dtypes(include='number')

print(numericDf.sum())      # Column totals
print(numericDf.cumsum())   # Running total down each column
print(numericDf.abs())      # Absolute value of every entry
print(numericDf.round(2))   # Round to 2 decimal places

# Display Data

In [None]:
# head(): the first n rows of the DataFrame (n defaults to 5)
first_rows = sampleDf.head()
print(first_rows)

# tail(): the last n rows of the DataFrame (n defaults to 5)
last_rows = sampleDf.tail()
print(last_rows)

In [None]:
# .loc selects rows and columns by LABEL (or by a boolean array)
row_with_label_0 = sampleDf.loc[0]        # the row whose index label is 0
name_column = sampleDf.loc[:, 'Name']     # every row of the 'Name' column
print(row_with_label_0)
print(name_column)

# .iloc selects rows and columns by integer POSITION
row_at_position_0 = sampleDf.iloc[0]      # the first row
column_at_position_0 = sampleDf.iloc[:, 0]  # the first column
print(row_at_position_0)
print(column_at_position_0)

In [None]:
# .loc vs .iloc on a Series whose index labels are not in order
# The dict keys become the index labels, in insertion order: 1, 3, 2, 0, 4
custom_series = pd.Series({1: 10, 3: 20, 2: 30, 0: 40, 4: 50})
print(custom_series)

# .loc looks values up by index LABEL
for label in (0, 1):
    print(custom_series.loc[label])

# .iloc looks values up by integer POSITION
for position in (0, 1):
    print(custom_series.iloc[position])

# Because the labels here are unordered, the two accessors disagree:
#   .loc[0] finds the value stored under label 0,
#   .iloc[0] finds the first value in the Series, whatever its label is.

## Working w/ data in Pandas

## NOTE: Cleaning column values (the Price column)
- change the Price column of the car_sales DataFrame like this: 

car_sales["Price"] = car_sales["Price"].str.replace('[\$\,\.]', '')

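- However, this will not work as intended if you’re using a newer version of pandas: since pandas 2.0, `str.replace` treats the pattern as literal text by default (`regex=False`), so nothing is replaced and a later conversion to integers fails. Not to worry, the fix is quick! By adding `regex=True` (this tells pandas that the pattern is a “regular expression”, or regex for short) the code will work: 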

    car_sales["Price"] = car_sales["Price"].str.replace('[\$\,\.]', '', regex=True)

    ## In summary:

    ### Previous code (no longer works on newer pandas)
    car_sales["Price"] = car_sales["Price"].str.replace('[\$\,\.]', '')

    ### New code 
    ### Change Price column to integers
    car_sales["Price"] = car_sales["Price"].str.replace('[\$\,\.]', '', regex=True)

----------------

# Working w/ data in Pandas


In [None]:
# Cross tabulation
# A cross tab builds a contingency table that counts how often each combination
# of two (or more) categorical variables occurs, which helps reveal patterns or
# associations between them.

# Add a categorical column so there is something to cross-tabulate
sampleDf['Gender'] = ['F', 'M', 'M', 'M', 'F']
cross_tab = pd.crosstab(index=sampleDf['City'], columns=sampleDf['Gender'])
print(cross_tab)

# Groupby: split the rows into groups by one or more columns, then aggregate each group
# Here: group by 'City' and take the mean 'Age' within each city
age_by_city = sampleDf.groupby('City')['Age']
grouped = age_by_city.mean()
print(grouped)

### Plot Data

In [None]:
# .plot(): creates plots and visualizations directly from a DataFrame or Series
# Example: bar chart of the average age by city

import matplotlib_inline # Inline backend package for Jupyter; not referenced by name anywhere else in this cell

# %matplotlib inline is an IPython magic command (not plain Python), so this
# cell only runs inside a notebook. It makes matplotlib render figures directly
# in the notebook output instead of in a separate window.
%matplotlib inline 

grouped.plot(kind='bar', title='Average Age by City') # Bar chart of `grouped`, a Series defined outside this cell

### Histogram Plot

In [None]:
# .hist(): draws a histogram, a graphical representation of how the values are distributed
# Example: histogram of the ages in the sample DataFrame
# Series.hist() has no `title` parameter; unknown keywords are forwarded to
# matplotlib, which rejects `title`. Set the title on the returned Axes instead.
age_axes = sampleDf['Age'].hist()
age_axes.set_title('Age Distribution')

### Handling Strings

In [None]:
# String handling goes through the .str accessor, which applies a string
# method to every element of a column:
#   .str.lower() -> every character converted to lowercase
#   .str.upper() -> every character converted to uppercase
city_names = sampleDf['City'].str
print(city_names.lower())  # city names in lowercase
print(city_names.upper())  # city names in uppercase

### Working w/ Missing Data

In [None]:
# NaN ("not a number") represents a missing or null value in a DataFrame cell.
# Example of handling NaN values in a DataFrame

# 'Age' starts out as an integer column, and integer columns cannot store NaN.
# Cast it to float up front instead of relying on pandas silently upcasting the
# column on assignment (deprecated since pandas 2.1).
sampleDf['Age'] = sampleDf['Age'].astype('float64')

sampleDf.loc[2, 'Age'] = float('nan')  # Introduce a missing value in the 'Age' column
print(sampleDf)

# fillna(): fills NaN values with a given value (here: the mean age)
mean_age = sampleDf['Age'].mean()  # mean() skips NaN values by default
sampleDf['Age'] = sampleDf['Age'].fillna(mean_age)  # Assign the filled column back
print(sampleDf)

# fillna(inplace=True): modifies the DataFrame directly instead of returning a new one.
# The call must be made on the DataFrame itself with a {column: value} mapping.
# sampleDf['Age'].fillna(..., inplace=True) would operate on a temporary Series
# and, with Copy-on-Write, leave sampleDf unchanged.
sampleDf.loc[2, 'Age'] = float('nan')  # Introduce the missing value again
print(sampleDf)
sampleDf.fillna({'Age': mean_age}, inplace=True)  # Fill NaN in 'Age' with the mean age, in place
print(sampleDf)

# dropna(): removes rows (or columns) that contain NaN values
# Example of dropping rows with NaN values in the 'Age' column
sampleDf.loc[3, 'Age'] = float('nan')  # Introduce another missing value in the 'Age' column
print(sampleDf)
sampleDf_dropped = sampleDf.dropna(subset=['Age'])  # Drop rows where 'Age' is NaN

# The result keeps only the rows whose 'Age' is not null, so the row with index 3 is gone.
# dropna() does not renumber the remaining rows: the original index labels are preserved.
# To renumber them, call reset_index(drop=True); drop=True avoids adding the
# old index as a new column.
print(sampleDf_dropped)

# Creating Data in Pandas

In [None]:
# New column from a Series
new_column = pd.Series('NY CA IL TX AZ'.split())
sampleDf['State'] = new_column
print(sampleDf)

# New column from a plain Python list (one entry per row)
new_column_list = [
    '14040',
    '19654',
    '22546',
    '97134',
    '02110',
]
sampleDf['ZipCode'] = new_column_list
print(sampleDf)

In [None]:
# New column built from existing columns: "<City>, <State>"
sampleDf['Location'] = sampleDf['City'].add(', ').add(sampleDf['State'])
print(sampleDf)

In [None]:
# Conditional column with np.where(condition, value_if_true, value_if_false)
import numpy as np
is_under_30 = sampleDf['Age'] < 30
# Rows where the comparison is False get 'Old'
sampleDf['AgeGroup'] = np.where(is_under_30, 'Young', 'Old')
print(sampleDf)

# Computed column derived from an existing one:
#   how far each age is from the average age
average_age = sampleDf['Age'].mean()
sampleDf['DistanceFromAverageAge'] = sampleDf['Age'].sub(average_age)
print(sampleDf)

# drop(): removes the given row or column labels and returns a new DataFrame
# Here the 'AgeGroup' column is removed; sampleDf itself is left unchanged
sampleDf_dropped_column = sampleDf.drop('AgeGroup', axis=1)
print(sampleDf_dropped_column)

# ML Operations in Pandas 
- Methods to manipulate dataframes for ML/AI operations
    - .sample()

In [None]:
# sample(): draws a random sample of rows from a DataFrame
# A fixed random_state makes the draw reproducible from run to run
seed = 42

# Sampling by count: pick exactly 3 rows
random_sample = sampleDf.sample(n=3, random_state=seed)
print(random_sample)

# Sampling by fraction: pick a proportion of the rows (here 50% of the DataFrame)
proportional_sample = sampleDf.sample(frac=0.5, random_state=seed)
print(proportional_sample)

In [None]:
# Resetting the index: renumber the rows 0..n-1, typically after rows were
# dropped and the old labels are no longer meaningful
# Example: drop the rows with NaN in 'Age', then reset the index
sampleDf.loc[3, 'Age'] = pd.NA  # Introduce a NaN value in the 'Age' column again
print(sampleDf)

# Drop rows where 'Age' is NaN; the surviving rows keep their old labels
sampleDf_dropped = sampleDf.dropna(subset=['Age'])
print(sampleDf_dropped)

# drop=True discards the old index instead of keeping it as a new column
sampleDf_dropped = sampleDf_dropped.reset_index(drop=True)
print(sampleDf_dropped)

In [None]:
# .apply(): runs a function on a DataFrame along an axis, or on each element of a column
# Example: length of each city name, using the built-in len
city_column = sampleDf['City']
sampleDf['CityNameLength'] = city_column.apply(len)
print(sampleDf)

# .apply() with a lambda:
#   the same calculation, with the function written inline as a lambda
sampleDf['CityNameLengthLambda'] = sampleDf['City'].apply(
    lambda x: len(x)
)
print(sampleDf)

## Lambda functions — quick reference

- **Syntax:** `lambda args: expression` — a small anonymous function that returns the expression.
- **Single-expression only:** Lambdas cannot contain statements (no `if`/`for` blocks, use a `def` for complex logic).

Examples:
```python
# simple lambda
add = lambda x, y: x + y
print(add(2, 3))  # 5

# inline use with sorted/map/filter
pairs = [(1, 'b'), (2, 'a')]
sorted_by_second = sorted(pairs, key=lambda p: p[1])

nums = [1,2,3,4]
doubled = list(map(lambda x: x*2, nums))
evens = list(filter(lambda x: x%2==0, nums))