# Basic Pandas Functions with Examples

## 1. Creating a DataFrame

In [None]:
import pandas as pd

# Build a small sample DataFrame by hand. Hand-made data is usually supplied
# either as a dictionary (column name -> list of values, as done here) or as a
# list of lists (one inner list per row).
sample_data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[23, 35, 45, 22, 30],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    Salary=[70000, 80000, 120000, 90000, 85000],
)

# Each dictionary key becomes a column; rows get the default 0..n-1 index.
df = pd.DataFrame(data=sample_data)

# Show the plain-text rendering of the table.
print(df)


In [None]:
# Display the DataFrame. A bare expression on the last line of a notebook cell
# is rendered by the notebook itself, so no print() is needed here
# (assumes this file is run as notebook cells, as the "In [ ]" markers suggest).
df

## 2. Data Inspection

In [None]:
# Inspect the DataFrame from several angles. Every report has the same layout:
# a heading, the value itself, then a separator line.
_RULE = "############################################################"


def _report(title, value):
    """Print a heading, the value beneath it, and a closing separator line."""
    print(title)
    print(value)
    print(_RULE)


# First rows (head() defaults to 5 rows).
_report("Displaying the first few rows of the DataFrame:", df.head())

# Summary statistics for the columns describe() covers.
_report("Summary statistics of the DataFrame:", df.describe())

# info() writes its report straight to stdout, so it is called on its own
# instead of being passed to print().
print("Information about the DataFrame (including data types and non-null counts):")
df.info()
print(_RULE)

# (rows, columns) tuple.
_report("The shape of the DataFrame (rows, columns):", df.shape)

# Column labels.
_report("Column names of the DataFrame:", df.columns)

# Per-column data types.
_report("Data types of each column in the DataFrame:", df.dtypes)



## 3. Data Selection and Filtering

In [None]:
# Perform every selection first, then print them with a common layout.

# One column -> Series.
column = df['City']

# A list of column names -> DataFrame containing just those columns.
multiple_columns = df[['Name', 'Age']]

# Position-based row access: position 0 is the first row (Alice).
row = df.iloc[0]

# Label-based access: row label 1 (Bob), restricted to three columns.
label_row = df.loc[1, ['Name', 'City', 'Salary']]

# Single scalar by integer position: row 0, column 0.
value = df.iat[0, 0]

for _heading, _selection in [
    ("Selecting the 'City' column:", column),
    ("Selecting multiple columns: 'Name' and 'Age':", multiple_columns),
    ("Selecting a row by index (index 0 corresponds to Alice):", row),
    ("Selecting specific columns for row with label/index 1 (which is Bob):", label_row),
    ("Accessing a single value by row and column position (Row 0, Column 0):", value),
]:
    print(_heading)
    print(_selection)
    print("############################################################")



## 4. Data Cleaning

In [None]:
import pandas as pd
import numpy as np

# Sample data for the cleaning examples: 'Age' and 'Salary' contain missing
# values (NaN), 'Los Angeles' is the value replaced later on, and the last row
# repeats the 'Bob' row so there is a duplicate to remove.
sample_data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Bob'],
    Age=[23, np.nan, 45, 22, 30, np.nan],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Los Angeles'],
    Salary=[70000, 80000, 120000, np.nan, 85000, 80000],
)

df = pd.DataFrame(data=sample_data)

# Show the raw table, NaN entries included.
print(df)

In [None]:
# Each step below returns a NEW DataFrame; df itself is never modified, so
# every example starts from the same raw data.

# Remove missing values: drop every row that has at least one NaN.
print("Removing rows with missing values (NaN):")
df_cleaned = df.dropna()
print(df_cleaned)
print("############################################################")

# Fill missing values: every NaN, in any column, becomes 0.
print("Filling missing values (NaN) with 0:")
df_filled = df.fillna(0)
print(df_filled)
print("############################################################")

# Replace specific values. The heading is built from the same variables that
# are passed to replace(), so the printed description always matches the
# substitution that is actually performed.
old_value, new_value = 'Los Angeles', 'Los Angeles-2'
print(f"Replacing occurrences of '{old_value}' with '{new_value}':")
df_replaced = df.replace(old_value, new_value)
print(df_replaced)
print("############################################################")

# Remove duplicate rows: by default a row is a duplicate when all of its
# columns match an earlier row, and the first occurrence is kept.
print("Removing duplicate rows from the DataFrame:")
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)
print("############################################################")


## 5. Data Aggregation

In [None]:
import pandas as pd
import numpy as np

# Sample data for the aggregation and grouping examples. 'Department' is the
# grouping key, and 'Bonus' contains one NaN so count() differs from the row
# count.
sample_data = dict(
    Department=['HR', 'Finance', 'IT', 'HR', 'IT', 'Finance', 'HR'],
    Employees=[5, 10, 25, 8, 15, 12, 9],
    Salary=[50000, 60000, 120000, 55000, 110000, 65000, 52000],
    Bonus=[5000, 6000, 12000, 5500, 11000, np.nan, 5200],
)

df = pd.DataFrame(data=sample_data)

# Show the table that the aggregation examples operate on.
print(df)

In [None]:
# Bare expression: shown as a table when it is the last line of a notebook cell.
df

In [None]:
# Compute all aggregates up front, then report them one by one.
_salaries = df['Salary']

# Column total.
total_sum = _salaries.sum()

# Column average.
mean_value = _salaries.mean()

# Per-department average of every numeric column.
grouped_mean = df.groupby('Department').mean(numeric_only=True)

# count() ignores NaN, so this is the number of non-null bonuses.
count_values = df['Bonus'].count()

_RULE = "############################################################"

print("Summing the values in the 'Salary' column:")
print(f"Total sum of Salary: {total_sum}")
print(_RULE)

print("Calculating the mean (average) of the 'Salary' column:")
print(f"Mean Salary: {mean_value}")
print(_RULE)

print("Grouping by the 'Department' column and computing the mean of each group:")
print(grouped_mean)
print(_RULE)

print("Counting non-null values in the 'Bonus' column:")
print(f"Count of non-null Bonus values: {count_values}")
print(_RULE)


## 6. Data Manipulation

In [None]:
import pandas as pd

# Main sample table used by the sorting, apply, pivot and transpose examples.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[23, 35, 45, 22, 30],
    Salary=[70000, 80000, 120000, 90000, 85000],
    Department=['HR', 'Finance', 'IT', 'HR', 'Finance'],
)
df = pd.DataFrame(data=data)

# Second table with the same columns, used as the extra rows in the
# concatenation example.
data_new = dict(
    Name=['Frank', 'Grace'],
    Age=[40, 29],
    Salary=[95000, 67000],
    Department=['HR', 'Finance'],
)
df_new = pd.DataFrame(data=data_new)

In [None]:
# Bare expression: shown as a table when it is the last line of a notebook cell.
df

In [None]:
# Bare expression: shows the second DataFrame when it is the last line of a notebook cell.
df_new

In [None]:
# Sort by column values. sort_values() returns a new DataFrame; df keeps its
# original row order.
print("Sorting the DataFrame by the 'Department' column:")
df_sorted = df.sort_values(by='Department')
print(df_sorted)
print("############################################################")

# Apply a custom function to each element. The doubled salaries are stored in
# a copy so that df keeps its original values; otherwise the concatenation
# below would mix doubled salaries (df) with undoubled ones (df_new), and the
# pivot table would sum the doubled figures.
print("Doubling the values in the 'Salary' column using a custom function:")
df_doubled = df.copy()
df_doubled['Salary'] = df_doubled['Salary'].apply(lambda x: x * 2)
print(df_doubled)
print("############################################################")

# Concatenate two DataFrames. ignore_index=True renumbers the rows 0..n-1;
# without it the result would keep each frame's own labels, so 0 and 1 would
# appear twice.
print("Concatenating two DataFrames (df and df_new):")
df_combined = pd.concat([df, df_new], ignore_index=True)
print(df_combined)
print("############################################################")

# Create a pivot table: one row per department, holding the summed salary.
print("Creating a pivot table with 'Department' as the index and summing 'Salary':")
pivot_table = df.pivot_table(index='Department', values='Salary', aggfunc='sum')
print(pivot_table)
print("############################################################")

# Transpose the DataFrame: columns become rows and row labels become columns.
print("Transposing the DataFrame (switching rows and columns):")
df_transposed = df.T
print(df_transposed)
print("############################################################")


## 7. Indexing and Renaming

In [None]:
import pandas as pd

# Sample DataFrame for the indexing and renaming examples.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[23, 35, 45, 22, 30],
    Salary=[70000, 80000, 120000, 90000, 85000],
    Department=['HR', 'Finance', 'IT', 'HR', 'Finance'],
)

df = pd.DataFrame(data=data)

In [None]:
# Bare expression: shown as a table when it is the last line of a notebook cell.
df

In [None]:
# Build the three derived frames first; none of these calls modifies df.

# 'Name' becomes the row index instead of a regular column.
df_indexed = df.set_index('Name')

# Undo the step above: the index goes back to being the 'Name' column.
df_reset = df_indexed.reset_index()

# Rename a single column through an old-name -> new-name mapping.
df_renamed = df.rename(columns=dict(Department='Department_name'))

for _heading, _frame in [
    ("Setting the 'Name' column as the index of the DataFrame:", df_indexed),
    ("Resetting the index to return the 'Name' column back as a normal column:", df_reset),
    ("Renaming the 'Department' column to 'Department_name':", df_renamed),
]:
    print(_heading)
    print(_frame)
    print("############################################################")


## 8. String Operations

In [None]:
import pandas as pd

# Sample DataFrame for the string-operation examples; 'Name' and 'Department'
# are the text columns they work on.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[23, 35, 45, 22, 30],
    Salary=[70000, 80000, 120000, 90000, 85000],
    Department=['HR', 'Finance', 'IT', 'HR', 'Finance'],
)

df = pd.DataFrame(data=data)

In [None]:
# All examples go through the .str accessor, which applies a string method to
# every element of a column.
_RULE = "############################################################"


def _show(title, value):
    """Print a heading, the value beneath it, and a closing separator line."""
    print(title)
    print(value)
    print(_RULE)


# Lower-case copy of 'Name', stored next to the original column.
df['name_lower'] = df['Name'].str.lower()
_show(
    "Converting the 'Name' column to lowercase and storing it in a new column 'name_lower':",
    df[['Name', 'name_lower']],
)

# Boolean Series: True where the department text contains 'HR'.
contains_pattern = df['Department'].str.contains('HR')
_show("Checking if the 'Department' column contains the string 'HR':", contains_pattern)

# Substring replacement, stored in a new column so 'Department' stays intact.
df['Department_2'] = df['Department'].str.replace('HR', 'Human Resources')
_show(
    "Replacing 'HR' with 'Human Resources' in the 'Department' column and storing the result in a new column 'Department_2':",
    df[['Department', 'Department_2']],
)

# strip() removes leading/trailing whitespace. If the names carry no padding
# (as in the sample data defined earlier in this section), the new column
# simply equals 'Name'.
df['Name_stripped'] = df['Name'].str.strip()
_show(
    "Removing leading and trailing spaces from the 'Name' column and storing it in a new column 'Name_stripped':",
    df[['Name', 'Name_stripped']],
)


## 9. Mathematical Operations

In [None]:
import pandas as pd

# Sample DataFrame for the arithmetic examples: one row per product, with its
# sales figures for two quarters.
data = dict(
    Product=['A', 'B', 'C', 'D', 'E'],
    Sales_Q1=[150, 200, 300, 100, 250],
    Sales_Q2=[180, 220, 320, 110, 270],
)

df = pd.DataFrame(data=data)

In [None]:
# Bare expression: shown as a table when it is the last line of a notebook cell.
df

In [None]:
# Each step adds one derived column to df and then prints the whole table, so
# the output grows by one column per step.
_RULE = "############################################################"

# Row-by-row sum of the two quarterly columns.
df['Total_Sales'] = df['Sales_Q1'].add(df['Sales_Q2'])
print("Performing element-wise addition of 'Sales_Q1' and 'Sales_Q2' to create 'Total_Sales':")
print(df)
print(_RULE)

# Running total: each row holds the sum of 'Total_Sales' up to and including it.
_totals = df['Total_Sales']
df['Cumulative_Sales'] = _totals.cumsum()
print("Calculating the cumulative sum of 'Total_Sales' (running total) and storing it in 'Cumulative_Sales':")
print(df)
print(_RULE)

# Running maximum: each row holds the largest 'Total_Sales' seen so far.
df['Cumulative_Max_Sales'] = _totals.cummax()
print("Calculating the cumulative maximum of 'Total_Sales' (the maximum value encountered so far) and storing it in 'Cumulative_Max_Sales':")
print(df)
print(_RULE)


## 10. Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample DataFrame for the plotting examples: monthly sales and profit figures.
data = dict(
    Month=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
    Sales=[200, 250, 300, 350, 400, 450],
    Profit=[20, 30, 50, 60, 70, 80],
)

df = pd.DataFrame(data=data)

In [None]:
# Bare expression: shown as a table when it is the last line of a notebook cell.
df

In [None]:
import matplotlib.pyplot as plt

# Basic line plot (plot 'Sales' over the months). The 'Month' column is used
# for the x-axis so the plot matches its title; plotting df['Sales'] alone
# would draw the values against the integer row index instead.
print("Creating a basic line plot for 'Sales' over the months:")
df.plot(x='Month', y='Sales', legend=False)
plt.title('Monthly Sales')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.show()
print("############################################################")

# Scatter plot (Sales vs Profit): one point per row of the DataFrame.
print("Creating a scatter plot to visualize the relationship between 'Sales' and 'Profit':")
df.plot(kind='scatter', x='Sales', y='Profit')
plt.title('Sales vs Profit')
plt.xlabel('Sales')
plt.ylabel('Profit')
plt.show()
print("############################################################")

# Histogram of 'Sales': how many values fall into each bin.
print("Creating a histogram to show the distribution of 'Sales':")
df['Sales'].plot(kind='hist')
plt.title('Distribution of Sales')
plt.xlabel('Sales')
plt.show()
print("############################################################")
