In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# --- 1. Prepare Sample Data ---
# Column-oriented sample data: every keyword below becomes one DataFrame
# column, and all six lists line up row by row.
# 'RawText' deliberately mixes casing and leading/trailing spaces.
data = dict(
    ID=[1, 2, 3, 4, 5, 6],
    RawText=[
        '  Data Science is Fun! ',
        'pandas makes data easy',
        ' Regular Expressions are POWERFUL',
        'missing data? handle it.',
        ' Learn Pandas & NumPy ',
        'Categorical Data is Efficient ',
    ],
    Codes=['A-101', 'B-203', 'A-105', 'C-300', 'B-203', 'A-101'],
    Grade=['Good', 'Excellent', 'Fair', 'Good', 'Excellent', 'Good'],
    Values=[10.5, 20.1, 15.6, 8.0, 20.1, 12.3],
)
df = pd.DataFrame(data)

# Show the frame and the dtypes pandas inferred for each column.
print("--- Sample DataFrame ---")
print(df)
print("\nInitial dtypes:\n", df.dtypes)
print("-" * 30)


# --- 2. Working with Text Data (.str accessor) ---
# The .str accessor exposes vectorized string methods on Series that hold
# text (object/string dtype). Each call returns a new object; df is not modified.

print("--- String Operations (.str) ---")
text_series = df['RawText']
print("Original Text Series:\n", text_series)

# a) Case conversion
lowercased_text = text_series.str.lower()  # reused for the substring check in d)
print("\nLowercase (.str.lower()):\n", lowercased_text)
print("\nUppercase (.str.upper()):\n", text_series.str.upper())
print("\nTitle case (.str.title()):\n", text_series.str.title())

# b) Stripping whitespace
stripped_text = text_series.str.strip()  # reused by the split and startswith examples
print("\nStripped whitespace (.str.strip()):\n", stripped_text)
print("\nStripped left whitespace (.str.lstrip()):\n", text_series.str.lstrip())
print("\nStripped right whitespace (.str.rstrip()):\n", text_series.str.rstrip())

# c) Splitting strings (.str.split())
# Without expand, each element becomes a list of the pieces.
split_by_space = stripped_text.str.split(' ')
print("\nSplit by space (.str.split(' ')):\n", split_by_space)

# With expand=True the pieces are spread across DataFrame columns, which are
# then labelled in one step.
codes = df['Codes']
code_split = codes.str.split('-', expand=True).set_axis(
    ['CodePrefix', 'CodeNumber'], axis=1
)
print("\nSplitting 'Codes' into new columns:\n", code_split)
# Can join this back to the original DataFrame if needed: df = df.join(code_split)

# d) Checking for substrings (.str.contains())
# Matching against the lowercased text makes the check case-insensitive.
contains_pandas = lowercased_text.str.contains('pandas')
print("\nContains 'pandas' (case-insensitive) (.str.contains()):\n", contains_pandas)
print("\nRows containing 'pandas':\n", df.loc[contains_pandas])

# Prefix check runs on the stripped text so leading spaces do not hide a match.
starts_with_learn = stripped_text.str.startswith('Learn')
print("\nStarts with 'Learn' (.str.startswith()):\n", starts_with_learn)

# e) Replacing substrings (.str.replace())
replaced_pandas = text_series.str.replace('Pandas', 'Awesome Pandas')
print("\nReplaced 'Pandas' (.str.replace()):\n", replaced_pandas)

# Regex replacement: every run of digits in 'Codes' becomes 'XXX'.
replaced_digits = codes.str.replace(pat=r'\d+', repl='XXX', regex=True)
print("\nReplaced digits in 'Codes' using regex:\n", replaced_digits)

# f) Extracting patterns (.str.extract()) - needs a regex with a capturing group
# The parenthesised group captures the numeric part of each code.
extracted_numbers = codes.str.extract(r'(\d+)').set_axis(['ExtractedNumber'], axis=1)
print("\nExtracted numbers from 'Codes' using regex (.str.extract()):\n", extracted_numbers)

# g) String length (.str.len())
print("\nLength of strings in 'RawText' (.str.len()):\n", text_series.str.len())

# h) Concatenation (.str.cat())
grades = df['Grade']
concatenated_str = grades.str.cat(codes, sep=' - Code: ')
print("\nConcatenated 'Grade' and 'Codes':\n", concatenated_str)

# Many more methods available: .get(), .slice(), .findall(), .count(), etc.
print("-" * 30)


# --- 3. Categorical Data ---
# Useful for columns with a limited, fixed number of unique values.
# Saves memory and can speed up operations like groupbys.

print("--- Categorical Data ---")
print("Original 'Grade' Series:\n", df['Grade'])
print(f"Memory usage of 'Grade' (object): {df['Grade'].memory_usage(deep=True)} bytes")

# a) Converting to 'category' dtype
df['Grade_Cat'] = df['Grade'].astype('category')
print("\nConverted 'Grade' to category dtype:\n", df['Grade_Cat'])
print("\nNew dtypes:\n", df.dtypes)
# Typically lower for long columns with few distinct values; on a tiny sample
# like this one the fixed overhead of the category table can outweigh the savings.
print(f"Memory usage of 'Grade_Cat' (category): {df['Grade_Cat'].memory_usage(deep=True)} bytes")

# b) Categorical properties (.cat accessor)
print("\nCategories (.cat.categories):\n", df['Grade_Cat'].cat.categories) # Unique values
print("\nCodes (.cat.codes):\n", df['Grade_Cat'].cat.codes) # Integer representation of categories
print("\nIs ordered? (.cat.ordered):\n", df['Grade_Cat'].cat.ordered) # Default is False

# c) Setting categories and order
# Categories are listed from lowest to highest so that ordered comparisons
# follow the intended ranking rather than alphabetical order.
grade_order = ['Fair', 'Good', 'Excellent']
df['Grade_Ordered'] = df['Grade'].astype('category')
# Reorder categories and set ordered=True
df['Grade_Ordered'] = df['Grade_Ordered'].cat.set_categories(grade_order, ordered=True)

print("\nOrdered Categorical 'Grade_Ordered':\n", df['Grade_Ordered'])
print("New Categories:", df['Grade_Ordered'].cat.categories)
print("Is ordered?", df['Grade_Ordered'].cat.ordered)

# Now comparisons work based on order
print("\nComparison (Grade_Ordered > 'Fair'):\n", df['Grade_Ordered'] > 'Fair')

# d) Adding/Removing categories (.cat.add_categories, .cat.remove_categories)
# 'Poor' becomes a valid category even though no row currently uses it.
df['Grade_Cat'] = df['Grade_Cat'].cat.add_categories(['Poor'])
print("\nAdded category 'Poor':\n", df['Grade_Cat'].cat.categories)

# Optional demo, left disabled so the later steps still see 'Fair':
# df['Grade_Cat'] = df['Grade_Cat'].cat.remove_categories(['Fair'])
# print("\nRemoved category 'Fair':\n", df['Grade_Cat'].cat.categories)

# e) Renaming categories (.cat.rename_categories)
# Only the labels listed in the mapping change; the others are kept as-is.
df['Grade_Cat'] = df['Grade_Cat'].cat.rename_categories({'Good': 'Satisfactory', 'Fair': 'Average'})
print("\nRenamed categories:\n", df['Grade_Cat'])
print("New categories:", df['Grade_Cat'].cat.categories)

# f) Groupby operations often faster with categoricals
# Pass `observed` explicitly: grouping on a categorical key without it emits a
# FutureWarning on recent pandas because the default is changing.
# observed=False keeps categories that have no rows (such as the added 'Poor')
# in the result, which matches the behaviour this script had before.
grade_means = df.groupby('Grade_Cat', observed=False)['Values'].mean(numeric_only=True)
print("\nGroupby mean 'Values' by 'Grade_Cat':\n", grade_means)

print("-" * 30)



--- Sample DataFrame ---
   ID                            RawText  Codes      Grade  Values
0   1              Data Science is Fun!   A-101       Good    10.5
1   2             pandas makes data easy  B-203  Excellent    20.1
2   3   Regular Expressions are POWERFUL  A-105       Fair    15.6
3   4           missing data? handle it.  C-300       Good     8.0
4   5              Learn Pandas & NumPy   B-203  Excellent    20.1
5   6     Categorical Data is Efficient   A-101       Good    12.3

Initial dtypes:
 ID           int64
RawText     object
Codes       object
Grade       object
Values     float64
dtype: object
------------------------------
--- String Operations (.str) ---
Original Text Series:
 0                Data Science is Fun! 
1               pandas makes data easy
2     Regular Expressions are POWERFUL
3             missing data? handle it.
4                Learn Pandas & NumPy 
5       Categorical Data is Efficient 
Name: RawText, dtype: object

Lowercase (.str.lower()):
 0

  print("\nGroupby mean 'Values' by 'Grade_Cat':\n", df.groupby('Grade_Cat')['Values'].mean(numeric_only=True))
