 # Splitting DateTime into Components

In [307]:
# Sample data with datetime
df = pd.DataFrame({
    "datetime_col": pd.to_datetime(["2025-01-01 10:00:00", "2025-01-02 12:00:00"])
})

df

Unnamed: 0,datetime_col
0,2025-01-01 10:00:00
1,2025-01-02 12:00:00


In [309]:
# Extract components
df["year"] = df["datetime_col"].dt.year
df["month"] = df["datetime_col"].dt.month
df["day"] = df["datetime_col"].dt.day
df["hour"] = df["datetime_col"].dt.hour
df["minute"] = df["datetime_col"].dt.minute

df

Unnamed: 0,datetime_col,year,month,day,hour,minute
0,2025-01-01 10:00:00,2025,1,1,10,0
1,2025-01-02 12:00:00,2025,1,2,12,0


# Handling Inconsistent Time Zones

In [3]:
import pandas as pd
# data 
df = pd.DataFrame({
    "datetime_col": ["2025-01-01 10:00:00+05:30", "2025-01-02 12:00:00+05:30"]
})
df


Unnamed: 0,datetime_col
0,2025-01-01 10:00:00+05:30
1,2025-01-02 12:00:00+05:30


In [5]:
df["datetime_col"] = pd.to_datetime(df["datetime_col"]).dt.tz_convert('UTC')

print(df)

               datetime_col
0 2025-01-01 04:30:00+00:00
1 2025-01-02 06:30:00+00:00


# Handling Multi-Index DataFrames

In [19]:
# Sample multi-index DataFrame
df = pd.DataFrame({
    "col1": [1, 2, 3],
    "col2": [4, 5, 6]
})

df


Unnamed: 0,col1,col2
0,1,4
1,2,5
2,3,6


In [21]:
df.set_index(["col1", "col2"], inplace=True)
df

col1,col2
1,4
2,5
3,6


In [25]:
# Resetting the index
df.reset_index(inplace=True)
df

Unnamed: 0,index,col1,col2
0,0,1,4
1,1,2,5
2,2,3,6


# Splitting Columns (e.g., Names)

In [125]:
# Sample data
df = pd.DataFrame({
    "full_name": ["yettina shravani", "yettina sony","sravanthi yettina"]
})
df

Unnamed: 0,full_name
0,yettina shravani
1,yettina sony
2,sravanthi yettina


In [127]:
df[["first_name", "last_name"]] = df["full_name"].str.split(" ", expand=True)

df


Unnamed: 0,full_name,first_name,last_name
0,yettina shravani,yettina,shravani
1,yettina sony,yettina,sony
2,sravanthi yettina,sravanthi,yettina


 # Merging DataFrames


In [135]:
# Sample data
df1 = pd.DataFrame({"id": [1, 2, 3], "name": ["shravani", "shravani", "Alice"]})
df2 = pd.DataFrame({"id": [1, 2, 3], "age": [25, 30, 35]})

df1


Unnamed: 0,id,name
0,1,John
1,2,Jane
2,3,Alice


In [137]:
df2

Unnamed: 0,id,age
0,1,25
1,2,30
2,3,35


In [141]:
# Merging the dataframes on 'id'
df_merged = pd.merge(df1, df2, on="id", how="left")
df_merged


Unnamed: 0,id,name,age
0,1,John,25
1,2,Jane,30
2,3,Alice,35


 # Handling Missing Categorical Data (Mode Imputation)

In [145]:
# Sample data with missing values
df = pd.DataFrame({
    "col": ["A", "B", None, "A"]
})
df

Unnamed: 0,col
0,A
1,B
2,
3,A


In [149]:
# Fill missing values with the mode
df["col"].fillna(df["col"].mode()[0], inplace=True)
df

Unnamed: 0,col
0,A
1,B
2,A
3,A


#  Replacing Negative Values with NaN

In [180]:
# Sample data with negative values
df = pd.DataFrame({
    "col": [10, -5, 3, -2]
})

df


Unnamed: 0,col
0,10
1,-5
2,3
3,-2


In [182]:
import numpy as np

In [184]:
# Replace negative values with NaN
df["col"] = df["col"].apply(lambda x:"nan" if x < 0 else x)
df


Unnamed: 0,col
0,10.0
1,
2,3.0
3,


# Standardizing Text Columns (Title Case, Remove Special Characters)


In [192]:
# Sample data
df = pd.DataFrame({
    "col": ["hello world!", "python is great", "cleaning data..."]
})

df


Unnamed: 0,col
0,hello world!
1,python is great
2,cleaning data...


In [194]:
# Convert to title case and remove special characters
df["col"] = df["col"].str.title().str.replace(r"[^\w\s]", "")
df

Unnamed: 0,col
0,Hello World!
1,Python Is Great
2,Cleaning Data...


#  Dropping Columns with Many Missing Values

In [219]:
import pandas as pd

# Sample data with missing values
df = pd.DataFrame({
    "col1": [1, 2, None, 4],
    "col2": [None, None, None, None],
    "col3": [5, 6, 7, 8]
})

df

Unnamed: 0,col1,col2,col3
0,1.0,,5
1,2.0,,6
2,,,7
3,4.0,,8


In [221]:
# Drop columns with more than 50% missing values
threshold = 0.5
df = df.loc[:, df.isnull().mean() < threshold]  # Apply condition on columns
df


Unnamed: 0,col1,col3
0,1.0,5
1,2.0,6
2,,7
3,4.0,8


#  Removing Leading Zeros from Numeric Columns

In [228]:
# Sample data with leading zeros
df = pd.DataFrame({
    "col": ["001", "002", "003"]
})


df

Unnamed: 0,col
0,1
1,2
2,3


In [230]:
# Remove leading zeros
df["col"] = df["col"].str.lstrip("0")
df


Unnamed: 0,col
0,1
1,2
2,3


# Handling Custom Categorical Groups

In [233]:
# Sample data with categories
df = pd.DataFrame({
    "size": ["S", "M", "L", "M"]
})

df


Unnamed: 0,size
0,S
1,M
2,L
3,M


In [237]:
# Group sizes into new categories
df["size_group"] = df["size"].replace({"S": "small", "M": "medium", "L": "large"})
df


Unnamed: 0,size,size_group
0,S,small
1,M,medium
2,L,large
3,M,medium


# Fixing Data Type Based on Values

In [244]:
# Sample data
df = pd.DataFrame({
    "col": [-1, 2, -3, 4]
})

df

Unnamed: 0,col
0,-1
1,2
2,-3
3,4


In [246]:
# Fix values by applying condition
df["col"] = df["col"].apply(lambda x: "negative" if x < 0 else "positive")
df

Unnamed: 0,col
0,negative
1,positive
2,negative
3,positive


#  Dealing with Special Characters in Column Names



In [262]:
# Sample data with special characters in column names
df = pd.DataFrame({
    "first@name": [1, 2, 3],
    "last&name": [4, 5, 6]
})
df


Unnamed: 0,first@name,last&name
0,1,4
1,2,5
2,3,6


In [264]:
# Clean column names
df.columns = df.columns.str.replace(r"[^\w\s]", "_")
df.columns = df.columns.str.strip()
df

Unnamed: 0,first@name,last&name
0,1,4
1,2,5
2,3,6


In [268]:
import pandas as pd

# Sample data with special characters in column names
df = pd.DataFrame({
    "first@name": [1, 2, 3],
    "last&name": [4, 5, 6],
    "email#address": [7, 8, 9]
})

df


Unnamed: 0,first@name,last&name,email#address
0,1,4,7
1,2,5,8
2,3,6,9


In [270]:
# Clean column names by replacing special characters with underscores
df.columns = df.columns.str.replace(r"[^\w\s]", "_", regex=True)
df

Unnamed: 0,first_name,last_name,email_address
0,1,4,7
1,2,5,8
2,3,6,9


 # Checking for Valid Email Addresses

In [282]:
# Sample data with emails
df = pd.DataFrame({
    "email": ["john.doe@example.com", "invalid-email", "jane@domain.com"]
})

df

Unnamed: 0,email
0,john.doe@example.com
1,invalid-email
2,jane@domain.com


In [284]:
# Check for valid email format
df["email_valid"] = df["email"].str.contains(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", regex=True)
df


Unnamed: 0,email,email_valid
0,john.doe@example.com,True
1,invalid-email,False
2,jane@domain.com,True


# Standardizing Text Columns (Lowercase, Strip)

In [288]:
# Sample data with inconsistent text
df = pd.DataFrame({
    "col": [" YES ", " no ", " Yes "]
})
df


Unnamed: 0,col
0,YES
1,no
2,Yes


In [292]:
# Standardize text by stripping spaces and converting to lowercase
df["col"] = df["col"].str.lower().str.strip()
df


Unnamed: 0,col
0,yes
1,no
2,yes


#  Removing Stop Words in Text Columns

In [302]:
from nltk.corpus import stopwords

# Sample data
df = pd.DataFrame({
    "text": ["This is a test", "Stop words should be removed"]
})

df


Unnamed: 0,text
0,This is a test
1,Stop words should be removed


In [304]:
# Remove stop words
stop_words = set(stopwords.words("english"))
df["cleaned_text"] = df["text"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))
df


Unnamed: 0,text,cleaned_text
0,This is a test,This test
1,Stop words should be removed,Stop words removed
