In [None]:
# Standardizing text

# Trim leading and trailing whitespace.
df['Sex'] = df['Sex'].str.strip()

# Normalize letter case.
df['name'] = df['name'].str.lower()
df['category'] = df['category'].str.upper()

# Remove every character that is neither a word character nor whitespace
# (e.g. !, @, .). The vectorized .str accessor is used instead of
# apply(re.sub): missing values stay missing rather than raising a
# TypeError, and the cell no longer depends on `re` being imported.
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)


In [None]:
# Rename columns, then drop the ones that are irrelevant to the analysis.
df = (
    df
    .rename(columns={'old_name': 'new_name'})
    .drop(columns=['id_column', 'constant_column'])
)

In [None]:
# Split 'date_column' into separate calendar-component columns.
# Each new column is named after the .dt attribute it is read from.
for date_part in ('year', 'month', 'day'):
    df[date_part] = getattr(df['date_column'].dt, date_part)

In [None]:
# Keep only the rows whose 'value' is strictly greater than 200.
exceeds_threshold = df['value'] > 200
df = df[exceeds_threshold]

In [None]:
# Duplicates

import pandas as pd


def _report_duplicates(frame, subset=None):
    """Print and return the rows of `frame` that occur more than once.

    `subset` limits the comparison to the given columns; None compares
    whole rows. keep=False flags every member of a duplicate group, not
    just the repeats. Returns a (rows, count) tuple.
    """
    flagged = frame[frame.duplicated(subset=subset, keep=False)]
    count = len(flagged)
    print(f"\nNumber of duplicate rows: {count}")
    print("Duplicate rows:")
    print(flagged)
    return flagged, count


# Sample records, written row-wise as (name, date, value).
_columns = ('name', 'date', 'value')
_records = [
    ('Alice', '2023-01-01', 100),
    ('Bob', '2023-02-15', 200),
    ('Charlie', '2023-03-10', 150),
    ('Alice', '2023-01-01', 100),
    ('Bob', '2023-04-20', 200),
    ('David', '2023-05-05', 400),
    ('Eve', '2023-06-01', 250),
    ('Frank', '2023-07-15', 350),
    ('Grace', '2023-08-20', 450),
    ('Alice', '2023-01-01', 100),
]

# Pivot the rows into the column -> list-of-values mapping.
data = {col: list(values) for col, values in zip(_columns, zip(*_records))}

df = pd.DataFrame(data)  # Create DataFrame
print("Original DataFrame:")
print(df)

# Duplicates judged on all columns.
duplicates, num_duplicates = _report_duplicates(df)

# Keep the first occurrence of each fully identical row.
df_cleaned = df.drop_duplicates()

# Duplicates judged on a subset of columns only.
duplicates, num_duplicates = _report_duplicates(df, subset=['value', 'name'])


Original DataFrame:
      name        date  value
0    Alice  2023-01-01    100
1      Bob  2023-02-15    200
2  Charlie  2023-03-10    150
3    Alice  2023-01-01    100
4      Bob  2023-04-20    200
5    David  2023-05-05    400
6      Eve  2023-06-01    250
7    Frank  2023-07-15    350
8    Grace  2023-08-20    450
9    Alice  2023-01-01    100

Number of duplicate rows: 3
Duplicate rows:
    name        date  value
0  Alice  2023-01-01    100
3  Alice  2023-01-01    100
9  Alice  2023-01-01    100

Number of duplicate rows: 5
Duplicate rows:
    name        date  value
0  Alice  2023-01-01    100
1    Bob  2023-02-15    200
3  Alice  2023-01-01    100
4    Bob  2023-04-20    200
9  Alice  2023-01-01    100


In [None]:
# Transform type of columns

print(df.dtypes)

# String -> datetime64. The column stays datetime from here on, so the
# .dt accessor and date arithmetic keep working in later cells.
df['date'] = pd.to_datetime(df['date'])

# Integer -> float.
df['value'] = df['value'].astype(float)

# A 'YYYY-MM-DD' text rendering is kept in its own variable. Writing the
# strftime result back into df['date'] would turn the column into plain
# strings again and undo the conversion above.
date_as_text = df['date'].dt.strftime('%Y-%m-%d')

print(df.dtypes)

name             object
date     datetime64[ns]
value           float64
dtype: object
name             object
date     datetime64[ns]
value           float64
dtype: object


In [None]:
# Missing values

import pandas as pd
import numpy as np

data = {
    'id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, np.nan, 11],
    'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David', 'Eve', 'Frank', 'Grace', 'Robert', np.nan, 'Nicole'],
    'age': [25, 30, 35, np.nan, 40, 45, 50, 55, 60, 65, 32, 59],
    'gender': ['F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F', np.nan, 'M', 'M'],
    'income': [1000, 2000, 1500, 1000, 3000, np.nan, 2500, 3500, 4500, 5000, 200, 15000]
}

df = pd.DataFrame(data)
print(df)

# Count the missing values per column before any cleaning.
print(df.isnull().sum())


# Drop rows where any categorical column has missing values
categorical_cols = ['gender']
df = df.dropna(subset=categorical_cols)


# Fill the remaining gaps column by column:
#   income -> a fixed value (0)
#   gender -> a placeholder label (a no-op here, because rows without a
#             gender were dropped above; kept to illustrate the technique)
#   age    -> the mean of the observed ages
# assign() builds a new frame, which avoids chained-assignment warnings
# on the filtered DataFrame.
df = df.assign(
    income=df['income'].fillna(0),
    gender=df['gender'].fillna('undefined'),
    age=df['age'].fillna(df['age'].mean()),
)

# Drop rows that still have a missing value in any column (id / name).
df = df.dropna()
print(df.isnull().sum())
print(df)



      id     name   age gender   income
0    1.0    Alice  25.0      F   1000.0
1    2.0      Bob  30.0      M   2000.0
2    3.0  Charlie  35.0      M   1500.0
3    4.0    Alice   NaN      F   1000.0
4    5.0      Bob  40.0      M   3000.0
5    6.0    David  45.0      M      NaN
6    7.0      Eve  50.0      F   2500.0
7    8.0    Frank  55.0      M   3500.0
8    9.0    Grace  60.0      F   4500.0
9   10.0   Robert  65.0    NaN   5000.0
10   NaN      NaN  32.0      M    200.0
11  11.0   Nicole  59.0      M  15000.0
id        1
name      1
age       1
gender    1
income    1
dtype: int64
id        0
name      0
age       0
gender    0
income    0
dtype: int64
      id     name   age gender   income
0    1.0    Alice  25.0      F   1000.0
1    2.0      Bob  30.0      M   2000.0
2    3.0  Charlie  35.0      M   1500.0
4    5.0      Bob  40.0      M   3000.0
5    6.0    David  45.0      M      0.0
6    7.0      Eve  50.0      F   2500.0
7    8.0    Frank  55.0      M   3500.0
8    9.0    Gr