### Standardizing dates

In [None]:
import pandas as pd

# Create a DataFrame whose date strings use three different formats
df = pd.DataFrame({'date': ['2022-08-01', '01/08/2022', 'August 1, 2022']})

# Convert the date column to a uniform datetime representation.
# Each value is parsed on its own: a single vectorised
# pd.to_datetime(df['date']) infers ONE format from the first element and,
# on pandas >= 2.0, raises ValueError as soon as a later element does not
# match it.
# NOTE: '01/08/2022' is ambiguous; pandas reads it month-first
# (8 January 2022) unless dayfirst=True is requested.
df['date'] = df['date'].map(pd.to_datetime)
print(df)


### Converting currency symbols

In [None]:
# Create DataFrame with inconsistent currency formats
df = pd.DataFrame({'price': ['$100', '€200', '£300']})

# Remove currency symbols and convert to float.
# The pattern is a raw-string character class: inside [...] the dollar sign
# is a literal, and the raw prefix avoids the invalid '\$' escape sequence
# that Python warns about in an ordinary string literal.
df['price'] = df['price'].str.replace(r'[$€£]', '', regex=True).astype(float)
print(df)


### Standardize phone numbers

In [None]:
# Sample phone numbers written with dashes, parentheses/spaces and dots
raw_numbers = ['123-456-7890', '(123) 456-7890', '123.456.7890']
df = pd.DataFrame({'phone': raw_numbers})

# Drop every non-digit character so all numbers share one digits-only form
digits_only = df['phone'].replace(r'[^\d]', '', regex=True)
df['phone'] = digits_only
print(df)

### Misspelled categories

In [None]:
# Category labels containing known misspellings
misspelled = ['Appl', 'Orang', 'Bnan', 'Appl']
df = pd.DataFrame({'category': misspelled})

# Lookup table: misspelled label -> corrected label
corrections = {'Appl': 'Apple', 'Orang': 'Orange', 'Bnan': 'Banana'}

# Translate through the lookup table; labels missing from the table come
# back as NaN from map(), so they are filled with their original value
looked_up = df['category'].map(corrections)
df['category'] = looked_up.fillna(df['category'])
print(df)


### Using fuzzy matching

In [None]:
from fuzzywuzzy import process

# Create DataFrame with misspelled cities
df = pd.DataFrame({'city': ['Nw York', 'Sn Francisco', 'Chcago']})

# List of correct city names
correct_names = ['New York', 'San Francisco', 'Chicago']

# Minimum similarity score (0-100) a candidate must reach to be accepted
MIN_MATCH_SCORE = 80


def closest_city(name, choices=correct_names, min_score=MIN_MATCH_SCORE):
    """Return the best fuzzy match for `name`, or `name` itself if none is close enough.

    Without a cutoff, extractOne always returns the highest-scoring choice,
    so an unrelated value would be silently overwritten with a wrong city.
    With score_cutoff, extractOne returns None when no choice reaches the
    threshold, and the original value is kept.
    """
    match = process.extractOne(name, choices, score_cutoff=min_score)
    return name if match is None else match[0]


# Correct misspellings using fuzzy matching
df['city'] = df['city'].map(closest_city)
print(df)


### Misspelled product names

In [None]:
# Known bad spelling -> correct product name
corrections = {'Laptoop': 'Laptop', 'Smarttphone': 'Smartphone', 'Tablett': 'Tablet'}

# The sample frame holds exactly the misspelled names, in the same order
df = pd.DataFrame({'product': ['Laptoop', 'Smarttphone', 'Tablett']})

# Swap each misspelled name for its correction; other values stay untouched
corrected = df['product'].replace(corrections)
df['product'] = corrected
print(df)


### Removing all duplicates

In [None]:
# Two-column frame in which rows 1 and 2 are exact copies of each other
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [5, 6, 6, 7]})

# Flag every repeat of an earlier row, then keep only the unflagged rows
is_repeat = df.duplicated()
df = df[~is_repeat]
print(df)

### Removing duplicates based on a specific column

In [None]:
# Frame whose column 'A' contains the value 2 twice
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [5, 6, 6, 7]})

# A row survives only if its 'A' value has not appeared in an earlier row
first_seen = ~df.duplicated(subset=['A'])
df = df.loc[first_seen]
print(df)

### Keeping the last occurrence of duplicates

In [None]:
# Example 3: Keeping last occurrence of duplicates
df = pd.DataFrame({'A': [1, 2, 2, 3], 'B': [5, 6, 6, 7]})

# Remove duplicates but keep the last occurrence
# (rows 1 and 2 are identical, so row 1 is dropped and row 2 survives)
df = df.drop_duplicates(keep='last')
print(df)

### Removal of all missing values

In [None]:
import pandas as pd

# Sample records: one missing Age and one missing Income
data = {
    'Age': [25, 30, None, 22],
    'Income': [50000, 70000, 80000, None],
}
df = pd.DataFrame(data)

# Keep only the rows in which every column holds a value
row_is_complete = df.notna().all(axis=1)
df_removed = df[row_is_complete]
print(df_removed)


### Mean imputation

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer


# Sample records: one missing Age and one missing Income
data = {
    'Age': [25, 30, None, 22],
    'Income': [50000, 70000, 80000, None],
}
df = pd.DataFrame(data)

# Fit a mean-strategy imputer on the frame and fill the gaps with it;
# fit_transform returns a plain array, so the column labels are re-attached
mean_imputer = SimpleImputer(strategy="mean")
imputed_values = mean_imputer.fit_transform(df)
df_mean_imputed = pd.DataFrame(imputed_values, columns=df.columns)
print(df_mean_imputed)


### Interpolation

In [None]:
import pandas as pd
import numpy as np

# Creating a sample time series data with missing values
date_rng = pd.date_range(start='2021-01-01', end='2021-01-10', freq='D')
data = {
    'date': date_rng,
    'temperature': [25, 28, np.nan, 27, 24, np.nan, 23, 22, 20, 19]
}

# Build the frame and set the date column as the index in one step
# (no in-place mutation, so re-running the cell is safe)
df = pd.DataFrame(data).set_index('date')

# Using linear interpolation to fill missing values: the observations are
# evenly spaced, so each isolated NaN becomes the midpoint of its neighbours.
# The original frame is left untouched; the filled copy gets its own name.
df_interpolated = df.interpolate(method='linear')
print(df_interpolated)


### Using KNN for missing data

In [None]:
from sklearn.impute import KNNImputer

# Two features, each with one missing entry
data_with_missing = {
    'Feature1': [2, 4, None, 5],
    'Feature2': [3, 1, 2, None],
}
df_missing = pd.DataFrame(data_with_missing)

# Fill the gaps with a K-Nearest-Neighbours imputer configured for
# 2 neighbours; fit_transform returns a plain array, so the column labels
# are re-attached afterwards
knn_imputer = KNNImputer(n_neighbors=2)
filled_values = knn_imputer.fit_transform(df_missing)
df_knn_imputed = pd.DataFrame(filled_values, columns=df_missing.columns)
print(df_knn_imputed)


### Removing Duplicates Based on a Specific Column

In [None]:
import pandas as pd

# Customer records in which customers 101 and 102 each appear twice
data = {
    'Customer_ID': [101, 102, 103, 101, 104, 102],
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Bob'],
    'Address': ['123 Main St', '456 Pine St', '789 Oak St', '123 Main St', '321 Cedar St', '456 Pine St'],
}
df = pd.DataFrame(data)

# Keep the first row seen for every Customer_ID and drop the later repeats
id_seen_before = df['Customer_ID'].duplicated()
df_no_duplicates_by_id = df[~id_seen_before]
print("\nDataFrame after Removing Duplicates Based on 'Customer_ID':")
print(df_no_duplicates_by_id)


### Keeping the Last Occurrence of Duplicates

In [None]:
# Rows count as duplicates when all three of these columns agree
duplicate_key = ['Customer_ID', 'Name', 'Address']

# Flag every row that is repeated further down, so that only the last
# occurrence of each duplicate remains after filtering
has_later_copy = df.duplicated(subset=duplicate_key, keep='last')
df_keep_last = df[~has_later_copy]
print("\nDataFrame after Removing Duplicates, Keeping Last Occurrence:")
print(df_keep_last)

### You want to identify outliers in a dataset containing exam scores using the Z-score.

In [None]:
from scipy.stats import zscore

# Example data
exam_scores = [50, 55, 40, 100, 45]
data = {'score': exam_scores}
df = pd.DataFrame(data)

# Standardise every score with scipy's zscore and store it next to the raw value
df['z_score'] = zscore(df['score'])

# Keep the rows whose absolute Z-score exceeds the cut-off of 2.
# NOTE(review): for these five values the largest |z| is about 1.95 (the
# score of 100), so `outliers` comes back empty; confirm whether a larger
# sample or a different cut-off was intended.
exceeds_cutoff = df['z_score'].abs() > 2
outliers = df[exceeds_cutoff]


### You want to identify multivariate outliers in a dataset containing height and weight.

In [None]:
from scipy.spatial import distance
import numpy as np

# Example data
data = {'height': [170, 180, 150, 200], 'weight': [70, 80, 50, 30]}
df = pd.DataFrame(data)

# Ingredients of the Mahalanobis distance: the column means and the inverse
# of the covariance matrix (columns are the variables, hence rowvar=False)
mean = df.mean()
cov_matrix = np.cov(df, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)


def distance_from_mean(row):
    """Mahalanobis distance between one (height, weight) row and the column means."""
    return distance.mahalanobis(row, mean, inv_cov_matrix)


# One distance per row
mahalanobis_dist = df.apply(distance_from_mean, axis=1)


### Removing outliers 

In [None]:
import pandas as pd

# Example data
data = {'height': [150, 160, 50, 270, 180]}
df = pd.DataFrame(data)

# Keep only heights inside the closed interval [100, 250]
height_is_plausible = df['height'].between(100, 250)
df = df[height_is_plausible]


### Transformation of outliers

In [None]:
# Example data
data = {'salary': [30000, 40000, 2000000, 50000]}
df = pd.DataFrame(data)

# Add the natural logarithm of 'salary' as a new 'log_salary' column
log_values = np.log(df['salary'])
df = df.assign(log_salary=log_values)


### Imputing outliers

In [None]:
# Example data
data = {'age': [25, 30, 120, 22, 40]}
df = pd.DataFrame(data)

# Ages inside [18, 100] are kept; anything outside is replaced by the median
median_age = df['age'].median()
age_in_range = df['age'].between(18, 100)
df['age'] = np.where(age_in_range, df['age'], median_age)

### Handling inconsistencies

In [None]:
import pandas as pd

# Example data
data = {'country': ['USA', 'U.S.A.', 'United States', 'UK', 'U.K.']}
df = pd.DataFrame(data)

# Every spelling variant and the canonical name it should become
canonical_names = {
    'USA': 'United States',
    'U.S.A.': 'United States',
    'UK': 'United Kingdom',
    'U.K.': 'United Kingdom',
}

# Standardizing the country names in a single pass
df['country'] = df['country'].replace(canonical_names)


### Converting categorical data

In [None]:
# Example data
answers = ['Yes', 'No', 'Yes', 'No']
data = {'Response': answers}
df = pd.DataFrame(data)

# Numeric code for each answer: 'Yes' becomes 1 and 'No' becomes 0
yes_no_codes = {'Yes': 1, 'No': 0}
df['Response'] = df['Response'].map(yes_no_codes)

### Normalizing numerical features

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Example data
data = {'price': [5, 100, 20], 'quantity': [10, 1000, 100]}
df = pd.DataFrame(data)

# Columns to rescale
feature_columns = ['price', 'quantity']

# Fit a Min-Max scaler on those columns and write the scaled values back
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[feature_columns])
df[feature_columns] = scaled_values


### Range checks

In [None]:
import pandas as pd

# Example data
data = {'grades': [95, 50, 150, 88, -10]}
df = pd.DataFrame(data)

# A grade is valid when it lies in the closed range [0, 100];
# everything else is flagged as invalid
grade_is_valid = df['grades'].between(0, 100)
df['invalid_grade'] = ~grade_is_valid


### Pattern matching

In [None]:
import re

# Example data
data = {'email': ['john@example.com', 'sarah.example', 'david@example']}
df = pd.DataFrame(data)

# Shape of a valid address: local part, '@', domain, then a dot followed by
# a top-level domain of at least two letters
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
email_regex = re.compile(pattern)

# An address is invalid when the pattern does not match it
df['invalid_email'] = [email_regex.match(address) is None for address in df['email']]


### Consistency checks

In [None]:
# Example data
data = {'initial_stock': [100, 50, 200], 'sales': [50, 60, 150], 'replenished': [0, 20, 50]}
df = pd.DataFrame(data)

# Units that were available to sell: opening stock plus replenishment
available_units = df['initial_stock'] + df['replenished']

# A row is inconsistent when more units were sold than were available
df['inconsistent_data'] = df['sales'].gt(available_units)