In [None]:
# Importance of Data Cleaning

# 1. Missing Values: Missing data points in a dataset can lead to biased results.
#     Task 1: Load a dataset and identify which columns have missing values.
#     Task 2: Replace missing values in a dataset with the column mean or mode.
#     Task 3: Compare model performance with and without handling missing values.
    





In [None]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
    
    

In [None]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
    
    
    

In [None]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
    
    

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Sample dataset with issues
data = {
    'Age': [25, 27, np.nan, 45, 33, 27, 27, 1000, 29, 33, 27],
    'Income': [50000, 54000, 58000, 62000, np.nan, 54000, 54000, 1000000, 56000, 59000, 54000],
    'Gender': ['M', 'F', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 'F'],
    'Experience': [1, 3, 5, 7, 9, np.nan, 3, 12, 6, 4, 3]
}

df = pd.DataFrame(data)

print("Original Dataset Shape:", df.shape)

# ----------- 1. Missing Values --------------
print("\n--- Task 1: Identify Missing Values ---")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0])

print("\n--- Task 2: Fill Missing Values ---")
# Fill numerical columns with mean
for col in ['Age', 'Income', 'Experience']:
    if df[col].isnull().any():
        mean_val = df[col].mean()
        df[col].fillna(mean_val, inplace=True)
        print(f"Filled missing values in {col} with mean: {mean_val}")

# --------- Model comparison ---------
print("\n--- Task 3: Compare model performance with/without missing value handling ---")

# For demonstration, simulate original dataset with missing values for comparison
df_missing = pd.DataFrame(data)

# Prepare data for modeling (simple linear regression predicting Income from Age & Experience)
def prepare_and_evaluate(dataframe, desc):
    X = dataframe[['Age', 'Experience']]
    y = dataframe['Income']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    print(f"{desc} - MSE: {mse:.2f}")

prepare_and_evaluate(df_missing.dropna(), "Without Handling Missing Values (drop NA)")
prepare_and_evaluate(df, "With Missing Values Filled (mean imputation)")

# ----------- 2. Duplicate Data --------------
print("\n--- Task 1: Identify & Remove Duplicates ---")
print("Initial dataset shape:", df.shape)
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
df = df.drop_duplicates()
print("Dataset shape after removing duplicates:", df.shape)

print("\n--- Task 3: Impact of duplicates on prediction accuracy ---")
print("Duplicates can lead to overfitting and biased model results because repeated data points skew the distribution.")

# ----------- 3. Incorrect Data Types --------------
print("\n--- Task 1 & 2: Fixing Incorrect Data Types ---")

# Add an example column with incorrect types
df['Years_at_Company'] = ['5', '3', '4', '6', '2', '4', '3', '10', '4', '3']

print("Before conversion:", df['Years_at_Company'].dtype)
df['Years_at_Company'] = pd.to_numeric(df['Years_at_Company'], errors='coerce')
print("After conversion:", df['Years_at_Company'].dtype)

print("\n--- Task 3: Importance of Correct Data Types ---")
print("Correct data types enable proper mathematical operations, memory efficiency, and compatibility with ML algorithms.")

# ----------- 4. Outliers & Inconsistencies --------------
print("\n--- Task 1: Visualize Outliers using Boxplot ---")
sns.boxplot(x=df['Age'])
plt.title('Boxplot of Age')
plt.show()

print("\n--- Task 2: Remove Outliers ---")
# Define outlier threshold (e.g., Age > 100 as outlier)
outlier_threshold = 100
df_no_outliers = df[df['Age'] < outlier_threshold]
print(f"Removed {df.shape[0] - df_no_outliers.shape[0]} outliers from Age.")

print("\n--- Task 3: Report on Outlier Handling Technique ---")
print("""A common technique to handle outliers is the IQR method:
- Calculate Q1 (25th percentile) and Q3 (75th percentile).
- Compute IQR = Q3 - Q1.
- Define outliers as points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
This method helps identify extreme values while retaining legitimate data.""")



Original Dataset Shape: (11, 4)

--- Task 1: Identify Missing Values ---
Age           1
Income        1
Experience    1
dtype: int64

--- Task 2: Fill Missing Values ---
Filled missing values in Age with mean: 127.3
Filled missing values in Income with mean: 150100.0
Filled missing values in Experience with mean: 5.3

--- Task 3: Compare model performance with/without missing value handling ---
Without Handling Missing Values (drop NA) - MSE: 19898776.38
With Missing Values Filled (mean imputation) - MSE: 830302623.08

--- Task 1: Identify & Remove Duplicates ---
Initial dataset shape: (11, 4)
Number of duplicate rows: 2
Dataset shape after removing duplicates: (9, 4)

--- Task 3: Impact of duplicates on prediction accuracy ---
Duplicates can lead to overfitting and biased model results because repeated data points skew the distribution.

--- Task 1 & 2: Fixing Incorrect Data Types ---


ValueError: Length of values (10) does not match length of index (9)