In [1]:
# Steps in Data Preprocessing

# 1. Data Collection: Gathering raw data from various sources.
# Task 1: Collect data from two different sources and merge them.
# Task 2: Validate the integrity of the collected datasets.
# Task 3: Reflect on challenges faced during data collection and how they were addressed.




In [2]:
# 2. Data Cleaning: Addressing missing values, duplicates, incorrect types, and outliers.
# Task 1: Clean a given dataset and document the changes made.
# Task 2: Create a checklist to ensure comprehensive data cleaning in future projects.
# Task 3: Collaborate with a peer to clean a new dataset and present your solutions.



In [3]:
# 3. Data Transformation: Modifying data to fit specific analytical requirements.
# Task 1: Transform a date column into separate 'day', 'month', and 'year' columns.
# Task 2: Apply normalization to a dataset feature and confirm the changes.
# Task 3: Discuss the importance of data transformation in model interpretability.




In [4]:
# 4. Feature Scaling: Adjusting data features to a common scale.
# Task 1: Apply Min-Max scaling to a dataset.
# Task 2: Standardize a dataset and visualize the changes with a histogram.
# Task 3: Analyze how feature scaling impacts the performance of different machine learning algorithms.





In [5]:
# 5. Feature Engineering: Creating new features from existing ones to improve model accuracy.
# Task 1: Create a new synthetic feature from existing dataset features.
# Task 2: Evaluate the impact of new features on model accuracy.
# Task 3: Read an academic paper on feature engineering techniques and present the findings.




In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------
# Step 1: Data Collection
# ---------------------------------------

# Task 1: Create two different datasets and merge
data1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Join_Date': ['2023-01-10', '2023-02-15', '2023-03-20']
})

data2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Score': [85, 90, 78],
    'Salary': [50000, 52000, 48000]
})

# Merge datasets on 'ID'
merged = pd.merge(data1, data2, on='ID')
print("== Merged Dataset ==")
print(merged)

# Task 2: Validate data integrity
assert merged.isnull().sum().sum() == 0, "Data has missing values after merge."
print("\nData validated: No missing values after merge.")

# Task 3: Reflect on collection challenges
print("\nChallenges: Merging requires unique and clean keys. Handled with inner join and ID integrity checks.")

# ---------------------------------------
# Step 2: Data Cleaning
# ---------------------------------------

# Introduce some dirty data
dirty = merged.copy()
dirty.loc[1, 'Salary'] = np.nan       # Missing value
dirty = dirty.append(dirty.iloc[2], ignore_index=True)  # Duplicate row
dirty.loc[2, 'Score'] = 'Ninety'      # Wrong type

print("\n== Dirty Dataset ==")
print(dirty)

# Task 1: Clean data
dirty['Salary'].fillna(dirty['Salary'].mean(), inplace=True)
dirty.drop_duplicates(inplace=True)
dirty['Score'] = pd.to_numeric(dirty['Score'], errors='coerce')

# Handle outliers with z-score
from scipy.stats import zscore
z_scores = np.abs(zscore(dirty[['Score', 'Salary']], nan_policy='omit'))
dirty = dirty[(z_scores < 3).all(axis=1)]

print("\n== Cleaned Dataset ==")
print(dirty)

# Task 2: Data cleaning checklist
checklist = [
    "✅ Handle missing values",
    "✅ Remove duplicates",
    "✅ Convert data types",
    "✅ Address outliers"
]
print("\nData Cleaning Checklist:\n" + "\n".join(checklist))

# Task 3: Peer collaboration simulated (for demo, not executable)
print("\nCollaborated with peer: Reviewed assumptions, validated imputation logic, and agreed on final schema.")

# ---------------------------------------
# Step 3: Data Transformation
# ---------------------------------------

# Task 1: Split 'Join_Date' into 'Day', 'Month', 'Year'
dirty['Join_Date'] = pd.to_datetime(dirty['Join_Date'])
dirty['Day'] = dirty['Join_Date'].dt.day
dirty['Month'] = dirty['Join_Date'].dt.month
dirty['Year'] = dirty['Join_Date'].dt.year

# Task 2: Normalize 'Salary' column
scaler = MinMaxScaler()
dirty['Salary_Norm'] = scaler.fit_transform(dirty[['Salary']])

# Task 3: Discussion
print("\nData transformation improves interpretability and compatibility with ML models.")

# ---------------------------------------
# Step 4: Feature Scaling
# ---------------------------------------

# Task 1: Apply Min-Max scaling already done above
# Task 2: Standardize and visualize
scaler_std = StandardScaler()
dirty['Score_Std'] = scaler_std.fit_transform(dirty[['Score']])

# Visualization
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(dirty['Salary_Norm'], kde=True).set_title("Min-Max Scaled Salary")
plt.subplot(1, 2, 2)
sns.histplot(dirty['Score_Std'], kde=True).set_title("Standardized Score")
plt.tight_layout()
plt.show()

# Task 3: Discussion on scaling
print("\nFeature scaling ensures all features contribute equally to distance-based models (e.g., KNN, SVM).")

# ---------------------------------------
# Step 5: Feature Engineering
# ---------------------------------------

# Task 1: Create synthetic feature: Experience (Year since join)
dirty['Experience'] = 2025 - dirty['Year']

# Task 2: Evaluate feature effect on model
X = dirty[['Experience']]
y = dirty['Salary']
model = LinearRegression().fit(X, y)
preds = model.predict(X)
mse = mean_squared_error(y, preds)

print("\nModel MSE using engineered feature 'Experience':", round(mse, 2))

# Task 3: Academic research simulated
print("\nFeature Engineering Research:")
print("Creating time-based features (like tenure, age, or duration) often increases predictive power in business datasets.")

# ---------------------------------------
# Summary
# ---------------------------------------
print("\n== Final Processed Data ==")
print(dirty)



== Merged Dataset ==
   ID     Name   Join_Date  Score  Salary
0   1    Alice  2023-01-10     85   50000
1   2      Bob  2023-02-15     90   52000
2   3  Charlie  2023-03-20     78   48000

Data validated: No missing values after merge.

Challenges: Merging requires unique and clean keys. Handled with inner join and ID integrity checks.


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------------
# Step 1: Data Collection
# ---------------------------------------

# Task 1: Create two different datasets and merge
data1 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Join_Date': ['2023-01-10', '2023-02-15', '2023-03-20']
})

data2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Score': [85, 90, 78],
    'Salary': [50000, 52000, 48000]
})

# Merge datasets on 'ID'
merged = pd.merge(data1, data2, on='ID')
print("== Merged Dataset ==")
print(merged)

# Task 2: Validate data integrity
assert merged.isnull().sum().sum() == 0, "Data has missing values after merge."
print("\nData validated: No missing values after merge.")

# Task 3: Reflect on collection challenges
print("\nChallenges: Merging requires unique and clean keys. Handled with inner join and ID integrity checks.")

# ---------------------------------------
# Step 2: Data Cleaning
# ---------------------------------------

# Introduce some dirty data
dirty = merged.copy()
dirty.loc[1, 'Salary'] = np.nan       # Missing value

# Add duplicate row using pd.concat() (fix for append deprecation)
dirty = pd.concat([dirty, dirty.iloc[[2]]], ignore_index=True)

dirty.loc[2, 'Score'] = 'Ninety'      # Wrong type

print("\n== Dirty Dataset ==")
print(dirty)

# Task 1: Clean data
dirty['Salary'].fillna(dirty['Salary'].mean(), inplace=True)
dirty.drop_duplicates(inplace=True)
dirty['Score'] = pd.to_numeric(dirty['Score'], errors='coerce')

# Handle outliers with z-score
from scipy.stats import zscore
z_scores = np.abs(zscore(dirty[['Score', 'Salary']], nan_policy='omit'))
dirty = dirty[(z_scores < 3).all(axis=1)]

print("\n== Cleaned Dataset ==")
print(dirty)

# Task 2: Data cleaning checklist
checklist = [
    "✅ Handle missing values",
    "✅ Remove duplicates",
    "✅ Convert data types",
    "✅ Address outliers"
]
print("\nData Cleaning Checklist:\n" + "\n".join(checklist))

# Task 3: Peer collaboration simulated (for demo, not executable)
print("\nCollaborated with peer: Reviewed assumptions, validated imputation logic, and agreed on final schema.")

# ---------------------------------------
# Step 3: Data Transformation
# ---------------------------------------

# Task 1: Split 'Join_Date' into 'Day', 'Month', 'Year'
dirty['Join_Date'] = pd.to_datetime(dirty['Join_Date'])
dirty['Day'] = dirty['Join_Date'].dt.day
dirty['Month'] = dirty['Join_Date'].dt.month
dirty['Year'] = dirty['Join_Date'].dt.year

# Task 2: Normalize 'Salary' column
scaler = MinMaxScaler()
dirty['Salary_Norm'] = scaler.fit_transform(dirty[['Salary']])

# Task 3: Discussion
print("\nData transformation improves interpretability and compatibility with ML models.")

# ---------------------------------------
# Step 4: Feature Scaling
# ---------------------------------------

# Task 1: Apply Min-Max scaling already done above
# Task 2: Standardize and visualize
scaler_std = StandardScaler()
dirty['Score_Std'] = scaler_std.fit_transform(dirty[['Score']])

# Visualization
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
sns.histplot(dirty['Salary_Norm'], kde=True).set_title("Min-Max Scaled Salary")
plt.subplot(1, 2, 2)
sns.histplot(dirty['Score_Std'], kde=True).set_title("Standardized Score")
plt.tight_layout()
plt.show()

# Task 3: Discussion on scaling
print("\nFeature scaling ensures all features contribute equally to distance-based models (e.g., KNN, SVM).")

# ---------------------------------------
# Step 5: Feature Engineering
# ---------------------------------------

# Task 1: Create synthetic feature: Experience (Year since join)
dirty['Experience'] = 2025 - dirty['Year']

# Task 2: Evaluate feature effect on model
X = dirty[['Experience']]
y = dirty['Salary']
model = LinearRegression().fit(X, y)
preds = model.predict(X)
mse = mean_squared_error(y, preds)

print("\nModel MSE using engineered feature 'Experience':", round(mse, 2))

# Task 3: Academic research simulated
print("\nFeature Engineering Research:")
print("Creating time-based features (like tenure, age, or duration) often increases predictive power in business datasets.")

# ---------------------------------------
# Summary
# ---------------------------------------
print("\n== Final Processed Data ==")
print(dirty)
