In [7]:
import pandas as pd

# Sample dataset with duplicates
data = {
    'CustomerID': [101, 102, 103, 101, 104, 105, 103],
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Eva', 'Charlie'],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example.com', 'alice@example.com', 'david@example.com', 'eva@example.com', 'charlie@example.com'],
    'PurchaseAmount': [250, 150, 200, 250, 300, 450, 200]
}

df = pd.DataFrame(data)

print("Original dataset shape:", df.shape)
print("Original dataset:")
print(df)

# Task 1: Identify duplicate entries
duplicates = df.duplicated()
print("\nDuplicate rows identified (True means duplicate):")
print(duplicates)

# Task 2: Remove duplicate entries
df_cleaned = df.drop_duplicates()

print("\nDataset shape after removing duplicates:", df_cleaned.shape)
print("Dataset after removing duplicates:")
print(df_cleaned)

# Optional: Count duplicates
num_duplicates = duplicates.sum()
print(f"\nNumber of duplicate rows removed: {num_duplicates}")

# Task 3: Explanation for classmate
explanation = """
Duplicate data means the same record appears multiple times.
This can skew the analysis by giving more weight to repeated records,
leading models to overfit or bias predictions toward duplicated instances.
Removing duplicates ensures the model learns from unique data points,
improving prediction accuracy and generalization.
"""
print(explanation)

Original dataset shape: (7, 4)
Original dataset:
   CustomerID     Name                Email  PurchaseAmount
0         101    Alice    alice@example.com             250
1         102      Bob      bob@example.com             150
2         103  Charlie  charlie@example.com             200
3         101    Alice    alice@example.com             250
4         104    David    david@example.com             300
5         105      Eva      eva@example.com             450
6         103  Charlie  charlie@example.com             200

Duplicate rows identified (True means duplicate):
0    False
1    False
2    False
3     True
4    False
5    False
6     True
dtype: bool

Dataset shape after removing duplicates: (5, 4)
Dataset after removing duplicates:
   CustomerID     Name                Email  PurchaseAmount
0         101    Alice    alice@example.com             250
1         102      Bob      bob@example.com             150
2         103  Charlie  charlie@example.com             200
4      

In [10]:
import pandas as pd

# Sample data with inconsistent types
data = {
    'ID': ['1', '2', '3', '4'],
    'Age': ['25', '30', '35', 'forty'],  # 'forty' is a non-numeric string
    'Salary': ['50000', '60000', None, '70000'],
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\nData types before conversion:")
print(df.dtypes)

# Task 1: Convert 'ID' column to integers
df['ID'] = pd.to_numeric(df['ID'], errors='coerce')

# Task 2: Identify columns with inconsistent data types and convert where possible
# Convert 'Age' and 'Salary' to numeric, coercing errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

print("\nDataFrame after type conversion:")
print(df)
print("\nData types after conversion:")
print(df.dtypes)

# Task 3: Discussion (as a comment)
# Correct data types are critical for feature engineering because:
# - Numerical operations require numeric types.
# - ML algorithms need consistent input types.
# - Incorrect types can cause errors or misleading analysis.

Original DataFrame:
  ID    Age Salary
0  1     25  50000
1  2     30  60000
2  3     35   None
3  4  forty  70000

Data types before conversion:
ID        object
Age       object
Salary    object
dtype: object

DataFrame after type conversion:
   ID   Age   Salary
0   1  25.0  50000.0
1   2  30.0  60000.0
2   3  35.0      NaN
3   4   NaN  70000.0

Data types after conversion:
ID          int64
Age       float64
Salary    float64
dtype: object


In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sample dataset with outliers
data = {
    'Age': [25, 30, 28, 22, 35, 150, 29, 27, 26, 28],  # 150 is an outlier
    'Salary': [50000, 60000, 55000, 48000, 62000, 1000000, 58000, 59000, 61000, 57000]  # 1,000,000 is an outlier
}

df = pd.DataFrame(data)

# Task 1: Visualize outliers using boxplots
plt.figure(figsize=(12,5))

plt.subplot(1, 2, 1)
sns.boxplot(y=df['Age'])
plt.title('Boxplot of Age')

plt.subplot(1, 2, 2)
sns.boxplot(y=df['Salary'])
plt.title('Boxplot of Salary')

plt.show()

# Task 2: Remove or adjust outliers
# Using the IQR method to identify outliers
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return filtered_df

# Remove outliers from Age and Salary columns separately and then combine
df_no_outliers_age = remove_outliers_iqr(df, 'Age')
df_no_outliers_salary = remove_outliers_iqr(df, 'Salary')

# Intersection of both filters to remove all outliers in either column
df_no_outliers = df.loc[df.index.isin(df_no_outliers_age.index) & df.index.isin(df_no_outliers_salary.index)]

print("\nOriginal dataset shape:", df.shape)
print("Dataset shape after outlier removal:", df_no_outliers.shape)

# Visualize again after removing outliers
plt.figure(figsize=(12,5))

plt.subplot(1, 2, 1)
sns.boxplot(y=df_no_outliers['Age'])
plt.title('Age without Outliers')

plt.subplot(1, 2, 2)
sns.boxplot(y=df_no_outliers['Salary'])
plt.title('Salary without Outliers')

plt.show()

# Task 3: Research report on outlier handling techniques:
"""
Common techniques for handling outliers include:

1. Removal: Simply remove the outlier data points if they are errors or irrelevant.
2. Transformation: Apply transformations like log, square root to reduce skewness.
3. Capping/Flooring (Winsorizing): Replace outliers with nearest acceptable values.
4. Imputation: Replace outliers with mean, median, or predicted values.
5. Robust models: Use algorithms that are less sensitive to outliers (e.g., tree-based models).

Choice depends on the context and domain knowledge.

SyntaxError: incomplete input (1903437107.py, line 62)