In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# --- Task 1: Load a dataset and identify columns with missing values ---
def identify_missing_values(df):
    """
    Report which columns of a DataFrame contain missing values.

    The per-column null counts are computed, columns without any nulls are
    dropped from the report, and the remainder is printed under a heading.

    Args:
        df: pandas DataFrame to inspect. It is not modified.

    Returns:
        A pandas Series indexed by column name holding the number of missing
        values, restricted to columns where that number is greater than zero.
    """
    null_totals = df.isna().sum()
    # Keep only the columns that actually have something missing.
    affected = null_totals.loc[null_totals.gt(0)]
    print("Columns with missing values:")
    print(affected)
    return affected

# Example for Task 1: a five-row frame in which each column has exactly one gap.
data_missing = {
    'col1': [1, 2, np.nan, 4, 5],
    'col2': ['A', np.nan, 'B', 'A', 'C'],
    'col3': [10.0, 20.0, 30.0, np.nan, 50.0],
}
df_missing = pd.DataFrame(data_missing)
identify_missing_values(df_missing)
print("-" * 30)  # visual separator between task outputs

# --- Task 2: Replace missing values with mean or mode ---
def fill_missing_values(df):
    """
    Impute missing values: column mean for numeric columns, column mode otherwise.

    Works on a copy, so the caller's DataFrame is left untouched. Columns
    without missing values are skipped. A non-numeric column in which every
    value is missing has no mode and is left as-is instead of raising.

    Args:
        df: pandas DataFrame.

    Returns:
        A pandas DataFrame with missing values filled. It is also printed.
    """
    df_filled = df.copy()
    for col in df_filled.columns:
        column = df_filled[col]
        if not column.isnull().any():
            continue

        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Every value is missing, so there is nothing to impute from.
                continue
            # With several equally frequent values, mode() lists them all;
            # the first one is used.
            fill_value = modes.iloc[0]

        # Assign the result back rather than calling fillna(inplace=True) on
        # df_filled[col]: the in-place form acts on a temporary object and
        # does not update the frame under pandas Copy-on-Write.
        df_filled[col] = column.fillna(fill_value)

    print("DataFrame with missing values filled:")
    print(df_filled)
    return df_filled

# Example for Task 2: impute the gaps in the Task 1 frame. A copy is passed in,
# so df_missing itself keeps its missing values; the result is bound to df_filled.
df_filled = fill_missing_values(df_missing.copy())
print("-" * 30)

# --- Task 3: Compare model performance with and without handling missing values ---
def compare_model_performance(df_original, df_filled, target_column):
    """
    Train a linear regression on the raw data and on the imputed data, and
    print the test-set Mean Squared Error of each.

    Both frames are split 70/30 with the same random_state. Non-numeric feature
    columns are label-encoded with an encoder fitted on the original frame and
    reused for the filled frame, so both share one encoding.

    Relies on the module-level scikit-learn imports (train_test_split,
    LinearRegression, mean_squared_error, LabelEncoder).

    Args:
        df_original: pandas DataFrame with missing values.
        df_filled: pandas DataFrame with missing values filled.
        target_column: Name of the target variable column. It must be present
            in both frames.

    Returns:
        None. Prints the Mean Squared Error for both scenarios. If the target
        column is missing from either frame, an error is printed and nothing
        is trained. A ValueError raised while fitting or scoring the original
        data is reported instead of propagated.
    """
    # Validate the target in BOTH frames; checking only df_original would let
    # df_filled.drop() below fail with a KeyError.
    if target_column not in df_original.columns or target_column not in df_filled.columns:
        print(f"Error: Target column '{target_column}' not found.")
        return

    # Separate features and target (drop returns new frames, inputs stay intact)
    X_original = df_original.drop(target_column, axis=1)
    y_original = df_original[target_column]

    X_filled = df_filled.drop(target_column, axis=1)
    y_filled = df_filled[target_column]

    # Handle categorical features (simple label encoding for this example)
    for col in X_original.columns:
        if not pd.api.types.is_numeric_dtype(X_original[col]):
            le = LabelEncoder()
            X_original[col] = le.fit_transform(X_original[col])
            X_filled[col] = le.transform(X_filled[col])  # Use the same encoder

    # Split data; the same seed is used for both frames
    X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
        X_original, y_original, test_size=0.3, random_state=42)
    X_train_filled, X_test_filled, y_train_filled, y_test_filled = train_test_split(
        X_filled, y_filled, test_size=0.3, random_state=42)

    # Train and evaluate model on original data (with missing values)
    model_original = LinearRegression()
    try:
        model_original.fit(X_train_orig, y_train_orig)
        predictions_original = model_original.predict(X_test_orig)
        mse_original = mean_squared_error(y_test_orig, predictions_original)
        print("\nModel Performance (Original Data - Missing Values):")
        print(f"Mean Squared Error: {mse_original:.2f}")
    except ValueError as e:
        # Report the failure and carry on, so the filled-data result is still shown.
        print(f"\nError training model on original data: {e}")
        print("Consider handling missing values before training.")

    # Train and evaluate model on filled data
    model_filled = LinearRegression()
    model_filled.fit(X_train_filled, y_train_filled)
    predictions_filled = model_filled.predict(X_test_filled)
    mse_filled = mean_squared_error(y_test_filled, predictions_filled)
    print("\nModel Performance (Data with Missing Values Filled):")
    print(f"Mean Squared Error: {mse_filled:.2f}")

# Example for Task 3: ten rows with one numeric feature (two gaps), one
# categorical feature (complete) and a numeric target (one gap).
data_for_model = {
    'feature1': [1, 2, np.nan, 4, 5, 6, 7, np.nan, 9, 10],
    'feature2': ['A', 'B', 'A', 'C', 'B', 'A', 'B', 'A', 'C', 'B'],
    'target': [10, 20, 30, np.nan, 50, 60, 70, 80, 90, 100],
}

Columns with missing values:
col1    1
col2    1
col3    1
dtype: int64
------------------------------
DataFrame with missing values filled:
   col1 col2  col3
0   1.0    A  10.0
1   2.0    A  20.0
2   3.0    B  30.0
3   4.0    A  27.5
4   5.0    C  50.0
------------------------------


In [1]:
import pandas as pd

# --- Task 1: Identify and remove duplicate entries ---
def identify_and_remove_duplicates(df):
    """
    Count fully duplicated rows in a DataFrame and return it without them.

    A row counts as a duplicate when an identical row appears earlier in the
    frame, so the first occurrence of each row is kept. Progress messages and
    the duplicate count are printed.

    Args:
        df: pandas DataFrame. It is not modified.

    Returns:
        A pandas DataFrame with duplicate rows removed.
    """
    print("Identifying duplicate rows...")
    repeat_mask = df.duplicated()
    # Summing the boolean mask gives the number of flagged rows.
    print("Number of duplicate rows found:", int(repeat_mask.sum()))

    print("\nRemoving duplicate rows...")
    deduplicated = df.drop_duplicates()
    print("Duplicate rows removed.")
    return deduplicated

# Example for Task 1: eight rows, where the row (2, 'B', 20) appears twice and
# the row (4, 'D', 40) three times.
data_duplicates = {
    'col1': [1, 2, 2, 3, 4, 4, 4, 5],
    'col2': ['A', 'B', 'B', 'C', 'D', 'D', 'D', 'E'],
    'col3': [10, 20, 20, 30, 40, 40, 40, 50],
}
df_with_duplicates = pd.DataFrame(data_duplicates)
# A copy is passed in, so df_with_duplicates stays available for the shape comparison.
df_without_duplicates = identify_and_remove_duplicates(df_with_duplicates.copy())
print("-" * 30)

# --- Task 2: Document the before-and-after dataset shape ---
def document_dataset_shape(df_before, df_after):
    """
    Print the shape of a dataset before and after duplicate removal.

    Args:
        df_before: pandas DataFrame before duplicate removal.
        df_after: pandas DataFrame after duplicate removal.

    Returns:
        None. The two shapes are written to standard output, one per line.
    """
    report = (
        ("Dataset shape before removing duplicates:", df_before),
        ("Dataset shape after removing duplicates:", df_after),
    )
    for label, frame in report:
        print(label, frame.shape)

# Example for Task 2: compare the shape of the frame built for Task 1 with the
# shape of its de-duplicated counterpart.
document_dataset_shape(df_with_duplicates, df_without_duplicates)
print("-" * 30)

# --- Task 3: Explain to a classmate how duplicate data can affect prediction accuracy ---
def explain_duplicate_data_impact():
    """
    Print a plain-language explanation of how duplicate data can affect
    prediction accuracy.

    Takes no arguments and returns None; the explanation is written to
    standard output as one multi-line block of text.
    """
    # The text is indented inside the triple-quoted literal, so the printed
    # output keeps that four-space indent and the leading/trailing blank lines.
    explanation = """
    Hey classmate, let's talk about how duplicate data can mess up our predictions in machine learning.

    Imagine you're training a model to predict whether a customer will buy a product. If you have many identical entries for the same customer (same features, same outcome), the model might:

    1. **Overemphasize certain data points:** The model will see these repeated data points as more important than they actually are in the overall population. It will essentially give these duplicates more "weight" during training.

    2. **Lead to biased learning:** If the duplicates happen to be clustered around a specific outcome, the model might learn a skewed relationship that isn't representative of the true patterns in the data. For example, if you have many duplicate entries of customers who didn't buy the product, the model might become overly confident in predicting that no one will buy it.

    3. **Inflate performance metrics on the training data (but perform poorly on unseen data):** The model might achieve high accuracy on the training set simply because it has seen the same data points multiple times. However, when it encounters new, unseen data, its performance could be significantly worse because it hasn't learned a generalizable pattern. This is a form of overfitting.

    4. **Waste computational resources:** Training on a dataset with many duplicates can take longer and consume more memory without providing any additional valuable information to the model.

    In short, duplicate data can prevent our models from learning the true underlying relationships in the data and can lead to inaccurate predictions on new data. That's why it's crucial to identify and remove duplicates during the data cleaning process!
    """
    print(explanation)

# Example for Task 3: print the explanation text.
explain_duplicate_data_impact()

Identifying duplicate rows...
Number of duplicate rows found: 3

Removing duplicate rows...
Duplicate rows removed.
------------------------------
Dataset shape before removing duplicates: (8, 3)
Dataset shape after removing duplicates: (5, 3)
------------------------------

    Hey classmate, let's talk about how duplicate data can mess up our predictions in machine learning.

    Imagine you're training a model to predict whether a customer will buy a product. If you have many identical entries for the same customer (same features, same outcome), the model might:

    1. **Overemphasize certain data points:** The model will see these repeated data points as more important than they actually are in the overall population. It will essentially give these duplicates more "weight" during training.

    2. **Lead to biased learning:** If the duplicates happen to be clustered around a specific outcome, the model might learn a skewed relationship that isn't representative of the true pattern

In [4]:
import pandas as pd

# --- Task 1: Convert a column of string numbers to integers ---
def convert_string_to_int(df, column_name):
    """
    Converts a column of string numbers to integers in a pandas DataFrame.

    The column is replaced in the DataFrame that was passed in (pass a copy to
    keep the original). The conversion uses ``downcast='integer'``, so the
    resulting dtype is the smallest integer type that holds the values (for
    example int8 for values up to 127).

    The column is only replaced when the result really is an integer dtype.
    Values that parse as numbers but are fractional or missing would yield a
    float column; that case is reported as an error and the column is left
    unchanged, instead of being announced as a successful integer conversion.

    Args:
        df: pandas DataFrame.
        column_name: Name of the column to convert.

    Returns:
        The same DataFrame object, with the specified column converted to an
        integer type on success. The DataFrame is returned unchanged if the
        column is not found, is already numeric, or cannot be converted to
        integers (a message is printed in each case).
    """
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found.")
        return df

    if pd.api.types.is_numeric_dtype(df[column_name]):
        print(f"Column '{column_name}' is already numeric.")
        return df

    try:
        converted = pd.to_numeric(df[column_name], errors='raise', downcast='integer')
    except (ValueError, TypeError) as e:
        # ValueError: unparseable text; TypeError: unsupported element types.
        print(f"Error converting column '{column_name}' to integer: {e}")
        print("Ensure the column contains valid numeric strings.")
        return df

    # downcast='integer' leaves the data as float when any value is fractional
    # or missing, so check the outcome before claiming success.
    if not pd.api.types.is_integer_dtype(converted):
        print(f"Error converting column '{column_name}' to integer: "
              "column contains missing or non-integer values.")
        print("Ensure the column contains valid numeric strings.")
        return df

    df[column_name] = converted
    print(f"Column '{column_name}' successfully converted to integer.")
    return df

# Example for Task 1: 'values_str' holds integers written as text; the dtypes
# are printed before and after converting that column.
data_string_numbers = {
    'id': [1, 2, 3],
    'values_str': ['10', '25', '100'],
    'other_col': ['A', 'B', 'C'],
}
df_string = pd.DataFrame(data_string_numbers)
print("DataFrame with string numbers:")
print(df_string.dtypes)
# The function writes into the frame it receives, so hand it a copy.
df_int = convert_string_to_int(df_string.copy(), 'values_str')
print("\nDataFrame after conversion:")
print(df_int.dtypes)
print("-" * 30)

# --- Task 2: Identify and correct columns with inconsistent data types ---
def identify_and_correct_inconsistent_types(df):
    """
    Identifies and attempts to correct columns with inconsistent data types
    in a pandas DataFrame.

    Every column that is not already numeric, datetime or timedelta is passed
    through ``pd.to_numeric``. When all of its values parse as numbers the
    column is replaced by the numeric version and the dtype change is printed;
    when any value fails to parse the column is kept exactly as it was.

    Datetime and timedelta columns are skipped on purpose: ``to_numeric`` would
    otherwise turn them into plain integers, which is not a type correction.

    Args:
        df: pandas DataFrame. It is not modified; the work is done on a copy.

    Returns:
        A pandas DataFrame with potentially corrected data types.
    """
    print("Identifying and correcting inconsistent data types...")
    df_corrected = df.copy()
    for col in df_corrected.columns:
        original_dtype = df_corrected[col].dtype
        if (pd.api.types.is_numeric_dtype(original_dtype)
                or pd.api.types.is_datetime64_any_dtype(original_dtype)
                or pd.api.types.is_timedelta64_dtype(original_dtype)):
            continue

        try:
            # errors='ignore' is deprecated in pandas; use errors='raise' and
            # treat a parse failure as "leave the column alone", which is what
            # 'ignore' used to do.
            converted = pd.to_numeric(df_corrected[col], errors='raise')
        except (ValueError, TypeError):
            continue
        except Exception as e:
            print(f"Could not automatically correct type for column '{col}': {e}")
            continue

        df_corrected[col] = converted
        if df_corrected[col].dtype != original_dtype:
            print(f"Column '{col}' changed from {original_dtype} to {df_corrected[col].dtype}")
    print("Potential inconsistent data types addressed.")
    return df_corrected

# Example for Task 2: every column mixes Python types. 'id' and 'amounts' are
# fully numeric once parsed; 'flags' contains the text 'False'.
data_inconsistent = {
    'id': [1, '2', 3],
    'amounts': ['10.5', '20', 30.0],
    'flags': [True, 'False', 1],
}
df_inconsistent = pd.DataFrame(data_inconsistent)
print("DataFrame with inconsistent types:")
print(df_inconsistent.dtypes)
df_consistent = identify_and_correct_inconsistent_types(df_inconsistent.copy())
print("\nDataFrame after potential correction:")
print(df_consistent.dtypes)
print("-" * 30)

# --- Task 3: Discuss why correct data types are critical for feature engineering ---
def discuss_data_types_for_feature_engineering():
    """
    Print a discussion of why correct data types are critical for feature engineering.

    Takes no arguments and returns None; the discussion is written to standard
    output as one multi-line block of text.
    """
    # Point 5 previously claimed that 'float64' takes more memory than 'int64';
    # both use 8 bytes per value, so the example now contrasts types whose
    # sizes really differ.
    discussion = """
    Correct data types are absolutely critical for effective feature engineering for several reasons:

    1. **Enabling Mathematical Operations:** Many feature engineering techniques involve mathematical operations like arithmetic, aggregation (sum, mean, etc.), and comparisons. These operations can only be reliably performed on numeric data types (integers, floats). If a column containing numerical information is stored as a string, these operations will either fail or produce incorrect results. Converting to the correct numeric type is essential.

    2. **Facilitating Categorical Encoding:** Feature engineering often involves encoding categorical variables into a numerical format that machine learning models can understand (e.g., one-hot encoding, label encoding). To apply these encoding techniques correctly, the categorical columns need to be identified with the appropriate data type (e.g., 'object' or 'category' in pandas). Inconsistent or incorrect typing of categorical columns can lead to errors or ineffective encoding.

    3. **Supporting Date and Time Manipulation:** When dealing with time-based data, having the correct datetime data type is crucial for performing operations like calculating time differences, extracting specific time components (year, month, day, hour), and creating time-based features (e.g., lag features, rolling statistics). Incorrectly typed date or time columns (e.g., as strings or integers) will make these powerful feature engineering techniques difficult or impossible to implement correctly.

    4. **Ensuring Compatibility with Libraries and Models:** Machine learning libraries like scikit-learn often have specific data type requirements for their functions and models. Providing data with incorrect types can lead to errors during training or prediction. Ensuring correct types makes your data compatible with these tools.

    5. **Improving Efficiency and Reducing Memory Usage:** Correct data types can also impact the efficiency of your code and the memory usage of your data. For example, storing small integers as 'int64' consumes more memory than storing them as 'int8' or 'int16', and storing repeated text labels as 'object' consumes more memory than storing them as 'category'. Choosing the most appropriate data type can lead to more efficient feature engineering processes.

    In summary, correct data types form the foundation for meaningful and effective feature engineering. They ensure that the intended operations can be performed accurately, that categorical and time-based features can be manipulated correctly, and that the data is compatible with machine learning algorithms and libraries. Investing time in identifying and correcting data types is a crucial step in the data preprocessing pipeline.
    """
    print(discussion)

# Example for Task 3: print the discussion text.
discuss_data_types_for_feature_engineering()

DataFrame with string numbers:
id             int64
values_str    object
other_col     object
dtype: object
Column 'values_str' successfully converted to integer.

DataFrame after conversion:
id             int64
values_str      int8
other_col     object
dtype: object
------------------------------
DataFrame with inconsistent types:
id         object
amounts    object
flags      object
dtype: object
Identifying and correcting inconsistent data types...
Column 'id' changed from object to int64
Column 'amounts' changed from object to float64
Potential inconsistent data types addressed.

DataFrame after potential correction:
id           int64
amounts    float64
flags       object
dtype: object
------------------------------

    Correct data types are absolutely critical for effective feature engineering for several reasons:

    1. **Enabling Mathematical Operations:** Many feature engineering techniques involve mathematical operations like arithmetic, aggregation (sum, mean, etc.), and

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- Task 1: Visualize and identify outliers using a boxplot ---
def visualize_outliers_boxplot(df, columns):
    """
    Visualizes the distribution of specified columns using boxplots to identify outliers.

    Requested columns are validated first: a column that is absent from the
    DataFrame or is not numeric triggers a warning and is skipped. The figure
    is then laid out for the remaining columns only, one boxplot per row, so
    skipped columns do not leave blank panels. When no requested column can be
    plotted, no figure is created.

    Relies on the module-level imports ``plt`` (matplotlib.pyplot) and
    ``sns`` (seaborn).

    Args:
        df: pandas DataFrame.
        columns: A list of column names to visualize.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    print("Visualizing outliers using boxplots...")

    plottable = []
    for column in columns:
        if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
            plottable.append(column)
        else:
            print(f"Warning: Column '{column}' is not numeric or not found.")

    if not plottable:
        # Nothing to draw; avoid opening an empty figure.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(
        len(plottable), 1, figsize=(15, 6 * len(plottable)), squeeze=False
    )
    for ax, column in zip(axes[:, 0], plottable):
        sns.boxplot(x=df[column], ax=ax)
        ax.set_title(f'Boxplot of {column}')
    fig.tight_layout()
    plt.show()

# Example for Task 1: 'col1' ends with 100 and 'col2' ends with -5, both far
# from the other values in their column; 'col3' is text and is not plotted.
data_outliers = {
    'col1': [10, 12, 15, 18, 20, 100],
    'col2': [5, 7, 9, 11, 13, -5],
    'col3': ['A', 'B', 'A', 'C', 'B', 'A'],
}
df_outliers = pd.DataFrame(data_outliers)
visualize_outliers_boxplot(df_outliers, ['col1', 'col2'])
print("-" * 30)

# --- Task 2: Remove or adjust outliers and re-analyze ---
def handle_outliers_adjust_reanalyze(df, columns, method='remove', iqr_factor=1.5, zscore_threshold=3):
    """
    Removes or adjusts outliers in specified numeric columns and re-analyzes the dataset
    by printing descriptive statistics and visualizing boxplots.

    Args:
        df: pandas DataFrame.
        columns: A list of numeric column names to handle outliers in.
        method: 'remove' to remove outliers, 'cap' to cap them at the bounds,
                'median' to replace with the median. Default is 'remove'.
        iqr_factor: The factor to use for IQR-based outlier detection (if applicable). Default is 1.5.
        zscore_threshold: The Z-score threshold for outlier detection (if applicable). Default is 3.

    Returns:
        A pandas DataFrame with handled outliers.
    """
    df_handled = df.copy()
    print(f"\nHandling outliers using '{method}' method...")

    for column in columns:
        if column in df_handled.columns and pd.api.types.is_numeric_dtype(df_handled[column]):
            print(f"\nHandling outliers in column '{column}':")
            if method in ['remove', 'cap', 'median']:
                if method == 'remove':
                    # IQR based removal
                    Q1 = df_handled[column].quantile(0.25)
                    Q3 = df_handled[column].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_factor * IQR
                    upper_bound = Q3 + iqr_factor * IQR
                    df_handled = df_handled[(df_handled[column] >= lower_bound) & (df_handled[column] <= upper_bound)]
                    print("Outliers removed based on IQR.")
                elif method == 'cap':
                    # IQR based capping
                    Q1 = df_handled[column].quantile(0.25)
                    Q3 = df_handled[column].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_factor * IQR
                    upper_bound = Q3 + iqr_factor * IQR
                    df_handled[column] = np.where(df_handled[column] < lower_bound, lower_bound, df_handled[column])
                    df_handled[column] = np.where(df_handled[column] > upper_bound, upper_bound, df_handled[column])
                    print("Outliers capped based on IQR.")
                elif method == 'median':
                    # IQR based replacement with median
                    Q1 = df_handled[column].quantile(0.25)
                    Q3 = df_handled[column].quantile(0.75)
                    IQR = Q3 - Q1
                    lower_bound = Q1 - iqr_factor * IQR
                    upper_bound = Q3 + iqr_factor * IQR
                    median_val = df_handled[column].median()
                    df_handled[column] = np.where((df_handled[column] < lower_bound) | (df_handled[column] > upper_bound), median_val, df_handled[column])
                    print("Outliers replaced with median based on IQR.")
            elif method == 'zscore':
                # Z-score based handling (replace with median)
                mean_val = df_handled[column].mean()
                std_dev = df_handled[column].std()
                if std_dev != 0:
                    z_scores = np.abs(stats.zscore(df_handled[column]))
                    outlier_indices = z_scores > zscore_threshold
                    median_val = df_handled[column].median()
                    df_handled.loc[outlier_indices, column] = median_val
                    print(f"Outliers (Z-score > {z

SyntaxError: unterminated string literal (detected at line 98) (572954103.py, line 98)