In [1]:
import numpy as np
import pandas as pd
from scipy import stats

### LOAD DATASET

In [None]:
def load_dataset(path):
    """Read a CSV into a DataFrame, reporting the outcome on stdout.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when loading raised any exception
        (the error text is printed rather than re-raised).
    """
    try:
        df = pd.read_csv(path)
        print(f"Dataset loaded successfully with shape: {df.shape}")
    except Exception as e:
        # Best-effort loader: report the failure and signal it with None.
        print(f"Error loading dataset: {e}")
        df = None
    return df

### CHECKING FOR MISSING VALUES

In [11]:
def check_missing_values(df, col_name):
    """Report, row by row, the missing or placeholder values in one column.

    Three kinds of problem are printed:

    * NULL/NaN values (as detected by ``pd.isna``),
    * strings that are empty or contain only whitespace,
    * strings that, ignoring case and surrounding whitespace, are one of
      the placeholder tokens ``?``, ``na``, ``n/a`` or ``none``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    col_name : str
        Name of the column to scan.

    Returns
    -------
    None
        Findings are written to stdout only. The reported row number is the
        positional offset within the column, not the index label.
    """
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found.")
        return
    print(f"Checking column: {col_name}")
    col = df[col_name]

    placeholders = ("?", "na", "n/a", "none")

    # Traverse row by row so each finding can be reported with its position.
    for i, val in enumerate(col):
        if pd.isna(val):
            print(f"Row {i}: NULL/NaN value found.")
        elif isinstance(val, str):
            # Strip once so padded placeholders such as " NA " are caught too.
            text = val.strip()
            if text == "":
                print(f"Row {i}: Empty string found.")
            elif text.lower() in placeholders:
                print(f"Row {i}: Invalid string '{val}' found.")

    print("Check complete.")

### CHECK FOR OUTLIERS
An outlier is a value that is unrealistic or lies very far from the average and from the rest of the values in the dataset. Such values should be cross-verified before they are used in any analysis.

1. IQR Method: use when the data is skewed to one side.
e.g. income, work hours, house prices.

2. Z-Score Method: use when the data is symmetric / approximately normal.
e.g. height, test scores.

#### IQR (Interquartile Range) Method

In [12]:
def check_outliers_iqr(df, column, factor=1.5):
    """Print the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the bounds. 1.5 is the
        common rule of thumb (Tukey's method).

    Returns
    -------
    None
        The result is written to stdout only. A missing or non-numeric
        column is reported with a message instead of raising.
    """
    if column not in df.columns:
        print(f"Column '{column}' not found.")
        return

    values = df[column]
    # Quantiles are not defined for text data; report it instead of crashing.
    if not pd.api.types.is_numeric_dtype(values):
        print(f"Column '{column}' is not numeric; cannot check for outliers.")
        return

    Q1 = values.quantile(0.25)  # value below which 25% of the data lies
    Q3 = values.quantile(0.75)  # value below which 75% of the data lies
    IQR = Q3 - Q1  # interquartile range, i.e. spread of the middle 50% of the data

    lower_bound = Q1 - factor * IQR  # anything smaller is too far below the normal range
    upper_bound = Q3 + factor * IQR  # anything larger is too far above the normal range

    # Comparisons with NaN are False, so rows with a missing value are never flagged.
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    if outliers.empty:
        print(f"No outliers found in column '{column}'.")
    else:
        print(f"Outliers found in column '{column}':")
        print(outliers)

#### Z-Score


In [None]:
def detect_outliers_zscore_columnwise(df, threshold=3):
    """Flag rows whose absolute z-score exceeds ``threshold``, column by column.

    Only the numeric columns of ``df`` are examined. For each of them the
    z-scores are computed with ``stats.zscore(..., nan_policy='omit')`` and
    the rows above the threshold are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : float, default 3
        Absolute z-score above which a row is treated as an outlier.

    Returns
    -------
    dict
        Maps each column that has outliers to a DataFrame of the flagged
        rows (restricted to the numeric columns). Columns without outliers
        are absent from the dict.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    flagged = {}

    for col in numeric_part.columns:
        abs_z = np.abs(stats.zscore(numeric_part[col], nan_policy='omit'))
        rows = numeric_part[abs_z > threshold]
        if rows.empty:
            print(f"No outliers found in column '{col}'.")
            continue
        flagged[col] = rows
        print(f"Outliers found in column '{col}':")
        print(rows)

    return flagged

### ENSURING CORRECT DATA TYPES

In [15]:

def ensure_data_type(df, col_name):
    """Make sure ``df[col_name]`` is numeric, converting it if necessary.

    A non-numeric column is converted with ``pd.to_numeric(errors="coerce")``,
    so values that cannot be parsed become NaN. The number of values lost
    this way is printed so the coercion is never silent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. The column is replaced in place on this
        same object when a conversion happens.
    col_name : str
        Name of the column to check.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (unchanged when the column
        is missing or already numeric).
    """
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found.")
        return df

    # Check data type
    if not pd.api.types.is_numeric_dtype(df[col_name]):
        already_missing = df[col_name].isna()
        # Convert to numeric (coerce errors -> invalid parsing becomes NaN)
        converted = pd.to_numeric(df[col_name], errors="coerce")
        # Values that were present before but are NaN now failed to parse.
        coerced = int((converted.isna() & ~already_missing).sum())
        df[col_name] = converted
        print(f"Converted column '{col_name}' to numeric.")
        if coerced:
            print(f"Warning: {coerced} value(s) in '{col_name}' could not be parsed and were set to NaN.")

    # Confirm new type
    print(df[col_name].dtype)
    return df

### REMOVE DUPLICATES

In [16]:
def remove_duplicates(df, col_name):
    """Drop rows that repeat a value already seen in ``col_name``.

    The first occurrence of each value is kept and the index of the result
    is renumbered from 0. The number of removed rows and the new shape are
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to de-duplicate; the caller's object is not modified.
    col_name : str
        Column whose values decide whether a row is a duplicate.

    Returns
    -------
    pandas.DataFrame
        A new de-duplicated frame, or the original ``df`` when the column
        does not exist.
    """
    if col_name in df.columns:
        before = len(df)
        df = df.drop_duplicates(subset=[col_name])
        df = df.reset_index(drop=True)
        after = len(df)
        print(f"Duplicates removed based on column '{col_name}': {before - after}. New shape: {df.shape}")
        return df

    print(f"Column '{col_name}' not found.")
    return df