In [161]:
import pandas as pd
import numpy as np
from spicy import stats

### LOAD DATASET

In [162]:
def load_dataset(path):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Returns the loaded DataFrame. If anything goes wrong while loading, the
    error is printed (not raised) and None is returned instead.
    """
    try:
        frame = pd.read_csv(path)
        print(f"Dataset loaded successfully with shape: {frame.shape}")
    except Exception as err:
        # Best-effort loader: any failure is reported and signalled with None.
        print(f"Error loading dataset: {err}")
        return None
    return frame

### INTERACTIVE DELETE


In [None]:
def interactive_delete(df, rows, message=""):
    """Show the flagged rows and interactively ask which of them to delete.

    Args:
        df: DataFrame the rows were detected in.
        rows: Flagged subset of ``df``; its index labels identify the
            candidate rows.
        message: Short description of why the rows were flagged, used in the
            printed text.

    Returns:
        ``df`` itself when nothing is deleted; otherwise a new DataFrame with
        the chosen rows dropped and the index reset.

    The prompt accepts ``yes`` (delete every flagged row), ``range`` (delete
    the flagged rows whose index lies in an inclusive range) and ``indices``
    (delete specific flagged rows); any other answer keeps everything.
    Malformed numbers and indices that are not among the flagged rows never
    raise: they are reported and skipped.
    """
    if rows.empty:
        print(f"No rows found for {message}.")
        return df

    print(f"Rows detected for {message}:")
    print(rows)

    choice = input("Do you want to delete these rows? (yes/no/range/indices): ").strip().lower()

    if choice == "yes":
        to_drop = rows.index
        summary = f"Deleted all rows for {message}."
    elif choice == "range":
        try:
            start = int(input("Enter start index (inclusive): "))
            end = int(input("Enter end index (inclusive): "))
        except ValueError:
            print("Invalid index entered. No rows deleted.")
            return df
        to_drop = rows.index[(rows.index >= start) & (rows.index <= end)]
        summary = f"Deleted rows from {start} to {end}."
    elif choice == "indices":
        idx_list = input("Enter indices to delete (comma separated): ")
        try:
            requested = [int(x.strip()) for x in idx_list.split(",")]
        except ValueError:
            print("Invalid index list entered. No rows deleted.")
            return df
        # Only the rows that were shown may be deleted, mirroring the "range"
        # branch; this also prevents a KeyError on labels that do not exist.
        skipped = [i for i in requested if i not in rows.index]
        if skipped:
            print(f"Ignoring indices not among the detected rows: {skipped}")
        to_drop = rows.index[rows.index.isin(requested)]
        if len(to_drop) == 0:
            print("No rows deleted.")
            return df
        summary = f"Deleted rows {to_drop.tolist()}."
    else:
        print("No rows deleted.")
        return df

    df = df.drop(to_drop).reset_index(drop=True)
    print(f"{summary} New shape: {df.shape}")
    print("\n")
    return df

### REMOVING MISSING VALUES

In [164]:
def check_missing_values(df, col_name):
    """Find missing or placeholder values in a column and offer to delete them.

    A cell counts as missing when it is NaN/None, or when it is a string that
    is blank or (ignoring case and surrounding whitespace) one of
    ``?``, ``na``, ``n/a``, ``none``.

    Args:
        df: DataFrame to inspect.
        col_name: Name of the column to check.

    Returns:
        ``df`` unchanged when the column does not exist or has no missing
        values; otherwise whatever ``interactive_delete`` returns for the
        flagged rows.
    """
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found.")
        return df
    print(f"Checking column: {col_name}")
    col = df[col_name]

    placeholders = {"", "?", "na", "n/a", "none"}
    is_placeholder = col.map(
        lambda val: isinstance(val, str) and val.strip().lower() in placeholders
    ).astype(bool)
    # A boolean mask selects by alignment with the index, so this is correct
    # even when df does not carry the default 0..n-1 index.
    invalid_mask = col.isna() | is_placeholder

    if invalid_mask.any():
        rows = df.loc[invalid_mask]
        df = interactive_delete(df, rows, f"missing/invalid values in '{col_name}'")
    else:
        print(f"No missing or invalid values found in column '{col_name}'.")

    print("Check complete.")
    return df


### CHECK FOR OUTLIERS
If a value is unrealistic, or lies very far from the average and from the rest of the values in the dataset, it is necessary to cross-verify it.

1. IQR Method: If data is skewed on one side.
eg. Income, Work hours, House prices, etc.

2. Z-Score Method: If data is symmetric/Normal.
eg. Height, Test Scores, etc.

#### IQR (Interquartile Range) Method

In [None]:
def check_outliers_iqr(df, column, multiplier=5):
    """Flag values outside ``[Q1 - k*IQR, Q3 + k*IQR]`` and offer to delete them.

    Args:
        df: DataFrame to inspect.
        column: Name of the numeric column to check.
        multiplier: The factor ``k`` applied to the interquartile range.
            Defaults to 5, the value this function has always used; Tukey's
            conventional rule of thumb is 1.5, which flags far more values.

    Returns:
        ``df`` unchanged when the column is missing, is not numeric, or has no
        outliers; otherwise whatever ``interactive_delete`` returns for the
        flagged rows.
    """
    if column not in df.columns:
        print(f"Column '{column}' not found.")
        return df

    values = df[column]
    is_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if not is_numeric:
        print(f"Column '{column}' is not numeric. IQR can only be applied to numeric columns.")
        return df

    q1 = values.quantile(0.25)  # value below which 25% of the data lies
    q3 = values.quantile(0.75)  # value below which 75% of the data lies
    iqr = q3 - q1  # spread of the middle 50% of the data

    # Anything beyond these fences is considered too far from the normal range.
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    outliers = df[(values < lower_bound) | (values > upper_bound)]
    if outliers.empty:
        print(f"No outliers found in column '{column}'.")
    else:
        print(f"Outliers found in column '{column}':")
        print(outliers)
        # Hand the flagged rows over for an interactive decision.
        df = interactive_delete(df, outliers, f"outliers in '{column}'")

    return df


#### Z-Score


In [None]:

def detect_outliers_zscore(df, column, threshold=3.5):
    """Flag values whose absolute z-score exceeds ``threshold`` and offer to delete them.

    The z-score of a value is ``(value - mean) / standard deviation``, using
    the population standard deviation (``ddof=0``) and ignoring NaN entries.

    Args:
        df: DataFrame to inspect.
        column: Name of the numeric column to check.
        threshold: Absolute z-score above which a row is flagged.

    Returns:
        ``df`` unchanged when the column is missing, is not numeric, or has no
        outliers; otherwise whatever ``interactive_delete`` returns for the
        flagged rows.
    """
    if column not in df.columns:
        print(f"Column '{column}' not found.")
        return df

    # pandas' own dtype check also copes with extension dtypes (categorical,
    # nullable, string), which np.issubdtype cannot interpret.
    is_numeric = (
        pd.api.types.is_numeric_dtype(df[column])
        and not pd.api.types.is_bool_dtype(df[column])
    )
    if not is_numeric:
        print(f"Column '{column}' is not numeric. Z-score can only be applied to numeric columns.")
        return df

    values = df[column].astype("float64")
    spread = values.std(ddof=0)
    if pd.isna(spread) or spread == 0:
        # Empty, all-NaN or constant column: no value can deviate from the mean.
        print(f"No outliers found in column '{column}'.")
        return df

    # Kept as a Series so the boolean mask stays aligned with df's index.
    z_scores = ((values - values.mean()) / spread).abs()
    outliers = df.loc[z_scores > threshold]
    if not outliers.empty:
        print(f"Outliers found in column '{column}':")
        print(outliers)
        df = interactive_delete(df, outliers, message=f"Z-score outliers in '{column}'")
    else:
        print(f"No outliers found in column '{column}'.")

    return df

### ENSURING CORRECT DATA TYPES

In [None]:
def ensure_data_type(df, col_name):
    """Check that a column is numeric and, on request, convert it.

    When the column is not numeric the user is asked whether to convert it.
    Answering ``y`` converts it with ``pd.to_numeric(errors="coerce")``, so
    values that cannot be parsed become NaN; the number of values lost this
    way is reported instead of passing silently.

    Args:
        df: DataFrame holding the column. The conversion is written back into
            this same object.
        col_name: Name of the column to check.

    Returns:
        The same ``df`` object, with the column converted if the user agreed.
    """
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found.")
        return df

    if pd.api.types.is_numeric_dtype(df[col_name]):
        print(f"Column '{col_name}' is already numeric.")
    else:
        current_dtype = df[col_name].dtype
        print(f"Column '{col_name}' is not numeric. Current dtype: {current_dtype}")

        choice = input("Do you want to convert it to numeric? (y/n): ").strip().lower()

        if choice == "y":
            missing_before = df[col_name].isna()
            converted = pd.to_numeric(df[col_name], errors="coerce")
            # Values that were present before but are NaN now failed to parse.
            lost = int((converted.isna() & ~missing_before).sum())
            df[col_name] = converted
            print(f"Column '{col_name}' converted to numeric.")
            if lost:
                print(f"Warning: {lost} value(s) in '{col_name}' could not be parsed and were set to NaN.")
        else:
            print(f"Column '{col_name}' kept as {current_dtype}.")

    # Confirm the resulting type
    print(f"Final dtype of column '{col_name}': {df[col_name].dtype}")
    return df

#### REMOVE DUPLICATES

In [None]:
def remove_duplicates(df, col_name):
    """List rows whose value in ``col_name`` is repeated and offer to delete them.

    Every row sharing a repeated value is flagged, including the first
    occurrence (``keep=False``). The flagged rows are shown sorted by the
    column and then passed to ``interactive_delete``.

    Returns ``df`` unchanged when the column is missing or holds no repeated
    values; otherwise the result of ``interactive_delete``.
    """
    if col_name in df.columns:
        repeated_mask = df.duplicated(subset=[col_name], keep=False)
        repeated = df[repeated_mask]
        if repeated.empty:
            print(f"No duplicates found in column '{col_name}'.")    
            return df

        # Sorting puts identical values next to each other for easier review.
        repeated = repeated.sort_values(by=col_name)
        print(f"\nDuplicates found in column '{col_name}':")
        print(repeated)
        return interactive_delete(df, repeated, f"duplicates in '{col_name}'")

    print(f"Column '{col_name}' not found.")
    return df