In [29]:
import pandas as pd
import numpy as np
from spicy import stats

In [30]:
df=pd.read_csv("M25_DA_A1_Dataset1.csv")
df.head()

Unnamed: 0,50,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,38,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,53,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,28,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,37,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,49,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [31]:
df.tail()

Unnamed: 0,50,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
48838,38.0,Private,215419.0,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,<=50K.
48839,44.0,,321403.0,HS-grad,9.0,Widowed,,Other-relative,Black,Male,0.0,0.0,40.0,United-States,<=50K.
48840,35.0,Private,374983.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
48841,,Private,83891.0,Bachelors,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,<=50K.
48842,,Self-emp-inc,182148.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,60.0,United-States,>50K.


## REMOVING MISSING VALUES

In [32]:
def check_missing_values(col_name):
    if col_name not in df.columns:
        print("f: column 'col_name}' not found.")
        return 
    print("Checking column: {col_name}")
    col=df[col_name]

    #traverse row by row
    for i,val in enumerate(col):
        if pd.isna(val):
            print(f"Row {i}: NULL/NaN value found.")
        elif isinstance(val,str) and val.strip()=="":  
            print(f"Row {i}: Empty string found.")
        elif isinstance(val,str) and val.lower() in ["?","na","n/a","none"]:
             print(f"Row {i}: Invalid string '{val}' found.")

    print("Check complete.")

check_missing_values("fnlwgt")


Checking column: {col_name}
Row 32561: NULL/NaN value found.
Check complete.


## CHECK FOR OUTLIERS
If the data is unrealistic or very far away from avg and the rest of the values in the dataset. In that case, its necessary to once cross verify.

1. IQR Method: If data is skewed on one side.
eg. Income, Work hours, House prices, etc.

2. Z-Score Method: If data is symmetric/Normal.
eg. Height, Test Scores, etc.

#### IQR (Interquartile Range) Method

In [35]:
def check_outliers_iqr(df,column):
    Q1=df[column].quantile(0.25) #value below which 25% of the data lies
    Q3=df[column].quantile(0.75) #value above which 75% of the data lies
    IQR=Q3-Q1  #Interquartile Range i.e. spread of the middle 50% of the data

    lower_bound=Q1-1.5*IQR  #any value smaller than lower bound is too far below the normal range
    upper_bound=Q3+1.5*IQR  #any value larger than upper bound is too far above the normal range
    #1.5 is a factor commonly used as a standard rule of thumb (Tukey's method)

    outliers=df[(df[column]<lower_bound) | (df[column]>upper_bound)]
    if outliers.empty:
        print(f"No outliers found in column '{column}'.")
    else:
        print(f"Outliers found in column '{column}':")
        print(outliers)

check_outliers_iqr(df,"fnlwgt")

Outliers found in column 'fnlwgt':
       50          Workclass    fnlwgt      Education  Education_Num  \
37     48            Private  544091.0        HS-grad            9.0   
40     24            Private  507875.0            9th            5.0   
80     46            Private  446839.0        HS-grad            9.0   
110    56            Private  432376.0      Bachelors           13.0   
157    42   Self-emp-not-inc  494223.0   Some-college           10.0   
...    ..                ...       ...            ...            ...   
48678  34          State-gov  427515.0        HS-grad            9.0   
48735  55            Private  607658.0      Bachelors           13.0   
48741  35            Private  422933.0        Masters           14.0   
48822  51            Private  430340.0   Some-college           10.0   
48834  48            Private  440129.0        HS-grad            9.0   

            Martial_Status          Occupation     Relationship    Race  \
37       Married-AF-spous

#### Z-Score


In [36]:
# Example: assuming df is your dataframe
df_numeric = df.select_dtypes(include=[np.number])  # only numeric columns

# Calculate Z-scores
z_scores = np.abs(stats.zscore(df_numeric, nan_policy='omit'))

# Define threshold (commonly 3)
threshold = 3

# Create a mask for outliers
outlier_mask = (z_scores > threshold)

# Find rows that have ANY outlier across numeric columns
outlier_rows = df_numeric[outlier_mask.any(axis=1)]

print(f"Number of outlier rows: {len(outlier_rows)}")
print(outlier_rows.head())

Number of outlier rows: 3971
      fnlwgt  Education_Num  Capital_Gain  Capital_Loss  Hours_per_week
10  280464.0           10.0           0.0           0.0            80.0
23  117037.0            7.0           0.0        2042.0            40.0
28  367260.0            9.0           0.0           0.0            80.0
32  386940.0           13.0           0.0        1408.0            40.0
37  544091.0            9.0           0.0           0.0            25.0


#### ENSURING CORRECT DATA TYPES

In [39]:

def ensure_data_type(col_name):

    # Check data type
    if not pd.api.types.is_numeric_dtype(df[col_name]):
        # Convert to numeric (coerce errors → invalid parsing becomes NaN)
        df[col_name] = pd.to_numeric(df[col_name], errors="coerce")

    # Confirm new type
    print(df[col_name].dtype)

ensure_data_type("fnlwgt")

float64


In [40]:
print(df.shape)

def remove_duplicates(df):
    df=df.drop_duplicates()
    print("Duplicates removed.")
    df=df.reset_index(drop=True) #reset index after removing rows
    print(df.shape)
    return df

remove_duplicates(df)

(48843, 15)
Duplicates removed.
(48841, 15)


Unnamed: 0,50,Workclass,fnlwgt,Education,Education_Num,Martial_Status,Occupation,Relationship,Race,Sex,Capital_Gain,Capital_Loss,Hours_per_week,Country,Target
0,38,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,53,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,28,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,37,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,49,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,38,Private,215419.0,Bachelors,13.0,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,<=50K.
48837,44,,321403.0,HS-grad,9.0,Widowed,,Other-relative,Black,Male,0.0,0.0,40.0,United-States,<=50K.
48838,35,Private,374983.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
48839,,Private,83891.0,Bachelors,13.0,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,<=50K.
