In [119]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [120]:
# Load the CSV file into a DataFrame; every later cell reads and modifies `df`.
# NOTE(review): the path is absolute ('/content/...'); adjust it when the file lives elsewhere.
df = pd.read_csv('/content/data.csv')

In [121]:

# Columns that are excluded from the rest of the notebook.
columns_to_drop = ['sumins_grp_94baec', 'prempaid_inv_dcd836', 'prempaid_lh_d0adeb',
                   'prempaid_gi_a10d1b', 'prempaid_gi_29d435', 'prempaid_gi_856320',
                   'prempaid_gi_058815', 'prempaid_32c74c', 'ape_gi', 'sumins_gi',
                   'prempaid_d0adeb', 'prempaid_gi', 'sumins_d0adeb', 'sumins_e22a6a',
                   'ape_d0adeb', 'prempaid_gi_42e115', 'prempaid_ltc_1280bf', 'sumins_32c74c',
                   'sumins_gi_058815', 'ape_ltc_1280bf', 'ape_gi_42e115', 'ape_inv_dcd836',
                   'ape_lh_d0adeb', 'ape_gi_a10d1b', 'ape_gi_29d435', 'ape_gi_856320',
                   'ape_gi_058815', 'ape_32c74c', 'sumins_gi_42e115', 'sumins_ltc_1280bf',
                   'sumins_inv_dcd836', 'sumins_lh_d0adeb', 'sumins_grp_22decf',
                   'sumins_gi_a10d1b', 'sumins_gi_29d435', 'sumins_lh_e22a6a', 'sumins_grp_e04c3a',
                   'sumins_gi_856320']
# Drop the specified columns from the DataFrame.
# Raises KeyError if any listed column is absent (for example when this cell is re-run).
df = df.drop(columns=columns_to_drop)

# Keep only rows that have a value in both 'flg_substandard' and 'race_desc'.
df = df.dropna(subset=['flg_substandard', 'race_desc'])

# Remove rows whose country description is one of these placeholder values.
# NOTE(review): pd.read_csv parses empty fields as NaN by default, so the "" entry
# probably never matches and rows with a missing 'ctrycode_desc' are kept — confirm intent.
values_to_drop = ["unknown country code", "not applicable", ""]
# .copy() makes df an independent frame rather than a filtered selection of the
# previous one, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[~df['ctrycode_desc'].isin(values_to_drop)].copy()

In [122]:
# Flag clients who consented to more than two of the four contact channels
# (mail, email, call, SMS); stored as 1/0 rather than True/False.
df['is_consent'] = (
    (
        df['is_consent_to_mail']
        + df['is_consent_to_email']
        + df['is_consent_to_call']
        + df['is_consent_to_sms']
    ) > 2
).astype(int)

# Display the resulting flag column
df['is_consent']

0        0
1        0
2        0
3        0
4        1
        ..
17986    1
17988    0
17989    0
17990    0
17991    0
Name: is_consent, Length: 13636, dtype: int64

In [123]:
# 1 when the sum of 'is_valid_dm' and 'is_valid_email' exceeds 1, otherwise 0.
df['is_valid'] = (
    df['is_valid_dm']
    .add(df['is_valid_email'])
    .gt(1)
    .astype(int)
)

In [124]:
# Normalise 'hh_size_est' to integers in a single pass:
#   - missing values are replaced by 6
#   - the label '>4' is replaced by 5
#   - the result is cast to int
df['hh_size_est'] = (
    df['hh_size_est']
    .fillna(6)
    .replace('>4', 5)
    .astype(int)
)

In [125]:
# Integer codes for the estimated annual income bands (1 = 'A.ABOVE200K' ... 5 = 'E.BELOW30K').
income_mapping = {
    'A.ABOVE200K': 1,
    'B.100K-200K': 2,
    'C.60K-100K': 3,
    'D.30K-60K': 4,
    'E.BELOW30K': 5
}

# Map each band label to its code. Values that are missing or not a key of
# income_mapping come out of .map() as NaN and are then replaced by 6.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['annual_income_est']: that in-place call acts on
# a column selection and does not update df under pandas Copy-on-Write, which
# would leave NaN in place and make the int cast fail.
df['annual_income_est'] = (
    df['annual_income_est']
    .map(income_mapping)
    .fillna(6)
    .astype(int)
)

In [126]:
# Select every column whose name starts with "ape_" but not with "ape_lapse_"
ape_columns = df.filter(regex=r'^ape_(?!lapse_).*')

# Row-wise total of the selected columns
df['total_ape'] = ape_columns.sum(axis='columns')

In [127]:
# Select every column whose name contains "sumins_" and total them per row
sumins_col = df.filter(like='sumins_')
df['total_sumins'] = sumins_col.sum(axis='columns')

In [128]:
# Select every column whose name contains "prempaid_" and total them per row
prempaid_col = df.filter(like='prempaid_')
df['total_prempaid'] = prempaid_col.sum(axis='columns')

In [129]:
# Drop rows whose 'cltdob_fix' is the literal string 'None'.
# .copy() makes df an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning.
# NOTE(review): recent pandas versions parse "None" as NaN in read_csv by default,
# in which case this filter removes nothing — confirm against the pandas version in use.
df = df[df['cltdob_fix'] != 'None'].copy()

# Parse 'cltdob_fix' as datetime; values that cannot be parsed become NaT
df['cltdob_fix'] = pd.to_datetime(df['cltdob_fix'], errors='coerce')

# Reference date for the age calculation (the moment this cell is executed)
current_date = datetime.now()

# Define a function to calculate age
def calculate_age(row, reference_date=None):
    """Return the age in completed years for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'cltdob_fix' as a datetime-like value or a null.
    reference_date : datetime-like, optional
        Date on which the age is evaluated. When omitted, the notebook-level
        ``current_date`` is used, so ``df.apply(calculate_age, axis=1)`` keeps
        working unchanged. Pass a fixed date for reproducible results.

    Returns
    -------
    int or pd.NA
        Completed years between the birth date and the reference date, or
        ``pd.NA`` when the birth date is null.
    """
    birth_date = row['cltdob_fix']
    if pd.isnull(birth_date):
        return pd.NA
    if reference_date is None:
        reference_date = current_date
    # True (counts as 1) when the birthday has not yet occurred in the reference year
    birthday_pending = (reference_date.month, reference_date.day) < (birth_date.month, birth_date.day)
    return reference_date.year - birth_date.year - birthday_pending

# Derive the 'age' column by running calculate_age on every row
df['age'] = df.apply(calculate_age, axis='columns')

# Remove the 'cltdob_fix' column now that 'age' has been computed from it
df.drop(columns=['cltdob_fix'], inplace=True)

In [131]:
# Create the encoder in this cell: no visible cell defines `label_encoder`, so a
# fresh kernel would otherwise fail here with NameError.
label_encoder = LabelEncoder()

# Encode the 'stat_flag' column as integer labels
df['stat_flag_encoded'] = label_encoder.fit_transform(df['stat_flag'])

In [133]:
# Encode the 'cltsex_fix' column as integer labels.
# The encoder is instantiated here so this cell does not depend on a
# `label_encoder` left over from another (possibly deleted) cell; fit_transform
# refits from scratch, so the encoded values are unaffected.
label_encoder = LabelEncoder()
df['cltsex_encoded'] = label_encoder.fit_transform(df['cltsex_fix'])