# Goal: Do both Data cleaning & Feature Extraction

In [1]:
import polars as pl
import seaborn as sns

In [2]:
path = r"C:\Users\Rudra\Desktop\rural-financial-inclusion-govt-scheme-recommendation\parquet-data\lev-05\data\lev-05_merged.parquet"

pdf = pl.read_parquet(path)
pdf.collect_schema()

Schema([('Survey_Name', String),
        ('Year', String),
        ('FSU_Serial_No', String),
        ('Sector', String),
        ('State', String),
        ('NSS_Region', String),
        ('District', String),
        ('Stratum', String),
        ('Sub_stratum', String),
        ('Panel', String),
        ('Sub_sample', String),
        ('FOD_Sub_Region', String),
        ('Sample_SU_No', String),
        ('Sample_Sub_Division_No', String),
        ('Second_Stage_Stratum_No', String),
        ('Sample_Household_No', String),
        ('Questionnaire_No', String),
        ('Level', String),
        ('Item_Code', String),
        ('OutOfHome_Consumption_Quantity', Float64),
        ('OutOfHome_Consumption_Value', Float64),
        ('Total_Consumption_Quantity', Float64),
        ('Total_Consumption_Value', Float64),
        ('Source', String),
        ('Multiplier', Int64)])

In [3]:
pdf = pdf.filter(
    pl.col('State') == '23'
)

pdf['State'].unique()

State
str
"""23"""


In [4]:
lev_05 = [
    'OutOfHome_Consumption_Quantity',
    'OutOfHome_Consumption_Value',
    'Total_Consumption_Quantity',
    'Total_Consumption_Value',
    'Source',
]

pdf = pdf[lev_05]

In [5]:
pdf

OutOfHome_Consumption_Quantity,OutOfHome_Consumption_Value,Total_Consumption_Quantity,Total_Consumption_Value,Source
f64,f64,f64,f64,str
,,0.5,200.0,"""1"""
,,15.0,,"""1"""
,,14.0,,"""1"""
,,1.5,75.0,"""1"""
,,0.1,15.0,"""1"""
…,…,…,…,…
,,,237.0,""""""
,,,60.0,""""""
,,0.1,14.0,""""""
,,415.0,129.0,""""""


In [6]:
pdf = pdf.with_columns(
    [pl.col(col).cast(pl.Int32, strict=False) for col in lev_05]
)
pdf.schema

Schema([('OutOfHome_Consumption_Quantity', Int32),
        ('OutOfHome_Consumption_Value', Int32),
        ('Total_Consumption_Quantity', Int32),
        ('Total_Consumption_Value', Int32),
        ('Source', Int32)])

In [7]:
pdf.null_count()

OutOfHome_Consumption_Quantity,OutOfHome_Consumption_Value,Total_Consumption_Quantity,Total_Consumption_Value,Source
u32,u32,u32,u32,u32
1289894,1277454,146144,36200,313468


In [8]:
pdf.null_count() / pdf.shape[0]

OutOfHome_Consumption_Quantity,OutOfHome_Consumption_Value,Total_Consumption_Quantity,Total_Consumption_Value,Source
f64,f64,f64,f64,f64
0.951485,0.942309,0.107803,0.026703,0.231228


In [9]:
def check_unique(col):
    print(f"{col} contains : {pdf[col].n_unique()}")
    display(pdf[col].unique())
    display(f" This {col} have the skewness {pdf[col].skew()}")
    print(f"{'='*50}")

In [10]:
def plot_hist(col):
    sns.histplot(pdf[col])

In [11]:
# plot_hist('OutOfHome_Consumption_Quantity')

In [12]:
for col in pdf.columns:
    check_unique(col)

OutOfHome_Consumption_Quantity contains : 150


OutOfHome_Consumption_Quantity
i32
""
0
1
2
3
…
625
650
1000
1150


' This OutOfHome_Consumption_Quantity have the skewness 10.383926032082329'

OutOfHome_Consumption_Value contains : 1011


OutOfHome_Consumption_Value
i32
""
0
1
2
3
…
3030
3400
3585
3750


' This OutOfHome_Consumption_Value have the skewness 2.024162344960687'

Total_Consumption_Quantity contains : 976


Total_Consumption_Quantity
i32
""
0
1
2
3
…
3125
3500
3950
4000


' This Total_Consumption_Quantity have the skewness 16.296559491808758'

Total_Consumption_Value contains : 2110


Total_Consumption_Value
i32
""
1
2
3
4
…
6275
7060
7800
8880


' This Total_Consumption_Value have the skewness 6.000721741115648'

Source contains : 8


Source
i32
""
1.0
2.0
3.0
4.0
5.0
6.0
9.0


' This Source have the skewness 12.327904106831683'



In [13]:
pdf.columns

['OutOfHome_Consumption_Quantity',
 'OutOfHome_Consumption_Value',
 'Total_Consumption_Quantity',
 'Total_Consumption_Value',
 'Source']

In [14]:
for col in pdf.columns:
    pdf = pdf.with_columns(
        pl.col(col).fill_null(strategy='forward')
    )

In [24]:
pdf.null_count()

OutOfHome_Consumption_Quantity,OutOfHome_Consumption_Value,Total_Consumption_Quantity,Total_Consumption_Value,Source
u32,u32,u32,u32,u32
5,5,0,0,0


In [23]:
pdf = pdf.with_columns(
    pl.col('OutOfHome_Consumption_Quantity').fill_null(strategy='forward')
)

pdf = pdf.with_columns(
    pl.col('OutOfHome_Consumption_Value').fill_null(strategy='forward')
)

In [25]:
pdf.write_csv(r"C:\Users\Rudra\Desktop\rural-financial-inclusion-govt-scheme-recommendation\parquet-data\lev-05\data2\lev_05_mp_clean.csv")
pdf.write_parquet(r"C:\Users\Rudra\Desktop\rural-financial-inclusion-govt-scheme-recommendation\parquet-data\lev-05\data2\lev_05_mp_clean.parquet", compression="zstd")
print('Saved 🙌')

Saved 🙌
