### Read the dataframe by brute force and reformat the column names.

In [453]:
import polars as pl
import re
import numpy as np


def reformat_column_names(col_name: str) -> str:
    """Split a multi-word column name into words separated by underscores.

    An underscore is inserted in front of every capital letter that is
    neither the first character of the name nor directly preceded by another
    capital letter, so "TypeName" becomes "Type_Name" while "ID" is left
    untouched.
    """
    # Two negative lookbehinds: no capital immediately before, and not at the start.
    word_start = re.compile(r"(?<![A-Z])(?<!^)([A-Z])")
    return word_start.sub(r"_\1", col_name)

# Read the data by brute force: ignore_errors lets the read continue past values
# that fail to parse, and "?" is treated as a missing value.
laptop_data: pl.DataFrame = pl.read_csv(
    source="laptopData.csv",
    ignore_errors=True,
    null_values=["?"],
)

data_cols: list[str] = laptop_data.columns

# The column that was read in as "Unnamed: 0" becomes "ID".
data_cols[data_cols.index("Unnamed: 0")] = "ID"

# Assign a plain list[str], which is what the columns setter is documented to take.
laptop_data.columns = [reformat_column_names(name) for name in data_cols]

# Refresh the local copy so it reflects the reformatted names.
data_cols: list[str] = laptop_data.columns

### Dealing with missing values.

In [454]:
def null_values_count(df:pl.DataFrame) -> list[str]:
    '''Return the names of the columns of df that hold at least one null.

    The null count of every column is inspected, but only the column names
    are returned (not the counts); they appear in the order of df.columns.
    '''
    return [name for name in df.columns if df[name].null_count() > 0]





def drop_missing_values(df:pl.DataFrame) -> tuple[pl.DataFrame, list[str]]:
    """Drop the missing rows (that is - rows in which every column is null).

    Returns a pair: the filtered dataframe, and the names of the columns
    that still contain nulls after the empty rows have been removed.
    """
    # True only for rows where every single column is null; keep all others.
    row_is_empty = pl.all_horizontal(pl.all().is_null())
    df_dropped = df.filter(~row_is_empty)

    return df_dropped, null_values_count(df_dropped)
  
    

# Remove the completely empty rows and find out which columns still have gaps.
laptop_data, null_columns = drop_missing_values(laptop_data)

# Target dtype for each entry of null_columns, matched by position.
dtypes: list[pl.DataType] = [pl.Float64(), pl.String(), pl.Float64()]

print(null_columns, dtypes, sep="\n")

['Inches', 'Memory', 'Weight']
[Float64, String, Float64]


### Dealing with rows having fewer missing values.

In [455]:

def impute_missing_values(df:pl.DataFrame, null_cols: list[str], dtypes:list[pl.DataType]) -> pl.DataFrame:
    """Fill the nulls of the given columns and return the updated dataframe.

    null_cols names the columns to impute; dtypes gives, position by
    position, the dtype each of those columns should end up with.

    - A non-numeric column first has the text "kg" removed. If its target
      dtype is String it is backward-filled; otherwise it is cast to the
      target dtype and its nulls are replaced by the column mean.
    - A numeric column is cast to the target dtype and mean-filled.

    All columns are read from df itself, so the function works on whichever
    dataframe the caller passes in.
    """
    for idx, col_name in enumerate(null_cols):
        # Use the dataframe argument, never a module-level variable.
        col: pl.Series = df[col_name]
        target_dtype = dtypes[idx]

        if not col.dtype.is_numeric():
            # Drop the unit suffix so that weight-like strings can be cast to a number.
            cleaned: pl.Series = col.str.replace(r"kg", "")

            if target_dtype == pl.String():
                # NOTE(review): a backward fill copies the next value upwards, so a
                # null in the very last row presumably stays null - confirm.
                imputed_col: pl.Series = cleaned.fill_null(strategy = "backward")
            else:
                imputed_col: pl.Series = cleaned.cast(target_dtype).fill_null(strategy = "mean")

        else:
            imputed_col: pl.Series = col.cast(target_dtype).fill_null(strategy = "mean")

        # The series keeps its name, so this overwrites the column of the same name.
        df = df.with_columns(imputed_col)

    return df

            

# Impute every column that still has nulls, using the dtypes chosen for them.
laptop_data: pl.DataFrame = impute_missing_values(
    df=laptop_data,
    null_cols=null_columns,
    dtypes=dtypes,
)

### Replacing the row index column with a new identification column.

In [456]:
# Throw away the old "ID" column and generate a fresh row index under the same
# name (presumably because rows were dropped earlier - confirm).
laptop_data = (
    laptop_data
    .drop("ID")
    .with_row_index(name="ID")
)