In [12]:
import pandas as pd 
# Display settings applied for the whole notebook: row/column limits and the
# display width are set to None, and floats are rendered with six decimals.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': None,
    'display.float_format': '{:.6f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Load the raw test set (relative path) into the frame that every later cell
# transforms.
test_df = pd.read_csv('data/test.csv')



#### 1. Remove columns with missing data in more than ~50% of cases (threshold used below: 47%)
- only x_train 

In [None]:
# Percentage of missing values per column, highest first.
missing_pct = test_df.isna().mean().mul(100).sort_values(ascending=False)

# 47% is the threshold chosen for this data set: every column whose missing
# share is above it gets removed.
columns_to_drop = missing_pct[missing_pct > 47].index
column_to_drop_list = columns_to_drop.tolist()
print(column_to_drop_list)

test_df = test_df.drop(column_to_drop_list, axis=1)



#### 2. Handle Remaining Missing Values


In [None]:
from sklearn.impute import SimpleImputer


# Step 1: find the columns that still contain missing values and split them
# into numerical and categorical groups, because each group is imputed with a
# different strategy.
columns_with_missing_values = test_df.columns[test_df.isnull().any()]
null_cols_df = test_df[columns_with_missing_values]

num_cols = null_cols_df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = null_cols_df.select_dtypes(include=['object']).columns
print(f"num_cols: {num_cols}")
print(f"cat_cols: {cat_cols}")

# Step 2: create the imputers - column mean for numerical columns, most
# frequent value for categorical columns.
# NOTE(review): both imputers are fit on the test data itself; if imputers were
# fit on the training data, reusing them here would keep the two sets
# consistent - confirm against the training notebook.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Step 3: apply the imputers. Each call is skipped when its column group is
# empty, because SimpleImputer raises a ValueError on a frame with no columns.
if len(num_cols) > 0:
    test_df[num_cols] = num_imputer.fit_transform(test_df[num_cols])
if len(cat_cols) > 0:
    test_df[cat_cols] = cat_imputer.fit_transform(test_df[cat_cols])



#### 3. Convert categorical variables to integer codes (label encoding)

In [None]:
from sklearn.preprocessing import LabelEncoder

# All remaining object-dtype columns are treated as categorical.
cat_cols = test_df.select_dtypes(include='object').columns

# Simple approach, to be improved later: each column gets its own freshly
# fitted LabelEncoder, which replaces the values with integer codes.
# NOTE(review): the encoders are fit on the test data here, so the codes only
# match the training set if both contain the same categories - confirm.
for col in cat_cols:
    le = LabelEncoder()
    encoded_values = le.fit_transform(test_df[col])
    test_df[col] = encoded_values

test_df.info()
    

#### 5. Feature scaling 


In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Set the identifier aside so it is not scaled; it is re-attached afterwards
# as the last column.
test_ids = test_df['Id'].copy()
test_df_no_id = test_df.drop(columns='Id')

# NOTE(review): the scaler is fit on the test data itself; if a scaler was fit
# on the training data, reusing it would put both sets on the same scale -
# confirm against the training notebook.
test_df_no_id_scaled = pd.DataFrame(
    scaler.fit_transform(test_df_no_id),
    columns=test_df_no_id.columns,
)
test_df_no_id_scaled['Id'] = test_ids.values

test_df = test_df_no_id_scaled.copy()

test_df.head()

# Persist the cleaned and scaled test set without the index column.
test_df.to_csv('data/X_test_cleaned.csv', index=False)
