In [None]:
import pandas as pd

# Load the raw training data; later cells reuse `train`.
train = pd.read_csv('data/train.csv')

# Only the final expression of a cell is rendered automatically, so the
# preview of the first rows is shown explicitly with display() (available
# as a builtin inside the IPython/Jupyter kernel). Previously this value
# was computed and silently discarded.
display(train.head())

# train.info()
# Summary statistics; rendered as the cell's output.
train.describe()


In [None]:
# Count of missing values per column, largest first.
# display() is required here: only the last expression of a cell is rendered
# automatically, so without it this table was computed and then discarded.
missing_values = train.isnull().sum()
display(missing_values.sort_values(ascending=False))

# Percentage of missing values per column: mean of the boolean null mask * 100.
missing_values_percentage = train.isnull().mean() * 100
missing_values_percentage.sort_values(ascending=False)
# remove features where missing values are more than 50%
# (values below were recorded from an earlier run of this cell)
# PoolQC           99.794380
# MiscFeature      96.504455
# Alley            92.666210
# Fence            80.123372
# MasVnrType       61.274846
# FireplaceQu      50.034270

# Output values verification

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Summary statistics of the target column
print(train['SalePrice'].describe())

# Histogram of the target with a KDE overlay, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train['SalePrice'], kde=True, ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

## Verify skewness

In [None]:
from scipy import stats
import numpy as np

# Skewness of the raw target.
skewness = train['SalePrice'].skew()
print(f"Skewness: {skewness}")

# skewness is positive, so the distribution is right-skewed

# Log transformation of the target, then re-check the skewness so the effect
# of the transform is visible as a number and not only in the plot.
target = np.log(train['SalePrice'])
print(f"Skewness after log transformation: {target.skew()}")

# Plot the transformed target. The title and x-axis name the log scale so the
# figure cannot be mistaken for the raw-price histogram.
plt.figure(figsize=(10, 6))
sns.histplot(target, kde=True)
plt.title('Distribution of Log-Transformed House Prices')
plt.xlabel('log(Price)')
plt.ylabel('Frequency')
plt.show()

# Categorical values verification 

#### Verify individual columns

In [None]:
# Cleaned training features (alternative source kept for reference below).
X_train = pd.read_csv("data/X_train_cleaned.csv")
# X_train  = pd.read_csv("data/train.csv",keep_default_na=True)

# Column under inspection; change this name to look at another column.
column_name = "LotFrontage"
selected = X_train[column_name]

print(f"Data type: {selected.dtype}")
print(f"NaN count: {selected.isna().sum()}")
print("Unique values:")
print(selected.value_counts(dropna=False))
print("\n--Mode:")
# Last expression of the cell, so the mode is rendered as the cell output.
selected.mode()


In [None]:
# Read every field literally: keep_default_na=False together with an empty
# na_values list disables pandas' default NaN markers.
X_train = pd.read_csv("data/train.csv", keep_default_na=False, na_values=[])

# Columns that must be treated as numeric. Each column is converted from ITS
# OWN values (the previous code overwrote MasVnrArea and GarageYrBlt with the
# contents of LotFrontage). Entries that cannot be parsed become NaN because
# of errors='coerce'.
numeric_fix_cols = ["LotFrontage", "MasVnrArea", "GarageYrBlt"]
for col in numeric_fix_cols:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce')

# Remaining object-dtype columns are treated as categorical.
cat_cols = X_train.select_dtypes(include=['object']).columns

# Per-column summary: number of distinct values (NaN counted as a value),
# number of NaN entries, and the three most frequent values as examples.
cat_info = {}
for col in cat_cols:
    cat_info[col] = {
        'unique_values': X_train[col].value_counts(dropna=False).shape[0],
        'missing_values': X_train[col].isna().sum(),
        'examples': X_train[col].value_counts().head(3).index.tolist(),
    }

# Categorize features based on properties
binary_features = []
ordinal_features = []
low_card_nominal = []
high_card_features = []
tbd_features = []

# Columns classified as ordinal by name, regardless of their cardinality.
ordinal_features_cols = ['Alley', 'LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence']

for col, info in cat_info.items():
    n_unique = info['unique_values']
    if col in ordinal_features_cols:
        ordinal_features.append(col)
    elif n_unique == 2:
        binary_features.append(col)
    elif n_unique <= 10:
        low_card_nominal.append(col)
    elif n_unique <= 25:
        # 11-25 distinct values: encoding strategy still to be decided.
        tbd_features.append(col)
    else:
        # More than 25 distinct values.
        high_card_features.append(col)

print (f"\nBinary features: {binary_features}\n")
print (f"Ordinal features: {ordinal_features}\n")
print (f"Low cardinality nominal features: {low_card_nominal}\n")
print (f"High cardinality features: {high_card_features}\n")
print (f"Features to be determined: {tbd_features}\n")






# Verify if data is good for Gradient Descent

#### X_train

In [None]:
import pandas as pd
import numpy as np


def _print_scaling_summary(label, features):
    """Print the mean of column means, the mean of column stds and a NaN check.

    Args:
        label: Name printed in front of the statistics (e.g. "X_train").
        features: DataFrame to summarise.

    numeric_only=True restricts the statistics to numeric/boolean columns, so a
    stray text column no longer raises; for an all-numeric frame the result is
    the same as the plain mean()/std() call.
    """
    print(f"{label} mean:", features.mean(numeric_only=True).mean())
    print(f"{label} std:", features.std(numeric_only=True).mean())
    # Number of columns that contain at least one missing value.
    print(f"Checking for NA: {features.isnull().any().sum()}")


print("======= Train set verification =========\n")
# 'Id' is an identifier, not a feature, so it is excluded from the statistics.
# drop() returns a new frame, which keeps this cell safe to re-run.
X_train = pd.read_csv("data/X_train_cleaned.csv").drop(columns=['Id'])

# Check if the scaled data is actually scaled properly
_print_scaling_summary("X_train", X_train)

# previous results : 
# X_train mean: 9.871621621621621
# X_train std: 6.684257057008759
print("\n======= Test set verification =========\n")

X_test = pd.read_csv("data/X_test_cleaned.csv").drop(columns=['Id'])
_print_scaling_summary("X_test", X_test)

# col = X_test['MSSubClass']
# print(f"mean : {col.mean()}")
# print(f"std : {col.std()}")
# print(f"min : {col.min()}")
# print(f"max : {col.max()}")



#### y_train

In [None]:

# Load the training target and print its basic range statistics.
y_train = pd.read_csv("data/y_train.csv")

# Label -> bound method; each statistic is computed right before it is printed.
y_train_stats = {
    "y_train min:": y_train.min,
    "y_train max:": y_train.max,
    "y_train mean:": y_train.mean,
}
for label, compute in y_train_stats.items():
    print(label, compute())
