In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
from sklearn.model_selection import train_test_split


# Load the training CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train.csv")


In [None]:
df.info()

# Data Cleansing

#### 🧹 Handling Missing Values (Feature Absence)


In [None]:
# Columns with at least one missing value, plus an independent copy of just
# those columns so imputation can be prepared without touching `df` yet.
has_missing = df.isnull().any()
null_cols = df.columns[has_missing]
null_df = df.loc[:, null_cols].copy()


In [None]:
# for col in null_df.select_dtypes(include='object').columns:
# 	print(null_df[col].value_counts(dropna=False))
# 	print()

In [None]:
null_df.info()

In [None]:
# Split the columns that contain missing values by dtype: categoricals receive
# an explicit "feature absent" label (or the mode), numerics the column mean.
cat_null_cols = null_df.select_dtypes(include='object').columns
num_null_cols = null_df.select_dtypes(include='number').columns

# Basement / garage categoricals share one absence label per group.
bsmt_null_cols = cat_null_cols[cat_null_cols.str.contains('Bsmt')]
garage_null_cols = cat_null_cols[cat_null_cols.str.contains('Garage')]

# Named `fill_values` (not `dict`) so the builtin `dict` is not shadowed for
# every cell that runs after this one.
fill_values = {
    'Alley': 'NoAlley',
    # Electrical is a property every house has, so a missing entry is imputed
    # with the most frequent value rather than an absence label.
    'Electrical': df.Electrical.mode()[0],
    # A missing fireplace quality means the house has no fireplace; filling it
    # with the mode would assign a quality grade to a non-existent fireplace.
    'FireplaceQu': 'NoFireplace',
    'Fence': 'NoFence',
    'MiscFeature': 'None',
    'PoolQC': 'NoPool',
    'MasVnrType': 'None',
    **{bsmt_col: 'NoBsmtAvl' for bsmt_col in bsmt_null_cols},
    **{garage_col: 'NoGarageAvl' for garage_col in garage_null_cols},
    # Numeric gaps: column mean, truncated to an integer.
    **{num_col: int(null_df[num_col].mean()) for num_col in num_null_cols},
}

# Reassign instead of `inplace=True` so re-running the cell stays predictable.
null_df = null_df.fillna(fill_values)

# Write the imputed columns back into `df` (update aligns on index and columns).
# Expected to leave no missing values; confirm with `df.isnull().sum().sum()`.
df.update(null_df)

In [None]:
null_df.mean(numeric_only=True)

In [None]:
# `Id` is a row identifier, not a feature. Reassigning with errors='ignore'
# keeps the cell idempotent: re-running it no longer raises KeyError once the
# column is already gone.
df = df.drop(columns='Id', errors='ignore')

#### EDA

In [None]:
# df.select_dtypes(include='number')
df.info()

In [None]:
# These integer-coded columns are treated as categories, so cast them to
# object. This also removes them from the numeric-only correlations below.
coded_as_category = ['MSSubClass', 'OverallQual', 'OverallCond', 'MoSold']
df[coded_as_category] = df[coded_as_category].astype('object')

In [None]:

# Pairwise correlation matrix of the numeric columns, drawn as a heatmap.
numeric_cols = df.select_dtypes(include='number')
hn = numeric_cols.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(hn, cmap='Blues', ax=ax)
ax.set_title('Correlation b/w numerical cols')
plt.show()

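Interpretation of the per-feature correlation histograms (the `hn.hist(...)` call in the next cell, currently commented out):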

    X-axis: Correlation values (from -1 to 1).

    Y-axis: Number of features that have that correlation with the current feature.

So, for each feature:

    A peak near 1 → Strong positive correlation with many features.

    A peak near 0 → Mostly uncorrelated.

    A peak near -1 → Strong negative correlation.

It helps you see how strongly each feature is generally related to others.

In [None]:
# hn.hist(figsize=(16, 14), bins=50, xlabelsize=8, ylabelsize=8)

In [None]:
# Correlation of every numeric column with SalePrice, strongest first, kept as
# a one-column frame so it can be passed straight to a heatmap.
saleprice_corr = df.corr(numeric_only=True)['SalePrice']
cors = saleprice_corr.sort_values(ascending=False).to_frame()

In [None]:
# Annotated single-column heatmap: each feature's correlation with SalePrice.
fig, ax = plt.subplots(figsize=(4, 7))
sns.heatmap(cors, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation with SalePrice')
plt.show()

**Numerical feature selection**

In [None]:
# Keep the rows of `cors` whose correlation with SalePrice exceeds 0.45.
# SalePrice itself (correlation 1.0) sorts first, so the displayed slice skips it.
above_threshold = cors['SalePrice'] > 0.45
high_corr_features = cors.loc[above_threshold]
high_corr_features.iloc[1:]


In [None]:
# Scatter each highly correlated feature against SalePrice in a single row.
# SalePrice is removed from the x-variables: plotting the target against itself
# only adds a trivial y = x panel.
pairplot_features = high_corr_features.index.drop('SalePrice')
g = sns.pairplot(df, y_vars='SalePrice', x_vars=pairplot_features, kind='scatter')
g.figure.set_size_inches(20, 6)

In [None]:
sns.boxplot(x=df['SalePrice'])

In [None]:
from sklearn.feature_selection import f_regression

# One-hot encode the categorical columns (first level of each dropped) and rank
# the resulting indicator columns by their F-statistic against SalePrice.
y = df['SalePrice']
categorical_df = df.select_dtypes(include='object')
X_encoded = pd.get_dummies(categorical_df, drop_first=True)

# f_regression returns (F-statistics, p-values); only the F-statistics are used.
f_scores = f_regression(X_encoded, y)[0]
pd.Series(f_scores, index=X_encoded.columns).sort_values(ascending=False)

In [None]:
# Outlier filter on SalePrice: keep rows strictly inside
# (Q1 - 1.5*IQR, Q3 + 1.5*IQR), then restrict to the selected columns.
Q1, Q3 = df['SalePrice'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR

# inclusive='neither' reproduces the strict `>` / `<` comparisons.
inside_fences = df['SalePrice'].between(lower_fence, upper_fence, inclusive='neither')
iqr_df = df[inside_fences]
final_df = iqr_df[high_corr_features.index]
final_df

In [None]:
set(df.dtypes.tolist())

In [None]:
# df_cat = df.select_dtypes(include=['object', 'category'])
# cat_cols = df_cat.columns.tolist()
# n_cols = 4
# n_rows = (len(cat_cols) // n_cols + 1)

# fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5))
# fig.subplots_adjust(hspace=0.4, wspace=0.3)

# for i, col in enumerate(cat_cols):
# 	ax = axes[i // n_cols, i % n_cols] if len(cat_cols) > n_cols else axes[i]
# 	sns.boxplot(x=col, y='SalePrice', data=df, ax=ax)
# 	ax.set_title(f'Price by {col}')
# 	ax.tick_params(axis='x', rotation=45)

# for j in range(i + 1, n_rows * n_cols):
# 	fig.delaxes(axes.flatten()[j])
	

In [None]:
sns.boxplot(x=df.GarageCars, y=df.SalePrice)

In [None]:
final_df.isna().sum()

## Training

In [None]:
# Hold out 20% of the filtered rows for evaluation; the seed fixes the split.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=23,
)

In [None]:
# Select the features by name rather than by position: `iloc[:, 1:]` silently
# depended on SalePrice happening to be the first column of `final_df`.
x = train_df.drop(columns='SalePrice')
y = train_df['SalePrice']

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression of SalePrice on the selected features.
lr = LinearRegression()
lr.fit(x, y)

In [None]:
# Predict on the held-out split with the target column removed, then display
# that split for inspection.
test_df_X = test_df.drop(columns="SalePrice")
preds = lr.predict(test_df_X)

test_df

In [None]:
from sklearn.metrics import r2_score

# R^2 of the hold-out predictions; arguments are (y_true, y_pred).
r2_score(test_df.SalePrice, preds)

In [None]:
# test_df = pd.read_csv("/home/tesserxt/AllProjects/jupyter/campusx/content/house_price/test.csv")
# # imp_test_df = test_df[imp_num_cols.drop('SalePrice').index]
# # imp_test_df[imp_test_df.isna().any(axis=1)]
# # imp_test_df.fillna(imp_test_df.mean(), inplace=True)




# Submission

In [None]:
# NOTE(review): absolute, machine-specific path; the cell only runs where this
# file exists. Consider a path relative to the notebook, as used for train.csv.
test_file_path = "/home/tesserxt/AllProjects/jupyter/campusx/content/house_price/test.csv"
test_df = pd.read_csv(test_file_path)

# Same feature columns, in the same order, as the frame the model was built from.
feature_cols = high_corr_features.index.drop("SalePrice")

# Impute gaps with the per-column mean of the training-side frame instead of
# hand-typed constants; the previous fills (GarageCars=243, GarageArea=3.17,
# TotalBsmtSF=2) were not plausible values for those columns. A non-inplace
# fillna also avoids mutating a slice of `test_df` (SettingWithCopyWarning)
# and covers every feature column, not only five of them.
test_fill_values = final_df[feature_cols].mean()
test_data = test_df[feature_cols].fillna(test_fill_values)

# Read the ids without removing the column from `test_df`.
ids = test_df['Id']
preds = lr.predict(test_data)

output = pd.DataFrame({'Id': ids, 'SalePrice': preds.squeeze()})
output.to_csv('submission.csv', index=False)

# Sanity check: every count shown here should be 0.
test_data.isna().sum()


In [None]:
test_data

# Insurance dataset

In [None]:
# NOTE(review): absolute, machine-specific path. This also rebinds `df`,
# replacing the house-price frame loaded earlier in the notebook.
df = pd.read_csv("/home/tesserxt/AllProjects/jupyter/campusx/content/insurance_data.csv")
# df['gender'] = df['gender'].map({'male': 1, 'female': 0}).astype('UInt8')
# df['diabetic'] = df['diabetic'].map({'Yes': 1, 'No': 0}).astype('UInt8')
# df['smoker'] = df['smoker'].map({'Yes': 0, 'No': 1}).astype('UInt8')

# region_map = {'southeast': 0, 'northeast': 1, 'northwest': 2, 'southwest': 3}
# df['region'] = df['region'].map(region_map).astype('UInt8')
# df.insert(len(df.columns) - 1, 'bmi_category', df['bmi'].apply(categorize_bmi))
# df.dropna(inplace=True, ignore_index=True)

In [None]:
df

In [None]:
# Features are chosen by column position; the target is the last column.
# NOTE(review): if the CSV has exactly 11 columns, position 10 is also the last
# column, so the target would leak into X. Confirm against the file's schema.
feature_positions = [2, 3, 5, 6, 7, 8, 9, 10]
X = df.iloc[:, feature_positions]
y = df.iloc[:, -1]
X

In [None]:
from sklearn.model_selection import train_test_split


# 80/20 split with a fixed seed; print the four shapes as a sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# NOTE: rebinds `lr`, replacing the house-price model fitted earlier.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Fitted coefficient per feature column.
coefficients = pd.Series(lr.coef_, index=X.columns)
coefficients

In [None]:
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# R^2 of the insurance model on the held-out split; arguments are (y_true, y_pred).
print(r2_score(y_test, y_pred))

In [None]:
y_test

In [None]:
y_pred