In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the housing-price CSV from the Kaggle input directory and preview it
csv_path = '/kaggle/input/housing-price-prediction-data/housing_price_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count missing values in every column
df.isna().sum()

In [None]:
# Number of rows per value of the object-typed column (neighborhood type)
df['Neighborhood'].value_counts()

In [None]:
# Show the share of each neighborhood type as a pie chart.
# The counts are computed once and reused for both the wedge sizes and the labels.
neighborhood_counts = df["Neighborhood"].value_counts()
colors = ['#9BB8CD', '#FFF7D4', '#EEC759']
# One offset per wedge: plt.pie requires len(explode) == number of wedges, so the
# list is derived from the data instead of being hard-coded to three categories.
explode = [0.03] * len(neighborhood_counts)

plt.figure(figsize=(5, 5))
plt.pie(
    neighborhood_counts,
    labels=neighborhood_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    explode=explode,
    colors=colors,
    shadow=True,
)
plt.title('Type of Neighborhoods')
plt.show()

In [None]:
# Inspect the distribution of Price with a box plot
fig, ax = plt.subplots()
ax.boxplot(df["Price"])
ax.set_title('Price')
plt.show()

In [None]:
# Summary statistics for the whole dataset
df.describe()

In [None]:
# Label-encode Neighborhood (object -> int).
# The encoder instance gets its own name: binding it to `LabelEncoder` would shadow
# the imported class, and re-running this cell would then fail because an encoder
# instance is not callable.
label_encoder = LabelEncoder()
df['Neighborhood'] = label_encoder.fit_transform(df['Neighborhood'])
df.head()

In [None]:
# Alternative: use one-hot encoding instead of label encoding (kept disabled)
# df = pd.get_dummies(df, dtype=int)
# df.head()

In [None]:
# Check the pairwise correlation between the variables
df.corr()

In [None]:
# Render the correlation matrix as an annotated heatmap
plt.rcParams['figure.figsize'] = (5, 5)
corr_matrix = df.corr()
sns.heatmap(corr_matrix, cmap='RdPu', annot=True, fmt='.5f', annot_kws={'size': 7})

In [None]:
# Split into train and test sets: 20% held out, fixed seed for a repeatable split
features = df.drop(columns='Price')
target = df['Price']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1234
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Preview the first three rows of the training features
X_train.head(3)

In [None]:
# Preview the first three training targets
y_train.head(3)

In [None]:
# Print the summary (dtype, non-null count) of the training target
y_train.info()

In [None]:
# Candidate regressors to compare; the seeded estimators all share one seed
seed = 1234
models = [
    LinearRegression(),
    Lasso(random_state=seed),
    RandomForestRegressor(random_state=seed),
    GradientBoostingRegressor(random_state=seed),
    KNeighborsRegressor(),
    XGBRegressor(random_state=seed),
]

In [None]:
# Find the best model: fit every candidate on the training split, score it on the
# test split, and keep the one with the highest R2.
# NOTE(review): the winner is chosen on the same test split that is reported as its
# final score, so that score is optimistic. Consider a validation split or CV.
best_loss = None
best_model = None
best_score = None

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    # RMSE as sqrt(MSE). The `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    print(f'    {model.__class__.__name__}')
    print(f'    R2_score: {round(r2, 4)}    |    RMSE: {round(rmse, 6)}')
    print('=' * 60)

    # The first model always becomes the incumbent; later ones must beat its R2.
    if best_score is None or r2 > best_score:
        best_model = model
        best_score = r2
        best_loss = rmse

print(f'\nBest Model : {best_model.__class__.__name__} \n    R2_score: {best_score}    |    RMSE: {best_loss}')

In [None]:
# Store the predictions of the best model.
# best_model is fitted on the training split again before predicting on the test split.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Evaluation metrics (R2, RMSE).
# RMSE is sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4 and removed
# in 1.6, so it is avoided here.
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('R2_score: ', r2)
print('RMSE:     ', rmse)

# Prediction table: the encoded neighborhood of each test row next to its predicted
# price. The frame takes its index from X_test['Neighborhood'].
predict = pd.DataFrame({
    'Neighborhood': X_test['Neighborhood'],
    'Price': y_pred
})

In [None]:
# (rows, columns) of the prediction table
predict.shape

In [None]:
# Preview the first rows of the prediction table
predict.head()

In [None]:
# Compare mean prices per neighborhood: training actuals, test actuals and
# test predictions.
#
# The targets are grouped directly by the matching Neighborhood column. This avoids
# building a frame from a list of Series and transposing it, which upcasts
# Neighborhood to float and needs a cast back to int.

# Mean actual price per neighborhood in the training split (X_train, y_train)
train_mean_price = y_train.groupby(X_train['Neighborhood']).mean().to_frame('Price')
# Mean actual price per neighborhood in the test split (X_test, y_test)
xtest_mean_price = y_test.groupby(X_test['Neighborhood']).mean().to_frame('Price')
# Mean predicted price per neighborhood in the test split (X_test, y_pred)
pred_df = predict.groupby('Neighborhood')['Price'].mean().to_frame('Price')

# Price gaps, aligned on the Neighborhood index rather than on positional labels:
#   price_difference_0 = train - test
#   price_difference_1 = train - predicted
#   price_difference_2 = test  - predicted
# A neighborhood missing from one split yields NaN instead of raising a KeyError.
price_difference_0 = train_mean_price['Price'] - xtest_mean_price['Price']
price_difference_1 = train_mean_price['Price'] - pred_df['Price']
price_difference_2 = xtest_mean_price['Price'] - pred_df['Price']

# Combine everything into one frame for side-by-side comparison
result_df = (
    pd.DataFrame({
        'Train Price': train_mean_price['Price'],
        'Test Price': xtest_mean_price['Price'],
        'Predict Price': pred_df['Price'],
        'Train-Test': price_difference_0,
        'Train-Predict': price_difference_1,
        'Test-Predict': price_difference_2,
    })
    .rename_axis('Neighborhood')
    .reset_index()
)
# Map the label-encoded codes back to names.
# NOTE(review): assumes the encoder assigned 0/1/2 in alphabetical order
# (Rural, Suburb, Urban) — confirm against the fitted encoder's classes_.
result_df['Neighborhood'] = result_df['Neighborhood'].replace({0: 'Rural', 1: 'Suburb', 2: 'Urban'})

In [None]:
# Display the per-neighborhood comparison table
result_df

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
import numpy as np

# Stand-alone K-fold cross-validation demo on the iris dataset.
# NOTE(review): this cell rebinds `model` (the loop variable of the model search
# earlier in the notebook) and defines generic names `data`, `X`, `y`.

# Load the data
data = load_iris()
X, y = data.data, data.target

# Define the model. It is seeded so the forest, and therefore the scores, are
# reproducible across runs; seeding only the folds is not enough.
model = RandomForestClassifier(random_state=42)

# K-fold cross-validation setup (5 folds, shuffled with a fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Run the cross-validation
scores = cross_val_score(model, X, y, cv=kf)

# Print the per-fold scores and their mean
print("교차 검증 점수:", scores)
print("평균 점수:", scores.mean())

In [None]:
import numpy as np

# Stand-alone demo: bootstrap 95% confidence interval for a sample mean.
# A seeded Generator replaces the unseeded global np.random state so the printed
# interval is the same on every run.
rng = np.random.default_rng(42)

# Original dataset: 100 draws from a standard normal distribution
data = rng.normal(size=100)

# Bootstrap sampling: 1000 resamples with replacement, each the size of the data;
# one mean per resample (row)
bootstrap_samples = rng.choice(data, size=(1000, len(data)), replace=True)
bootstrap_means = np.mean(bootstrap_samples, axis=1)

# Percentile confidence interval: the 2.5th and 97.5th percentiles of the bootstrap means
lower_bound = np.percentile(bootstrap_means, 2.5)
upper_bound = np.percentile(bootstrap_means, 97.5)

print(f"평균의 95% 신뢰 구간: [{lower_bound}, {upper_bound}]")