In [21]:

import numpy as np
import pandas as pd
import seaborn as sns
import io
import requests
import re
import warnings
import os
print(os.listdir("../input"))
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates

from scipy import stats
from scipy.stats import norm,skew

import torch
from torch import nn
from d2l import torch as d2l

import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the old
# aliases were removed afterwards, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8-notebook'
              if 'seaborn-v0_8-notebook' in plt.style.available
              else 'seaborn-notebook')
from matplotlib.ticker import StrMethodFormatter
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
#I used plotly and sns for the visuals
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

### 查看数据

In [None]:
# Read the training and test splits of the competition data.
train_csv = "/kaggle/input/kagglefirstcompetion/train.csv"
test_csv = "/kaggle/input/kagglefirstcompetion/test.csv"
train_data, test_data = pd.read_csv(train_csv), pd.read_csv(test_csv)
# Preview the first rows of the training split as the cell output.
train_data.head()

In [None]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train_data, test_data))

In [None]:
# List the column names of each split, separated by a dashed rule.
train_columns = list(train_data.columns)
test_columns = list(test_data.columns)
print('Train columns:', train_columns)
print("------"*20)
print('Test columns:', test_columns)

### 将特征分为数字特征和字符特征

In [None]:
# Partition the training columns by dtype: 'object' columns are handled as
# text features, every other dtype as numeric.
column_dtypes = train_data.dtypes
is_object = column_dtypes == 'object'
numeric_features = column_dtypes[~is_object].index
obj_features = column_dtypes[is_object].index

train_num = train_data.loc[:, numeric_features]
train_obj = train_data.loc[:, obj_features]
# Show the text-feature part as the cell output.
train_obj

### 数字特征处理

In [None]:
# Overview of the numeric columns: shape, names, dtypes / non-null counts and
# summary statistics, each followed by a dashed rule.
print(train_num.shape)
print("------"*20)
print(train_num.columns)
print("------"*20)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
train_num.info()
print("------"*20)
print(train_num.describe())
print("------"*20)

In [None]:
# Frequency table of every numeric column, one after another.
for column in numeric_features:
    counts = train_data[column].value_counts()
    print(counts)
    print("------"*20)

### CORRELATION - NORMALITY - HOMOGENEITY 相关系数

In [None]:
# Correlation matrices of the numeric columns, using the two common definitions
# (Pearson: linear, Spearman: rank-based).
# Since pandas 2.0 DataFrame.corr no longer drops non-numeric columns silently and
# raises on the text columns, so restrict the frame to numeric/bool columns first
# (the same columns older pandas versions kept implicitly).
_numeric_train = train_data.select_dtypes(include=['number', 'bool'])
corrPearson = _numeric_train.corr(method="pearson")
corrSpearman = _numeric_train.corr(method="spearman")

### PEARSON CORRELATION

In [None]:
# Annotated heatmap of the Pearson correlation matrix on a large canvas.
figure = plt.figure(figsize=(30, 25))
heatmap_ax = figure.add_subplot(111)
sns.heatmap(corrPearson, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=heatmap_ax)
heatmap_ax.set_title("PEARSON")
heatmap_ax.set_xlabel("COLUMNS")
heatmap_ax.set_ylabel("COLUMNS")
plt.show()

### SPEARMAN CORRELATION

In [None]:
# Annotated heatmap of the Spearman correlation matrix on a large canvas.
figure = plt.figure(figsize=(30, 25))
heatmap_ax = figure.add_subplot(111)
sns.heatmap(corrSpearman, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=heatmap_ax)
heatmap_ax.set_title("SPEARMAN")
heatmap_ax.set_xlabel("COLUMNS")
heatmap_ax.set_ylabel("COLUMNS")
plt.show()

### CONCLUSION 结论

对于数值特征，Bathrooms, Full bathrooms, Tax assessed value, Annual tax amount, Listed Price, Last Sold Price具有较强正相关性;
Elementary School Distance, Middle School Distance, High School Distance具有较强负相关性。 

### Outliers 异常值处理

In [None]:
# Numeric columns retained as model inputs.
main_num_features = [
    'Bathrooms',
    'Full bathrooms',
    'Tax assessed value',
    'Annual tax amount',
    'Listed Price',
    'Last Sold Price',
]

# Frequency table of each retained numeric column.
for column in main_num_features:
    counts = train_data[column].value_counts()
    print(counts)
    print("------"*20)

### Tax assessed value

In [None]:
# Sold Price plotted against Tax assessed value, to spot outliers by eye.
fig, ax = plt.subplots()
ax.scatter(train_data['Tax assessed value'], train_data['Sold Price'])
ax.set_xlabel('Tax assessed value', fontsize=13)
ax.set_ylabel('Sold Price', fontsize=13)
plt.show()

In [None]:
# Drop outliers: rows whose tax assessed value exceeds 40 million or whose
# sold price exceeds 50 million.
train_data = train_data.drop(train_data[(train_data['Tax assessed value']>4 * 10000000) | (train_data['Sold Price']>5 * 10000000)].index)

# Re-plot to check the result. The scatter puts 'Tax assessed value' on x and
# 'Sold Price' on y; the axis labels were previously swapped.
fig, ax = plt.subplots()
ax.scatter(train_data['Tax assessed value'], train_data['Sold Price'])
plt.xlabel('Tax assessed value', fontsize=13)
plt.ylabel('Sold Price', fontsize=13)
plt.show()

### Listed Price

In [None]:
# Sold Price plotted against Listed Price, to spot outliers by eye.
fig, ax = plt.subplots()
ax.scatter(train_data['Listed Price'], train_data['Sold Price'])
ax.set_xlabel('Listed Price', fontsize=13)
ax.set_ylabel('Sold Price', fontsize=13)
plt.show()

In [None]:
# Drop the rows whose listed price lies above the cut-off.
# NOTE(review): this cut-off (2 * 100000000) is an order of magnitude larger than
# the ones used for the other price columns -- confirm it is intentional.
listed_too_high = train_data['Listed Price'] > 2 * 100000000
outlier_rows = train_data.loc[listed_too_high].index
train_data = train_data.drop(index=outlier_rows)

# Plot again to check what is left.
fig, ax = plt.subplots()
ax.scatter(train_data['Listed Price'], train_data['Sold Price'])
ax.set_xlabel('Listed Price', fontsize=13)
ax.set_ylabel('Sold Price', fontsize=13)
plt.show()

### Last Sold Price

In [None]:
# Sold Price plotted against Last Sold Price, to spot outliers by eye.
fig, ax = plt.subplots()
ax.scatter(train_data['Last Sold Price'], train_data['Sold Price'])
ax.set_xlabel('Last Sold Price', fontsize=13)
ax.set_ylabel('Sold Price', fontsize=13)
plt.show()

In [None]:
# Drop the rows whose previous sale price lies above 35 million.
last_sold_too_high = train_data['Last Sold Price'] > 3.5 * 10000000
outlier_rows = train_data.loc[last_sold_too_high].index
train_data = train_data.drop(index=outlier_rows)

# Plot again to check what is left.
fig, ax = plt.subplots()
ax.scatter(train_data['Last Sold Price'], train_data['Sold Price'])
ax.set_xlabel('Last Sold Price', fontsize=13)
ax.set_ylabel('Sold Price', fontsize=13)
plt.show()

### Target Variable 目标变量处理

In [None]:
# from scipy import stats
# from scipy.stats import norm,skew

# sns.distplot(train_data['Sold Price'] , fit=norm);

# # Get the fitted parameters used by the function
# (mu, sigma) = norm.fit(train_data['Sold Price'])
# print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# #Now plot the distribution
# plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
#             loc='best')
# plt.ylabel('Frequency')
# plt.title('SalePrice distribution')

# #Get also the QQ-plot
# fig = plt.figure()
# res = stats.probplot(train_data['Sold Price'], plot=plt)
# plt.show()

### 进行对数化处理

In [None]:
# #We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
# train_data["Sold Price"] = np.log1p(train_data["Sold Price"])

# #Check the new distribution 
# sns.distplot(train_data['Sold Price'] , fit=norm);

# # Get the fitted parameters used by the function
# (mu, sigma) = norm.fit(train_data['Sold Price'])
# print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# #Now plot the distribution
# plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
#             loc='best')
# plt.ylabel('Frequency')
# plt.title('SalePrice distribution')

# #Get also the QQ-plot
# fig = plt.figure()
# res = stats.probplot(train_data['Sold Price'], plot=plt)
# plt.show()

### 离散特征处理

In [None]:
# Overview of the text columns: shape, names, dtypes / non-null counts and
# summary statistics, each followed by a dashed rule.
print(train_obj.shape)
print("------"*20)
print(train_obj.columns)
print("------"*20)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
train_obj.info()
print("------"*20)
print(train_obj.describe())
print("------"*20)

In [None]:
# Frequency table of every text column, one after another.
for column in obj_features:
    counts = train_data[column].value_counts()
    print(counts)
    print("------"*20)

In [None]:
# Text columns retained as model inputs.
main_obj_features = [
    'Type',
    'Cooling',
    'Bedrooms',
    'Region',
    'Cooling features',
    'City',
    'State',
]

# Frequency table of each retained text column.
for column in main_obj_features:
    counts = train_data[column].value_counts()
    print(counts)
    print("------"*20)

## 特征工程

### 合并训练集与测试集

In [None]:
# ntrain = train_data.shape[0]
# ntest = test_data.shape[0]
# y_train = train_data['Sold Price'].values
# train_data_num = train_data[main_num_features]
# test_data_num = test_data[main_num_features]
# all_data_num = pd.concat((train_data_num, test_data_num)).reset_index(drop=True)
# # all_data_num.drop(['Sold Price'], axis=1, inplace=True)
# print("all_data_num size is : {}".format(all_data_num.shape))

In [None]:
# Row counts of both splits and the raw training targets.
ntrain, ntest = len(train_data), len(test_data)
y_train = train_data['Sold Price'].values
all_features = main_num_features + main_obj_features

# Targets as a float32 column vector of shape (n_samples, 1).
train_labels = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)

# Keep only the selected feature columns and stack train on top of test so that
# both splits go through the same preprocessing.
train_data1 = train_data.loc[:, all_features]
test_data1 = test_data.loc[:, all_features]
all_data = pd.concat([train_data1, test_data1], ignore_index=True)
print("all_data size is : {}".format(all_data.shape))

## 空值填充

In [None]:
# Numeric features: replace missing values with the column mean
# (computed over the stacked train + test rows).
numeric_means = all_data[main_num_features].mean()
all_data[main_num_features] = all_data[main_num_features].fillna(numeric_means)

### One-Hot

In [None]:
# Text features: one-hot encode them; dummy_na=True adds a separate indicator
# column for missing values.
# dtype=float keeps the whole frame numeric. pandas >= 2.0 emits bool dummy columns
# by default, which turns `all_data.values` into an object array that torch.tensor()
# cannot convert.
all_data = pd.get_dummies(all_data, dummy_na=True, dtype=float)
all_data.shape

### Missing Data 空值处理

In [None]:
# all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
# all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
# missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
# missing_data.head(50)

In [None]:
# f, ax = plt.subplots(figsize=(15, 12))
# plt.xticks(rotation='90')
# sns.barplot(x=all_data_na.index, y=all_data_na)
# plt.xlabel('Features', fontsize=15)
# plt.ylabel('Percent of missing values', fontsize=15)
# plt.title('Percent missing data by feature', fontsize=15)

#### Tax assessed value

In [None]:
# all_data['Tax assessed value'] = all_data['Tax assessed value'].fillna(all_data['Tax assessed value'].mean())

#### High School Distance

In [None]:
# all_data['High School Distance'] = all_data['High School Distance'].fillna(all_data['High School Distance'].mean())

#### Annual tax amount

In [None]:
# all_data['Annual tax amount'] = all_data['Annual tax amount'].fillna(all_data['Annual tax amount'].mean())

#### Elementary School Distance

In [None]:
# all_data['Elementary School Distance'] = all_data['Elementary School Distance'].fillna(all_data['Elementary School Distance'].mean())

#### Full bathrooms

In [None]:
# all_data['Full bathrooms'] = all_data['Full bathrooms'].fillna(all_data['Full bathrooms'].mean())

#### Middle School Distance

In [None]:
# all_data['Middle School Distance'] = all_data['Middle School Distance'].fillna(all_data['Middle School Distance'].mean())

#### Last Sold Price

In [None]:
# all_data['Last Sold Price'] = all_data['Last Sold Price'].fillna(all_data['Last Sold Price'].mean())

#### Listed Price

In [None]:
# all_data['Listed Price'] = all_data['Listed Price'].fillna(all_data['Listed Price'].mean())

#### Bathrooms

In [None]:
# all_data['Bathrooms'] = all_data['Bathrooms'].fillna(all_data['Bathrooms'].mean())

In [None]:
# main_num_features = ['Bathrooms', 'Full bathrooms', 'Tax assessed value', 'Annual tax amount', 
#                  'Listed Price', 'Last Sold Price', 'Elementary School Distance', 'Middle School Distance', 
#                  'High School Distance']

In [None]:
# main_obj_features = ['Type', 'City', 'Region']

In [None]:
# features = main_num_features + main_obj_features
# features

In [None]:
# all_data = all_data[features]

In [None]:
# all_data = pd.get_dummies(all_data, dummy_na=True)
# all_data.shape

In [None]:
# The first n_train rows of the stacked frame are the training samples and the
# remaining rows are the test samples; convert each part to a float32 tensor.
n_train = len(train_data)
feature_matrix = all_data.values

train_features = torch.tensor(feature_matrix[:n_train], dtype=torch.float32)
test_features = torch.tensor(feature_matrix[n_train:], dtype=torch.float32)

### 定义网络模型

In [None]:
# Width of the input layer = number of columns in the encoded feature matrix.
in_features = train_features.shape[1]

def get_net():
    """Build a fresh network: Linear(in_features, 64) -> ReLU -> Linear(64, 1)."""
    hidden_units = 64
    layers = [
        nn.Linear(in_features, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    ]
    return nn.Sequential(*layers)

### 定义损失函数

In [None]:
# Mean squared error; used both as the training objective and inside log_rmse.
loss = nn.MSELoss()

def log_rmse(net, features, labels):
    """Return the RMSE between log(prediction) and log(label) as a Python float.

    Args:
        net: callable mapping a feature tensor to a prediction tensor.
        features: input tensor passed to ``net``.
        labels: target tensor; must be positive because its log is taken as is.

    Returns:
        float: sqrt(MSE(log(clamped predictions), log(labels))).
    """
    # This is a pure evaluation metric: disable autograd so that no computation
    # graph is built for the (possibly full-dataset) forward pass.
    with torch.no_grad():
        # Clamp predictions below 1 up to 1 so the logarithm stays stable.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds),
                               torch.log(labels)))
    return rmse.item()

### 定义训练函数

In [None]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` with Adam on the MSE loss and record log-RMSE after each epoch.

    Mini-batches come from ``d2l.load_array``. After every epoch the log-RMSE on
    the full training set is recorded; when ``test_labels`` is not None the
    log-RMSE on the held-out set is recorded as well.

    Returns:
        (train_ls, test_ls): per-epoch log-RMSE lists; ``test_ls`` is empty when
        ``test_labels`` is None.
    """
    batches = d2l.load_array((train_features, train_labels), batch_size)
    # Adam optimizer; weight_decay is handed straight to the optimizer.
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)
    has_eval_set = test_labels is not None
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(X), y)
            batch_loss.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if has_eval_set:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

### K折交叉验证

In [None]:
def get_k_fold_data(k, i, X, y):
    """Split ``(X, y)`` into ``k`` contiguous folds and return fold ``i`` as validation.

    Folds are ``X.shape[0] // k`` rows each, taken in order. The last fold
    additionally takes the ``X.shape[0] % k`` leftover rows so that every sample
    ends up in exactly one fold (previously those rows were silently dropped).

    Args:
        k: number of folds, must be greater than 1.
        i: index of the validation fold, ``0 <= i < k``.
        X: 2-D feature tensor, one row per sample.
        y: label tensor with the same first dimension as ``X``.

    Returns:
        (X_train, y_train, X_valid, y_valid)
    """
    assert k > 1
    assert 0 <= i < k
    n_samples = X.shape[0]
    fold_size = n_samples // k
    X_parts, y_parts = [], []
    for j in range(k):
        # The final fold runs to the end of the data to absorb the remainder.
        stop = n_samples if j == k - 1 else (j + 1) * fold_size
        idx = slice(j * fold_size, stop)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    # Concatenate the training folds once instead of re-copying them per fold.
    X_train = torch.cat(X_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return X_train, y_train, X_valid, y_valid

In [None]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation and return the mean final log-RMSE values.

    For each fold a fresh network from ``get_net()`` is trained on the other
    folds and evaluated on the held-out fold. The learning curves of the first
    fold are plotted, and the final train/validation log-RMSE of every fold is
    printed.

    Returns:
        (mean final training log-RMSE, mean final validation log-RMSE)
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            # The curves hold log_rmse values, so label the axis 'log rmse'
            # (it used to say 'rmse'), matching the plot in train_and_pred.
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='log rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{i + 1}，训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

### 模型选择（确定超参数）

In [None]:
# Hyperparameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 0.01
weight_decay = 0.001
batch_size = 64

# Cross-validate and report the fold-averaged final log-RMSE values.
train_l, valid_l = k_fold(
    k, train_features, train_labels,
    num_epochs, lr, weight_decay, batch_size,
)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

In [None]:
# Hyperparameters for the final fit on the full training set.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.01, 0.001, 64
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Train a fresh network on all training rows and predict the test rows.

    The training log-RMSE curve is plotted and its final value printed.
    ``test_data`` is accepted but not read by this function.

    Returns:
        pandas.Series holding one prediction per row of ``test_features``.
    """
    net = get_net()
    # No validation split here, hence the two None arguments.
    history = train(net, train_features, train_labels, None, None,
                    num_epochs, lr, weight_decay, batch_size)
    train_ls = history[0]
    epochs = np.arange(1, num_epochs + 1)
    d2l.plot(epochs, [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'训练log rmse：{float(train_ls[-1]):f}')
    # Apply the trained network to the test set and flatten to one value per row.
    test_outputs = net(test_features).detach().numpy()
    return pd.Series(test_outputs.reshape(-1))

In [None]:
# sub_data = pd.read_csv('/kaggle/input/kagglefirstcompetion/sample_submission.csv')

# Final fit on the full training set; preds_1 holds the test-set predictions.
preds_1 = train_and_pred(
    train_features, test_features, train_labels, test_data,
    num_epochs=num_epochs, lr=lr, weight_decay=weight_decay, batch_size=batch_size,
)

In [None]:
# !pip install --upgrade pandas
# import pandas as pd

# Attach the predictions to the test frame and write the two-column
# (Id, Sold Price) submission file.
test_data['Sold Price'] = preds_1
submission1 = test_data[['Id', 'Sold Price']]
submission1.to_csv('submission.csv', index=False)
print('submission is saved!')

In [None]:
# train_and_pred(train_features, test_features, train_labels, test_data,
#                num_epochs, lr, weight_decay, batch_size)