![](https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/LinePurplePink.jpg?raw=true)

# Import Libraries
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/upload-icon.png?raw=true" width="100"/>

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.gridspec import GridSpec
import seaborn as sns
from scipy import stats

from sklearn. impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
import sklearn.metrics as metrics

sns.set()
style.use('fivethirtyeight')
pd.options.mode.chained_assignment = None  # default='warn'

# Loading Dataset
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/add-to-database.png?raw=true" width="100"/>

In [5]:
data = pd.read_csv('../input/housedata/data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../input/housedata/data.csv'

# EDA + FE: Exploratory Data Analysis and Feature Engineering
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/data%20analysis.png?raw=true" width="100"/>

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.nunique()

### We can see that this dataset doesn't have NaN values

In [None]:
data.info()

In [None]:
data.describe().T

In [None]:
plt.figure(figsize=(17, 15))
# Keep only numeric columns: correlation is undefined for the text columns
# (street, city, ...), and pandas >= 2.0 raises on them instead of silently
# skipping them.
corr_data = data.drop(['date'], axis=1).select_dtypes(include='number')

# Compute the matrix once and reuse it for both the mask and the plot.
corr_matrix = corr_data.corr()
# The matrix is symmetric, so the mask is built from its upper triangle
# to show each pair of columns only once.
corr_mask = np.triu(corr_matrix)
h_map = sns.heatmap(corr_matrix, mask=corr_mask, cmap='Blues')
h_map

### Price
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/price.png?raw=true" width="100"/>

#### The price column has outliers

In [None]:
sns.histplot(data.price)

#### Let's remove them!

In [None]:
def remove_outliers(data, x, *, lower=1, factor=1.5):
    """Return ``data`` without the rows whose ``x`` value is out of range.

    A row is kept when ``lower < data[x] < Q3 + factor * IQR``. Only the
    upper bound is derived from the inter-quartile range; the lower bound is
    a plain threshold supplied by the caller.

    Args:
        data: DataFrame to filter.
        x: Name of the column the bounds are applied to.
        lower: Exclusive lower bound. The default of 1 keeps the original
            behaviour of discarding values of 1 or less (e.g. zero prices).
        factor: Multiplier applied to the IQR to obtain the upper fence.

    Returns:
        The filtered DataFrame. The frame passed in is not modified.
    """
    q25, q75 = np.percentile(data[x], [25, 75])
    upper = q75 + (q75 - q25) * factor
    data = data[(data[x] < upper) & (data[x] > lower)]
    print(f"Outliers of {x} are removed\n")
    return data

In [None]:
data = remove_outliers(data, 'price')

In [None]:
# Two stacked panels: the price histogram on top, its probability plot below.
fig = plt.figure(figsize=(17, 15))
grid = GridSpec(nrows=2, ncols=1, figure=fig)
ax1 = fig.add_subplot(grid[0, :])
ax2 = fig.add_subplot(grid[1, :])

price_values = data['price']
sns.histplot(price_values, kde=True, ax=ax1)
stats.probplot(price_values, plot=ax2)

In [None]:
# Plot every price against its row position to spot remaining extremes.
y = data['price'].to_numpy()
plt.figure(figsize=(20, 6))
plt.subplot(1, 3, 1)
plt.plot(np.arange(y.size), y, '.')
plt.ylabel('price')
plt.xlabel('index')

In [None]:
data.shape


### Date
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/date.png?raw=true" width="100"/>

In [None]:
data.date = pd.to_datetime(data.date)

In [None]:
data.date.dt.year.value_counts()

In [None]:
# Top panel: number of rows per month; bottom panel: price spread per month.
fig = plt.figure(figsize=(17, 15))
grid = GridSpec(nrows=2, ncols=1, figure=fig)
ax1 = fig.add_subplot(grid[0, :])
ax2 = fig.add_subplot(grid[1, :])

sale_month = data['date'].dt.month
sns.countplot(x=sale_month, ax=ax1)
sns.boxplot(x=sale_month, y='price', data=data, ax=ax2)

In [None]:
fig = plt.figure(figsize=(15, 10))
# Pass the series through `x=` like the other countplot calls in this
# notebook; newer seaborn releases treat a lone positional argument as `data`.
sns.countplot(x=data.date.dt.day)

In [None]:

sns.catplot(x=data.date.dt.day.values, y='price', data=data, height=5, aspect=2)

#### The date column has to be deleted

In [None]:
data.drop('date', axis=1, inplace=True)

#### Checking:

In [None]:
data.head(3)

### Bedrooms
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/bedroom.png?raw=true" width="100"/>

In [None]:
sns.catplot(x='bedrooms', y='price', data=data, height=5, aspect=2)

In [None]:
fig = plt.figure(figsize=(12, 8))
sns.histplot(data.bedrooms)

#### We can see outliers where the value is >= 7. Let's remove them

In [None]:
data = data[data.bedrooms < 7]

In [None]:
fig = plt.figure(figsize=(12, 8))
sns.histplot(x=data.bedrooms)
plt.ylabel('Amount')

In [None]:
data.shape

### Bathrooms
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/bathroom.png?raw=true" width="100"/>

In [None]:
fig = plt.figure(figsize=(12, 8))
sns.countplot(x=data['bathrooms'])

#### This attribute has outliers

In [None]:
sns.catplot(x='bathrooms', y='price', data=data, height=5, aspect=2)

In [None]:
# Keep rows with fewer than 3.75 bathrooms, then fold 0.75 and 1.25 into 1.
mask = data.bathrooms < 3.75
data = data.loc[mask]
folded_values = data.bathrooms.isin([0.75, 1.25])
data.bathrooms = data.bathrooms.mask(folded_values, 1)

In [None]:
# Pretty good !
sns.catplot(x='bathrooms', y='price', data=data, height=5, aspect=2)

In [None]:
data.shape

### Square footage
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/floor-plan.png?raw=true" width="100"/>

In [None]:
sqft_cols = data.columns[data.columns.str.contains('sqft')]
sqft_cols

In [None]:
def show_count_plots(data, cols):
    """Draw a histogram with a KDE overlay for every column in ``cols``.

    The panels share one figure and are laid out two per row. The number of
    rows grows with the number of columns, so more than four columns no
    longer overflow the grid.

    Args:
        data: DataFrame that contains the columns to plot.
        cols: Iterable of column names; one panel is drawn per name.
    """
    cols = list(cols)
    per_row = 2
    # Ceiling division, with a floor of two rows so that the layout for up
    # to four columns is exactly the original 2x2 grid on a 10x10 figure.
    n_rows = max(2, -(-len(cols) // per_row))
    fig = plt.figure(figsize=(10, 5 * n_rows))
    grid = GridSpec(ncols=per_row,
                    nrows=n_rows, figure=fig)
    for i, name in enumerate(cols):
        ax = fig.add_subplot(grid[i])
        sns.histplot(data[name], kde=True, ax=ax)

In [None]:
show_count_plots(data, sqft_cols)

### Floors
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/floors.png?raw=true" width="100"/>

In [None]:
sns.catplot(x='floors', y='price', data=data, height=5, aspect=2)

In [None]:
# Collapse half-storey values into whole floor counts
# (1.5 -> 1, 2.5 and 3.5 -> 3).
mask_floors = {
    1: 1, 1.5: 1,
    2: 2,
    2.5: 3, 3: 3, 3.5: 3,
}
data['floors'] = data['floors'].map(mask_floors)

In [None]:
# Excellent !
sns.catplot(x='floors', y='price', data=data, height=5, aspect=2)

#### Let's check the attributes again

In [None]:
data.head(3)

### WaterFront
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/water.png?raw=true" width="100"/>

In [None]:
print(data.waterfront.value_counts())
sns.catplot(x='waterfront', y='price', data=data,
            kind='box', height=5, aspect=2)

### View
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/view-file.png?raw=true" width="100"/>

In [None]:
print(data.view.value_counts())
sns.catplot(x='view', y='price', data=data,
            kind='boxen', height=5, aspect=2)

### Condition
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/condition.png?raw=true" width="100"/>

In [None]:
# Print how often each condition value occurs, then show price per
# condition twice: as a box plot (left) and as a strip plot (right).
print(data['condition'].value_counts())

fig = plt.figure(figsize=(15, 10))
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

condition_plot_args = dict(x='condition', y='price', data=data)
sns.boxplot(ax=ax1, **condition_plot_args)
sns.stripplot(ax=ax2, **condition_plot_args)

#### Delete rows where condition = 1

In [None]:
mask_cond = data.condition > 1
data = data[mask_cond]

In [None]:
fig = plt.figure(figsize=(12, 8))
sns.boxplot(x='condition', y='price', data=data)

### Years: built and renovated
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/years.png?raw=true" width="100"/>

In [None]:
sns.histplot(data.yr_built, kde=True)

In [None]:
sns.histplot(data.yr_renovated, kde=True)

#### If the house was renovated, set the value to 1; otherwise set it to 0

In [None]:
# Binary flag: 0 where the stored renovation year is 0, 1 everywhere else.
was_renovated = data['yr_renovated'].ne(0)
data['yr_renovated'] = was_renovated.astype('int64')

In [None]:
print(data.yr_renovated.value_counts())
sns.catplot(x='yr_renovated', y='price', data=data,
            kind='bar', height=5, aspect=2)

#### Checking the data

In [None]:
data.head(3)

### Location information
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/location.png?raw=true" width="100"/>

#### Street

In [None]:
data.street.value_counts()

#### This column would prevent the model from finding a common pattern, so we drop it

In [None]:
data.drop('street', axis=1, inplace=True)

#### City

In [None]:
data.city.value_counts().head(10)

In [None]:
fig = plt.figure(figsize=(12, 8))
# Top 5 cities
data.city.value_counts().head(5).plot.pie()

#### State zip

In [None]:
print(data.statezip.nunique())
print(data.statezip.value_counts())

In [None]:
data.drop('statezip', axis=1, inplace=True)

#### Country

In [None]:
sns.countplot(x=data.country)

In [None]:
data.drop('country', axis=1, inplace=True)

### Checking the dataset
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/checking.png?raw=true" width="100"/>

In [None]:
# Recompute the correlations from the cleaned frame: `corr_data` and
# `corr_mask` were built before the cleaning steps, so reusing them would
# only redraw the first heatmap. Text columns are excluded because
# correlation is defined for numeric data only.
clean_corr = data.select_dtypes(include='number').corr()
h_map = sns.heatmap(clean_corr, mask=np.triu(clean_corr), cmap='Blues')
h_map

In [None]:
print(data.shape)
data.head(3)

# Prepare data for training
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/prepare.png?raw=true" width="100"/>

In [None]:
# Features are every column except the target; the target is price cast to int.
X = data.drop(columns='price')
y = data['price'].astype(int)

### Scaler

In [None]:
# set up preprocessing numeric columns
imp_median = SimpleImputer(strategy='median', add_indicator=True)
scaler = StandardScaler()

In [None]:
# set up preprocessing categorical columns
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# select columns by datatype
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

In [None]:
# Combine both branches: numeric columns are median-imputed then scaled,
# non-numeric columns are constant-imputed then one-hot encoded.
numeric_steps = make_pipeline(imp_median, scaler)
categorical_steps = make_pipeline(imp_constant, ohe)
preprocessor = make_column_transformer(
    (numeric_steps, num_cols),
    (categorical_steps, cat_cols),
)

# Model training and visualising
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/external-training-lineal-.png?raw=true" width="100"/>

In [None]:
# create a pipeline
def pipeline_model(X, y, data):
    """Train a linear regression pipeline and report how it performs.

    Steps, in order:
      1. Build a pipeline from the module-level ``preprocessor`` and a
         ``LinearRegression`` estimator.
      2. Print the mean of ``cross_val_score`` over the full ``X`` / ``y``.
      3. Hold out 20% of the rows (``random_state=32``), fit on the rest
         and print MSE, RMSE, MAE and the train / test scores.
      4. Print a few real-vs-predicted samples and draw the comparison
         plot through the two helper functions.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        data: Full DataFrame (features plus ``price``), forwarded to
            ``visualize_model_results``.
    """
    model = make_pipeline(preprocessor, LinearRegression())

    cv_scores = cross_val_score(model, X, y)
    print(cv_scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=32)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Compute each metric once, then format them together.
    mse = metrics.mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    mae = metrics.mean_absolute_error(y_test, predicted)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    print(f'MSE             : {mse}\n'
          f'\nRMSE            : {rmse}\n'
          f'MAE             : {mae}\n'
          f'\n'
          f'Score (train)   : {train_score}\n'
          f'Score (test)    : {test_score}\n')

    comparison_of_results(X_test, y_test, model)
    visualize_model_results(data, model)

def comparison_of_results(X_test, y_test, model, times=5):
    """Print randomly chosen real targets next to the model's predictions.

    Args:
        X_test: DataFrame of held-out features.
        y_test: Series of true target values, positionally aligned with
            ``X_test``.
        model: Fitted estimator exposing ``predict``.
        times: Number of random rows to print.
    """
    n_rows = y_test.shape[0]
    if n_rows == 0:
        # Nothing to sample from; randint would raise on an empty range.
        return
    for _ in range(times):
        # randint's upper bound is exclusive, so passing n_rows makes every
        # row eligible, including the last one, and works for a single row.
        rnd = np.random.randint(0, n_rows)
        real = y_test.iloc[rnd]
        # Double brackets select a one-row DataFrame, which keeps each
        # column's dtype instead of collapsing the row into one Series.
        pred = int(model.predict(X_test.iloc[[rnd]])[0])
        print(f'Real Value      ----->>>>> {real} $\n'
              f'Predicted Value ----->>>>> {pred} $')
        print()


def visualize_model_results(data, model):
    """Scatter real prices and model predictions over price-sorted rows.

    Rows are ordered by ascending ``price``; the x axis is the position of
    each row in that ordering. Real values are drawn in red and the
    predictions as small dots on the same axes.

    Args:
        data: DataFrame containing the features and a ``price`` column.
        model: Fitted estimator exposing ``predict``.
    """
    plt.figure(figsize=(17, 10))
    ordered = data.sort_values(by='price')
    features = ordered.drop(columns='price')
    actual = ordered['price'].astype(int)
    positions = range(len(features))

    plt.scatter(positions, actual, color='red', label='Real')
    plt.scatter(positions, model.predict(features), marker='.', label='Predict')

    plt.legend(loc=2, prop={'size': 25})

In [None]:
pipeline_model(X, y, data)

# Conclusions
<img height="100" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/conclusion.png?raw=true" width="100"/>

## <center> The Linear Regression model reached a maximum score of ~72% (usually ~69%) </center>


# <center> Thank You ! </center>

<img height="150" src="https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/icons/thank-you.png?raw=true" width="150"/>

## <center> If you have found something useful for yourself in my work, please rate it and leave a comment </center>

![](https://github.com/GeorgeGalaxy/PhotosFor.../blob/main/LinePurplePink.jpg?raw=true)