In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, r2_score

from IPython.core.display import HTML 

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# The raw file has no header row, so the column names are supplied explicitly.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# Fields are separated by runs of whitespace; sep=r'\s+' replaces the
# deprecated delim_whitespace=True and parses the file the same way.
df = pd.read_csv(
    '/kaggle/input/boston-house-prices/housing.csv',
    header=None,
    sep=r'\s+',
    names=names,
)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows; a bare last expression gets the notebook's rich
# HTML table without relying on the implicit `display` builtin.
df.head()

## Préparation des données

In [None]:
# Print each column's dtype and non-null count.
df.info()

In [None]:
# Number of non-null values per column (used to check for missing data).
df.count()

Pas de NaN : aucune colonne ne contient de valeur manquante.

In [None]:
# Frequency of each distinct value of the MEDV target column.
df.MEDV.value_counts()

In [None]:
# Scatter DIS against AGE for the rows with MEDV below 15, coloured by MEDV.
low_medv = df.loc[df['MEDV'] < 15]
low_medv.plot(
    kind='scatter',
    x='DIS',
    y='AGE',
    c='MEDV',
    cmap='rainbow',
    s=3,
    figsize=(12, 12),
)

In [None]:
# Summary statistics per column, taken before the standardisation applied further down.
df.describe()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Columns to standardise: every column of the frame except the MEDV target.
list_t = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

In [None]:
# Standardise the feature columns in place; the MEDV target is left untouched.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# done later in the notebook, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(df[list_t])
df[list_t] = scaler.transform(df[list_t])
df.describe()

## Analyse des corrélations

In [None]:
# Pairwise correlation matrix of all columns (DataFrame.corr with its defaults);
# reused by the plots and the MEDV ranking in the following cells.
tabcorr = df.corr()
tabcorr

In [None]:
# Heatmap of the absolute correlations on a 12x12 inch figure.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(tabcorr.abs(), cmap="coolwarm", ax=ax)

In [None]:
# Hierarchically clustered heatmap of the absolute correlations.
abs_corr = tabcorr.abs()
sns.clustermap(abs_corr, cmap="coolwarm")

In [None]:
from scipy.cluster import hierarchy as hc
from scipy.spatial.distance import squareform

# Turn correlations into a dissimilarity: 0 for perfectly correlated columns.
corr = 1 - df.corr()

# squareform() validates that the diagonal is exactly zero; floating-point
# rounding in the correlation can leave ~1e-16 there, so pin it explicitly.
dist = corr.to_numpy(copy=True)
np.fill_diagonal(dist, 0.0)
corr_condensed = squareform(dist)

# Ward linkage on the condensed distances, drawn as a left-oriented dendrogram.
link = hc.linkage(corr_condensed, method='ward')
plt.figure(figsize=(12,12))
# Pass labels as a plain list: a pandas Index has no unambiguous truth value,
# which trips the label check in some SciPy versions.
den = hc.dendrogram(link, labels=df.columns.tolist(), orientation='left', leaf_font_size=10)

In [None]:
# Correlation of every column with the MEDV target (MEDV itself included).
correlations = tabcorr['MEDV']
print(correlations)

In [None]:
# Remove MEDV's correlation with itself. Rebuilding from tabcorr (instead of
# dropping from `correlations` in place) keeps the cell re-runnable: the
# original raised KeyError on a second run because 'MEDV' was already gone.
correlations = tabcorr['MEDV'].drop(labels=['MEDV'])

In [None]:
# Rank the columns by the strength of their correlation with MEDV, strongest first.
ranked = correlations.abs().sort_values(ascending=False)
print(ranked)

## Splitting

In [None]:
# Separate the features from the MEDV target and hold out 20% of the rows for
# testing. train_test_split is already imported in the notebook's import cell,
# so the redundant re-import is dropped. random_state fixes the shuffle.
X = df.drop(columns=['MEDV'])
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Construction des modèles de régression

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator),
# predict the held-out rows, and print the score on the test split.
lm = LinearRegression().fit(X_train, y_train)
y_pred = lm.predict(X_test)
print(lm.score(X_test, y_test))

In [None]:
# Predicted vs. actual prices for the linear model; points on the red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_test, y_pred)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], color='red', linewidth=3)
ax.set_xlabel("Prix")
ax.set_ylabel("Prediction de prix")
ax.set_title("Prix reels vs predictions")

In [None]:
# Distribution of the linear-model residuals (actual minus predicted).
# histplot(kde=True, stat="density") is the documented replacement for the
# deprecated sns.distplot: density-normalised histogram plus a KDE curve.
sns.histplot(y_test - y_pred, kde=True, stat="density")

In [None]:
# Root mean squared error of the linear model on the test split.
mse_lr = mean_squared_error(y_test, y_pred)
print(np.sqrt(mse_lr))

### Random Forest Regressor

In [None]:
# Rebuild the split for the tree-based models. The same test_size and
# random_state as the linear-regression split are used so that all models are
# scored on the same held-out rows; the previous 0.1 made the scores and RMSEs
# of the later models incomparable with the linear baseline.
X = df.drop(columns=['MEDV'])
y = df['MEDV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn import ensemble

# Random forests are stochastic (bootstrap samples, random feature subsets);
# fixing random_state makes the printed score reproducible across runs.
rf = ensemble.RandomForestRegressor(random_state=1)
rf.fit(X_train, y_train)
y_rf = rf.predict(X_test)
# Score of the forest on the held-out rows.
print(rf.score(X_test, y_test))

In [None]:
# Predicted vs. actual prices for the random forest; points on the red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_test, y_rf)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], color='red', linewidth=3)
ax.set_xlabel("Prix")
ax.set_ylabel("Prediction de prix")
ax.set_title("Prix reels vs predictions")

In [None]:
# Distribution of the random-forest residuals (actual minus predicted).
# histplot(kde=True, stat="density") is the documented replacement for the
# deprecated sns.distplot: density-normalised histogram plus a KDE curve.
sns.histplot(y_test - y_rf, kde=True, stat="density")

In [None]:
# Root mean squared error of the random forest on the test split.
mse_rf = mean_squared_error(y_test, y_rf)
print(np.sqrt(mse_rf))

### XGB Regressor

In [None]:
import xgboost as XGB

# Gradient-boosted trees with library defaults: fit on the training split
# (fit() returns the estimator), predict the held-out rows, print the test score.
xgb = XGB.XGBRegressor().fit(X_train, y_train)
y_xgb = xgb.predict(X_test)
print(xgb.score(X_test, y_test))

In [None]:
# Predicted vs. actual prices for the XGBoost model; points on the red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(12, 12))
ax.scatter(y_test, y_xgb)
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], color='red', linewidth=3)
ax.set_xlabel("Prix")
ax.set_ylabel("Prediction de prix")
ax.set_title("Prix reels vs predictions")

In [None]:
# Distribution of the XGBoost residuals (actual minus predicted).
# histplot(kde=True, stat="density") is the documented replacement for the
# deprecated sns.distplot: density-normalised histogram plus a KDE curve.
sns.histplot(y_test - y_xgb, kde=True, stat="density")

In [None]:
# Root mean squared error of the XGBoost model on the test split.
mse_xgb = mean_squared_error(y_test, y_xgb)
print(np.sqrt(mse_xgb))