## Implementations

In [0]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Switch matplotlib over to seaborn's default styling for every later figure.
sns.set()

# Record the seaborn version in the notebook output.
print(sns.__version__)

# Reminder for barplot

In [0]:
# Read the house-price table from the CSV next to the notebook and preview it.
csv_path = 'HousePrices.csv'
data = pandas.read_csv(csv_path)
data.head()

In [0]:
# Bar plot of the Prices column against the Garage column; raw arrays are passed via .values.
sns.barplot(x=data['Garage'].values, y=data['Prices'].values)

In [0]:
# Box plot of the same two columns, to compare with the bar plot.
sns.boxplot(x=data['Garage'].values, y=data['Prices'].values)

In [0]:
# Scatter plot of every (Garage, Prices) pair.
sns.scatterplot(x=data['Garage'].values, y=data['Prices'].values)

In [0]:
# Toy curve y = X^2 + 3 evaluated at the integers 1..99.
X = np.arange(1, 100)
y = X ** 2 + 3

# Build the frame column by column; note that this rebinds `data`.
data = pandas.DataFrame({'X': X, 'y': y})
print(data.head())

In [0]:
# Line plot of column y against column X of the toy frame.
sns.lineplot(x='X', y='y', data=data)

In [0]:
# Same columns drawn as individual points.
sns.scatterplot(x='X', y='y', data=data)

In [0]:
# Same columns drawn as bars, with X on the categorical axis.
sns.barplot(x='X', y='y', data=data)

# Can we teach a computer addition?

In [0]:
# Addition as a regression task: two integer inputs and their sum as target.
X1 = np.arange(1, 1000)
X2 = np.arange(1, 1000)
y = X1 + X2

data = pandas.DataFrame({'X1': X1, 'X2': X2, 'y': y})
print(data.head())

# Keep one column per addend (do not flatten with reshape(-1, 1)):
# X is the two-column input array, y the 1-D target array.
X = data[['X1', 'X2']].to_numpy()
y = data['y'].to_numpy()

In [0]:
# Plot the target y against the first input column X1.
sns.lineplot(x='X1', y='y', data=data)

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every score computed from it, is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
from sklearn.linear_model import LinearRegression

# fit() hands back the estimator, so construction and training can be chained.
reg = LinearRegression().fit(X_train, y_train)
reg

In [0]:
# Coefficients the model learned for the two input columns (X1, X2).
reg.coef_

## _How can we evaluate our model? Is accuracy suitable?_

In [0]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the held-out targets and the model's output.
predictions = reg.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f'MAE Score: {mae}')

In [0]:
# A few hand-picked (X1, X2) pairs to run through the fitted model.
examples = [
    [5, 7],
    [2, 2],
    [127, 20],
]
reg.predict(examples)

## _What if we train a linear regression model on a non-linear function?_

In [0]:
# Non-linear target: the product of the two inputs. Both inputs run over the
# same integers, so the target is the square of either one.
non_linear_X1 = np.arange(1, 1000)
non_linear_X2 = np.arange(1, 1000)
non_linear_y = non_linear_X1 * non_linear_X2

non_linear_data = pandas.DataFrame(zip(non_linear_X1, non_linear_X2, non_linear_y), columns=['X1', 'X2', 'y'])
non_linear_X = non_linear_data[['X1', 'X2']].values
non_linear_y = non_linear_data['y'].values

# Hold out 20% for testing. The seed is fixed so the split, and the MAE
# reported from it, is reproducible across runs.
(non_linear_X_train, non_linear_X_test,
 non_linear_y_train, non_linear_y_test) = train_test_split(
    non_linear_X, non_linear_y, test_size=0.2, random_state=42)

sns.lineplot(x='X1', y='y', data=non_linear_data)

In [0]:
# Fit an ordinary linear model to the non-linear (product) target.
non_linear_reg = LinearRegression()
non_linear_reg.fit(non_linear_X_train, non_linear_y_train)

# Predict on the test rows of the non-linear split. Using the addition
# dataset's X_test here would pair predictions with labels that belong to
# different rows and make the score meaningless.
non_linear_predictions = non_linear_reg.predict(non_linear_X_test)
print(f'MAE Score of non_linear_regression model: {mean_absolute_error(non_linear_y_test, non_linear_predictions)}')

In [0]:
# Rebuild the fitted line by hand so it can be drawn over the ground truth.
temp_X1 = np.arange(1, 1000)
temp_X2 = np.arange(1, 1000)
# A fitted LinearRegression predicts intercept_ + coef_ . x. The intercept
# must be included, otherwise the plotted "prediction" is shifted away from
# what the model actually outputs.
temp_y = (non_linear_reg.intercept_
          + non_linear_reg.coef_[0] * temp_X1
          + non_linear_reg.coef_[1] * temp_X2)

sns.lineplot(x='X1', y='y', data=non_linear_data, legend='brief', label='groundtruth')
sns.lineplot(x=temp_X1, y=temp_y, legend='brief', label='prediction')

# Boston House Prices

In [0]:
from sklearn.datasets import load_boston

# Load the Boston house-price dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs an older scikit-learn — confirm the
# installed version before running.
boston_dataset = load_boston()

In [0]:
# Pull the features and the target out of the loaded dataset object
# (this rebinds `data`), then print the dataset's own description.
data, target = boston_dataset.data, boston_dataset.target
print(boston_dataset.DESCR)

In [0]:
# Wrap the features in a DataFrame named after the dataset's feature names
# and attach the target as an extra PRICES column.
data = pandas.DataFrame(data, columns=boston_dataset.feature_names).assign(
    PRICES=boston_dataset.target
)
data.head()

In [0]:
# Summary statistics for every column of the frame.
data.describe()

In [0]:
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 on.
sns.scatterplot(x=data['LSTAT'], y=data['PRICES'])
plt.xlabel('% lower status of the population')
plt.ylabel('PRICES')

In [0]:
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 on.
sns.scatterplot(x=data['DIS'], y=data['PRICES'])
plt.xlabel('weighted distances to five Boston employment centres')
plt.ylabel('Prices')

In [0]:
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 on.
sns.scatterplot(x=data['RM'], y=data['PRICES'])
plt.ylabel('Prices')

In [0]:
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 on.
sns.scatterplot(x=data['CRIM'], y=data['PRICES'])
plt.ylabel('Prices')

In [0]:
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and raises a TypeError for them from 0.12 on.
sns.scatterplot(x=data['PTRATIO'], y=data['PRICES'])

In [0]:
# Distribution of the PTRATIO column.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of histplot/displot — confirm against the seaborn version printed at the top.
sns.distplot(data['PTRATIO'])
plt.xlabel('pupil-teacher ratio by town')

In [0]:
# Heatmap of the pairwise correlations between the columns of the frame.
sns.heatmap(data.corr())

In [0]:
# Columns used as model inputs, and the column used as the target.
features = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
label = ['PRICES']

# `label` is a list, so y is selected as a one-column 2-D array.
X = data[features].to_numpy()
y = data[label].to_numpy()

## Regression Model

In [0]:
from sklearn.model_selection import train_test_split

# 80% train, 10% validation, 10% test: hold out 20% first, then cut that
# hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# Report how many samples ended up in each partition.
for split_name, split_X in (
    ('whole', X),
    ('train', X_train),
    ('validation', X_valid),
    ('test', X_test),
):
    print(f'Total # of sample in {split_name} dataset: {len(split_X)}')

In [0]:
from sklearn.linear_model import LinearRegression

# Train on the training partition only; fit() returns the estimator itself.
reg = LinearRegression().fit(X_train, y_train)
reg

In [0]:
from sklearn.metrics import mean_absolute_error

# Score on the validation partition; the test partition is left untouched here.
predictions = reg.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print(f'MAE Score: {mae}')

## Can we improve the results?

> Normalization

$X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}$ (Min-Max)

$X_{norm} = \frac{X}{max(X)}$ (Max)

$X_{norm} = \frac{X}{\sum_{i}{|X_i|}}$ (L1 Norm)

$X_{norm} = \frac{X}{\sqrt{\sum_{i}{X_i^2}}}$ (L2 Norm)





In [0]:
from sklearn.preprocessing import normalize

# Alternative scalings to experiment with:
# X_norm = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_norm = normalize(X, norm='max')
# X_norm = X / np.sqrt((X*X).sum(axis=1))[:, np.newaxis] # L2 Norm

# Rescale with the L2 norm (the manual equivalent above divides each row by
# its own norm).
X_norm = normalize(X, norm='l2')

# Same 80/10/10 partitioning and seed as for the unnormalized data.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_norm, y, test_size=0.2, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [0]:
# Refit from scratch on the normalized training partition.
reg = LinearRegression().fit(X_train, y_train)
reg

In [0]:
from sklearn.metrics import mean_absolute_error

# Validation score of the model trained on normalized inputs.
predictions = reg.predict(X_valid)
mae = mean_absolute_error(y_valid, predictions)
print(f'MAE Score: {mae}')

### Evaluating model on test dataset

In [0]:
# Final evaluation on the test partition. The predictions come from X_test,
# so they must be scored against y_test; scoring them against y_valid would
# compare outputs with labels that belong to different rows.
predictions = reg.predict(X_test)
print(f'Test (Final) MAE Score: {mean_absolute_error(y_test, predictions)}')