# Boston Challenge - Regression One

## Part One - Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data set into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv("data.csv")

In [2]:
# Preview the first rows to see which columns were loaded and what they look like.
data.head()

Unnamed: 0,TOWN,TOWNNO,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,Nahant,0,2011,-70.955,42.255,24.0,24.0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,Swampscott,1,2021,-70.95,42.2875,21.6,21.6,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,Swampscott,1,2022,-70.936,42.283,34.7,34.7,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,Marblehead,2,2031,-70.928,42.293,33.4,33.4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,Marblehead,2,2032,-70.922,42.298,36.2,36.2,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [3]:
# Summary statistics; describe() only reports the numeric columns here, so the
# text column TOWN is not covered by this check.
data.describe() # every count is the same (506), so no numeric column appears to have missing values

Unnamed: 0,TOWNNO,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,47.531621,2700.357708,-71.056389,42.21644,22.532806,22.528854,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,27.571401,1380.03811,0.075405,0.061777,9.197104,9.182176,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.0,1.0,-71.2895,42.03,5.0,5.0,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,26.25,1303.25,-71.093225,42.180775,17.025,17.025,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,42.0,3393.5,-71.0529,42.2181,21.2,21.2,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,78.0,3739.75,-71.019625,42.25225,25.0,25.0,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,91.0,5082.0,-70.81,42.381,50.0,50.0,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


## Part Two - Feature Engineering

In [4]:
# Drop the columns that are not used in the rest of the notebook.
# The kept-column list is built by filtering rather than by list.remove(), so
# the cell can be re-run safely: the original raised ValueError on a second run
# because `data` had already lost these columns.
unused_columns = ["TOWN", "TRACT", "LON", "LAT", "MEDV"]
columns = [col for col in data.columns if col not in unused_columns]
data = data[columns]

In [5]:
# First values of CMEDV, the price column the later pair plots are drawn against.
data["CMEDV"].head() # in USD 1000

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: CMEDV, dtype: float64

In [6]:
# Distribution summary of CMEDV: count, mean, standard deviation, min/max and quartiles.
data["CMEDV"].describe()

count    506.000000
mean      22.528854
std        9.182176
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: CMEDV, dtype: float64

In [7]:
# Check the frame after the unused columns were removed.
data.head() # all remaining columns are numeric, ready for t-SNE

Unnamed: 0,TOWNNO,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0,24.0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,1,21.6,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,1,34.7,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,2,33.4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,2,36.2,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [8]:
from sklearn.manifold import TSNE
# TODO: perform either TSNE or PCA on the data set

## Part Three - Correlation

In [9]:
import os

from pandas.plotting import scatter_matrix

# Scatter matrix of every remaining column against every other column.
# The saved image doubles as a cache: the plot is only drawn when the file is
# not on disk yet.
# plt.savefig does not create missing directories, so make sure "out/" exists
# before the first figure is written.
os.makedirs("out", exist_ok=True)
if not os.path.exists("out/correlation.jpeg"):
    scatter_matrix(data, figsize=(30, 20))
    plt.savefig("out/correlation.jpeg")

In [10]:
# Pair plots of the price (CMEDV) against each other column, one image per
# column. An image that already exists on disk is not regenerated.
# plt.savefig does not create missing directories, so make sure "out/" exists.
os.makedirs("out", exist_ok=True)
for col in data.columns:
    if col == "CMEDV":
        continue  # plotting the price against itself carries no information
    out_path = f"out/CMEDV_{col}_correlation.jpeg"
    if not os.path.exists(out_path):
        scatter_matrix(data[["CMEDV", col]], alpha=0.3)
        plt.savefig(out_path)

In [11]:
# Heatmap of the pairwise column correlations (data.corr()), with each cell
# annotated to two decimals. Skipped when the image already exists on disk.
# savefig does not create missing directories, so make sure "out/" exists.
os.makedirs("out", exist_ok=True)
if not os.path.exists("out/heatmap.jpeg"):
    fig, ax = plt.subplots(figsize=(15, 10))
    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.heatmap(data.corr(), annot=True, fmt=".2f", ax=ax)
    fig.savefig("out/heatmap.jpeg")

## Part Four - Regression