In [4]:
import pandas as pd
import csv
import os

In [5]:
def create_df(data_path):
    # input: the path of the csv file
    csvdata = os.path.join(data_path)
    df = pd.read_csv(csvdata)
    # output: data frame
    return df
df = create_df("housing.csv")
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
def nan_columns(df):
    # input: data frame
    nancolumns = df.columns[df.isnull().any()].tolist()
    # output: a list of names of columns that contain nan values in the data frame
    return nancolumns
nancolumns = nan_columns(df)
print nancolumns

['total_bedrooms']


In [7]:
def categorical_columns(df):
    # input: data frame
    # output: a list of column names that contain categorical values in the data frame
    catcolumns = df.select_dtypes(include=['object'])
    return catcolumns
catcolumns = categorical_columns(df)
print catcolumns

      ocean_proximity
0            NEAR BAY
1            NEAR BAY
2            NEAR BAY
3            NEAR BAY
4            NEAR BAY
5            NEAR BAY
6            NEAR BAY
7            NEAR BAY
8            NEAR BAY
9            NEAR BAY
10           NEAR BAY
11           NEAR BAY
12           NEAR BAY
13           NEAR BAY
14           NEAR BAY
15           NEAR BAY
16           NEAR BAY
17           NEAR BAY
18           NEAR BAY
19           NEAR BAY
20           NEAR BAY
21           NEAR BAY
22           NEAR BAY
23           NEAR BAY
24           NEAR BAY
25           NEAR BAY
26           NEAR BAY
27           NEAR BAY
28           NEAR BAY
29           NEAR BAY
...               ...
20610          INLAND
20611          INLAND
20612          INLAND
20613          INLAND
20614          INLAND
20615          INLAND
20616          INLAND
20617          INLAND
20618          INLAND
20619          INLAND
20620          INLAND
20621          INLAND
20622          INLAND
20623     

In [8]:
def replace_missing_features(df, nancolumns):
    # input: data frame, list of column names that contain nan values
        median = df[nancolumns].median()
        new_df1 = df.fillna(median)
    # output: data frame
        return new_df1

new_df1 = replace_missing_features(df, nancolumns)
print new_df1

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
5        -122.25     37.85                52.0        919.0           213.0   
6        -122.25     37.84                52.0       2535.0           489.0   
7        -122.25     37.84                52.0       3104.0           687.0   
8        -122.26     37.84                42.0       2555.0           665.0   
9        -122.25     37.84                52.0       3549.0           707.0   
10       -122.26     37.85                52.0       2202.0           434.0   
11       -122.26     37.85                52.0      

In [9]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer

def cat_to_num(new_df1, catcolumns):
    #input: data frame, list of categorical feature column names
    encoder = LabelEncoder()
    encodedDf = encoder.fit_transform(catcolumns)
    #print(encoder.classes_)
    oneHotEncoder = OneHotEncoder()
    oneHotDf = oneHotEncoder.fit_transform(encodedDf.reshape(-1,1)).toarray()
    #oneHotDf = pd.DataFrame(oneHotEncoder.fit_transform(encodedDf.to_dict(outtype='records')).toarray())
    one_hot = pd.get_dummies(catcolumns)
    new_df1 = new_df1.drop(catcolumns, axis = 1)
    new_df2 = new_df1.join(one_hot)
    return new_df2
new_df2 = cat_to_num(new_df1, catcolumns)
print new_df2

  y = column_or_1d(y, warn=True)


       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
5        -122.25     37.85                52.0        919.0           213.0   
6        -122.25     37.84                52.0       2535.0           489.0   
7        -122.25     37.84                52.0       3104.0           687.0   
8        -122.26     37.84                42.0       2555.0           665.0   
9        -122.25     37.84                52.0       3549.0           707.0   
10       -122.26     37.85                52.0       2202.0           434.0   
11       -122.26     37.85                52.0      

In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler
def standardization(new_df2, labelcol):
    # input: data frame and name of the label column
    new_df3 = new_df2.copy() #copy for create new_df3
    standardizer = StandardScaler()  
    data_array = standardizer.fit_transform(new_df3.loc[:,new_df3.columns!=labelcol])  #make standardize dataframe as array except labelcol
    #change to scaled array to dataframe
    new_df3 = pd.DataFrame(np.column_stack((new_df2[labelcol],data_array)),columns = new_df3.columns).set_index(new_df3.index)
    # output: scaled data frame
    new_df3[labelcol] = new_df2[labelcol].values
    return new_df3
new_df3 = standardization(new_df2, 'longitude')
print new_df3

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23  1.052548            0.982143    -0.804819       -0.972476   
1        -122.22  1.043185           -0.607019     2.045890        1.357143   
2        -122.24  1.038503            1.856182    -0.535746       -0.827024   
3        -122.25  1.038503            1.856182    -0.624215       -0.719723   
4        -122.25  1.038503            1.856182    -0.462404       -0.612423   
5        -122.25  1.038503            1.856182    -0.786942       -0.772182   
6        -122.25  1.033821            1.856182    -0.046188       -0.114070   
7        -122.25  1.033821            1.856182     0.214634        0.358054   
8        -122.26  1.033821            1.061601    -0.037021        0.305595   
9        -122.25  1.033821            1.856182     0.418616        0.405743   
10       -122.26  1.038503            1.856182    -0.198831       -0.245215   
11       -122.26  1.038503            1.856182     0

In [11]:
from sklearn.model_selection import train_test_split
import numpy as np

def my_train_test_split(new_df3, labelcol, test_ratio):
    global X_train
    global y_train
    # input: data frame, name of the label column and test data percentage
    # output: X_train, X_test, y_train, y_test  
    data = new_df3
    labels = new_df3[labelcol]
    X_train, X_test, y_train, y_test = train_test_split(data, labels,test_size=test_ratio)
    np.random.seed(0) # DON'T ERASE THIS LINE
    

    X_train = X_train.values 
    X_test = X_test.values
    y_train = y_train.values 
    y_test= y_test.values
    print(len(X_train), "train +", len(X_test), "test")
    return X_train,X_test,y_train, y_test
print my_train_test_split(new_df3,'longitude',0.4)

(12384, 'train +', 8256, 'test')
(array([[ -1.18410000e+02,  -7.82743070e-01,   2.67019951e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.21830000e+02,   1.91401157e+00,   1.85618152e+00, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.19870000e+02,   5.51588654e-01,  -1.79889009e+00, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       ..., 
       [ -1.20740000e+02,   1.96083023e+00,  -1.56051586e+00, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.21300000e+02,   1.45987060e+00,   5.84852265e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.22430000e+02,   9.77638432e-01,   1.61780729e+00, ...,
         -1.55662057e-02,   2.83074203e+00,  -3.84466489e-01]]), array([[ -1.22470000e+02,   9.96365895e-01,   1.53834921e+00, ...,
         -1.55662057e-02,   2.83074203e+00,  -3.84466489e-01],
       [ -1.17310000e+02,  -1.11983740e+0

In [12]:
def main(dataPath, testRatio, labelColumn):
    # input: the path of the csv file, test data percentage and name of the label column
    df = create_df(dataPath)
    nancolumns = nan_columns(df)
    #print nancolumns
    catcolumns = categorical_columns(df)
    #print catcolumns
    new_df1 = replace_missing_features(df, nancolumns)
    #print new_df1
    new_df2 = cat_to_num(new_df1, catcolumns)
    #print new_df2
    labelcol = labelColumn
    new_df3 = standardization(new_df2, labelcol)
    #print new_df3
    test_ratio = testRatio
    # output: X_train, X_test, y_train, y_test as numpy arrays
    return my_train_test_split(new_df3,labelcol,test_ratio)
print main("housing.csv", 0.3 , 'longitude')

(14448, 'train +', 6192, 'test')
(array([[ -1.19790000e+02,   5.14133728e-01,   1.85618152e+00, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.22210000e+02,   1.00104776e+00,   1.14105882e+00, ...,
         -1.55662057e-02,   2.83074203e+00,  -3.84466489e-01],
       [ -1.18040000e+02,  -8.24879861e-01,  -9.24851228e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       ..., 
       [ -1.21900000e+02,   4.43905743e-01,   1.87561872e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,   2.60100692e+00],
       [ -1.17930000e+02,  -9.41926503e-01,   4.25936108e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01],
       [ -1.15560000e+02,  -1.32583949e+00,  -1.08376738e+00, ...,
         -1.55662057e-02,  -3.53264264e-01,  -3.84466489e-01]]), array([[ -1.17050000e+02,  -1.42884054e+00,  -5.27560835e-01, ...,
         -1.55662057e-02,  -3.53264264e-01,   2.60100692e+00],
       [ -1.17970000e+02,  -8.01470532e-0

In [51]:
from sklearn.linear_model import LinearRegression
def model_evaluation(X_train, y_train):
    global coef_
    global intercept_
    coef_, intercept_ = [],[]

    Linear_Regression = LinearRegression()
    Linear_Regression.fit(X_train, y_train)
    
    coef_.append(Linear_Regression.coef_)
    intercept_.append(Linear_Regression.intercept_)
    
    return coef_, intercept_
print model_evaluation(X_train, y_train)

([array([  1.00000000e+00,   6.37311101e-16,   1.53972394e-16,
        -7.19837304e-17,  -1.41468204e-16,   5.88415502e-19,
        -8.81434134e-17,  -8.76531351e-17,  -7.98633117e-17,
         2.96374757e-16,   2.42561693e-16,   6.52671568e-17,
         5.20907293e-16,   1.26512736e-16])], [2.8421709430404007e-14])


In [50]:
def predict(instance, coef_, intercept_):
    # input: instance matrix, coef_ array and intercept_ array
    # ouput: list of predictions for input instances
    predictions = []
    return predictions