In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

from sklearn.linear_model import LinearRegression, RANSACRegressor, LogisticRegression

<h1>Linear and Logistic Regression Demos</h1>

In [3]:
housing = pd.read_fwf("https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data", header = None)

In [4]:
housing

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [5]:
housing.columns = ["crime_rate", "zoned_land", "industry", "bounds_river", "nox_conc", "rooms", "age", "distance",  "highways", "tax", "pt_ratio", "b_estimator", "pop_status", "price"]

In [6]:
housing.head()

Unnamed: 0,crime_rate,zoned_land,industry,bounds_river,nox_conc,rooms,age,distance,highways,tax,pt_ratio,b_estimator,pop_status,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [7]:
housing.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
crime_rate,506.0,1.71629,2.65351,0.00632,0.0819,0.250895,2.326717,9.96654
zoned_land,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
industry,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
bounds_river,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
nox_conc,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
rooms,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
age,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
distance,506.0,3.696228,1.999689,0.5857,2.0737,3.1073,5.112625,9.2229
highways,506.0,4.332016,1.417166,1.0,4.0,4.0,5.0,8.0
tax,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [8]:
housing.dtypes

crime_rate      float64
zoned_land      float64
industry        float64
bounds_river      int64
nox_conc        float64
rooms           float64
age             float64
distance        float64
highways          int64
tax             float64
pt_ratio        float64
b_estimator     float64
pop_status      float64
price           float64
dtype: object

In [9]:
housing_prices = housing.price

In [10]:
housing_attributes = housing.drop("price", axis = 1)

In [11]:
housing_attributes.shape, housing_prices.shape

((506, 13), (506,))

In [12]:
scaler = MinMaxScaler()
scaler.fit(housing_attributes)

MinMaxScaler()

In [13]:
scaler.data_max_

array([  9.96654, 100.     ,  27.74   ,   1.     ,   0.871  ,   8.78   ,
       100.     ,   9.2229 ,   8.     , 711.     ,  22.     , 396.9    ,
        37.97   ])

In [14]:
scaler.feature_range

(0, 1)

In [15]:
housing_attributes_scaled = scaler.transform(housing_attributes)

In [16]:
housing_attributes_scaled.mean(axis = 0)

array([0.17167998, 0.11363636, 0.39137752, 0.06916996, 0.34916679,
       0.52186901, 0.67636355, 0.36013158, 0.47600226, 0.42220831,
       0.62292911, 0.89856783, 0.30140903])

In [17]:
linear_regression = LinearRegression() # We can add property fit_intercept = False
linear_regression.fit(housing_attributes_scaled, housing_prices)

LinearRegression()

In [18]:
linear_regression.coef_

array([  2.08448854,   1.49403979,   0.34690497,   3.00565375,
        -7.54441381,  22.43940145,   0.27658754,  -9.35981793,
         1.35281035,  -1.26826011,  -9.07603108,   3.74177288,
       -19.03479847])

In [19]:
linear_regression.intercept_

23.68929153467697

In [20]:
housing_attributes_scaled[0].reshape(-1, 1)

array([[0.        ],
       [0.18      ],
       [0.06781525],
       [0.        ],
       [0.31481481],
       [0.57750527],
       [0.64160659],
       [0.40572176],
       [0.        ],
       [0.20801527],
       [0.28723404],
       [1.        ],
       [0.08967991]])

In [21]:
linear_regression.predict(housing_attributes_scaled[:10])

array([30.10947333, 25.12810976, 31.00785588, 29.04535626, 28.48368175,
       25.44231142, 23.23025545, 20.07399474, 11.87137324, 19.61327434])

In [22]:
housing_prices[:10].values

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9])

In [23]:
random_sample = pd.DataFrame(housing_attributes_scaled).sample(10)

In [24]:
random_sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
399,0.994981,0.0,0.646628,0.0,0.633745,0.438973,0.77137,0.105902,0.428571,0.914122,0.808511,0.851884,0.779249
463,0.583805,0.0,0.646628,0.0,0.674897,0.565626,0.895984,0.256553,0.428571,0.914122,0.808511,0.992234,0.236203
363,0.423291,0.0,0.646628,1.0,0.792181,0.429584,0.886715,0.152712,0.428571,0.914122,0.808511,0.889404,0.356236
112,0.011744,0.0,0.350073,0.0,0.333333,0.450661,0.92688,0.204661,0.714286,0.467557,0.553191,0.995083,0.399558
291,0.007283,0.8,0.164589,0.0,0.053498,0.687296,0.255407,0.524591,0.428571,0.110687,0.702128,1.0,0.050497
195,0.000752,0.8,0.0,0.0,0.076132,0.826595,0.299691,0.586151,0.428571,0.129771,0.191489,0.993267,0.034216
480,0.584093,0.0,0.646628,0.0,0.302469,0.5137,0.636457,0.328637,0.428571,0.914122,0.808511,1.0,0.24862
395,0.874522,0.0,0.646628,0.0,0.633745,0.557578,0.987642,0.131987,0.428571,0.914122,0.808511,0.987594,0.424669
380,0.90057,0.0,0.646628,0.0,0.588477,0.652807,0.916581,0.096189,0.428571,0.914122,0.808511,1.0,0.427152
145,0.23825,0.0,0.70088,0.0,1.0,0.49224,1.0,0.09649,0.571429,0.412214,0.223404,0.435196,0.719371


In [25]:
linear_regression.predict(random_sample)

array([10.71588058, 21.87666833, 18.91000997, 21.244129  , 31.98371645,
       39.19915904, 22.56806732, 20.19904787, 23.04630678, 13.4616175 ])

In [26]:
housing_prices.loc[random_sample.index].values

array([ 6.3, 20.2, 16.8, 18.8, 37.3, 50. , 23. , 13.1, 10.4, 13.8])

In [27]:
linear_regression.score(housing_attributes_scaled, housing_prices)

0.7198065414937174

In [28]:
ransac = RANSACRegressor() # Accepts other ML model - a way to better a model

In [29]:
ransac.fit(housing_attributes_scaled, housing_prices)

RANSACRegressor()

In [30]:
ransac.estimator_.coef_

array([ -1.62917998,   4.24932113,   1.29849667,   0.80364321,
         6.52326609,  31.64257103,  -5.88649992,  -3.26122107,
         4.83171364,  -3.28081408,  -3.40762012,   4.91579172,
       -10.53214943])

In [31]:
ransac.estimator_.intercept_

7.215937416492341

In [32]:
ransac.predict(random_sample)

array([11.8699425 , 22.00041564, 18.14945842, 18.67794827, 33.39875419,
       39.10547488, 19.12745869, 18.8626288 , 22.10806462, 18.8458206 ])

In [33]:
housing_prices.loc[random_sample.index].values

array([ 6.3, 20.2, 16.8, 18.8, 37.3, 50. , 23. , 13.1, 10.4, 13.8])

In [34]:
ransac.score(housing_attributes_scaled, housing_prices)

0.6459719476890974

In [35]:
ransac.inlier_mask_

array([ True,  True, False, False, False, False,  True, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
       False,  True,

In [36]:
inliers = housing_attributes_scaled[ransac.inlier_mask_]
outliers = housing_attributes_scaled[~ransac.inlier_mask_]

In [37]:
ransac.score(inliers, housing_prices.loc[ransac.inlier_mask_])

0.9248121462487743

In [38]:
ransac.score(outliers, housing_prices.loc[~ransac.inlier_mask_])

0.5131273827136289

In [39]:
polynomial_features = PolynomialFeatures()

In [40]:
housing_attributes_scaled_poly = polynomial_features.fit_transform(housing_attributes_scaled)

In [41]:
housing_attributes_scaled_poly.shape

(506, 105)

In [42]:
quadratic_regression = LinearRegression()
quadratic_regression.fit(housing_attributes_scaled_poly, housing_prices)

LinearRegression()

In [43]:
quadratic_regression.score(housing_attributes_scaled_poly, housing_prices)

0.8527008230672175

In [44]:
iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None)
iris.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]

In [45]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [46]:
iris_attributes = iris.drop("class", axis = 1)
iris_classes = iris["class"]

In [47]:
iris_attributes_scaled = MinMaxScaler().fit_transform(iris_attributes)

In [48]:
logistic_regression = LogisticRegression(C = 1e9)

In [49]:
logistic_regression.fit(iris_attributes_scaled, iris_classes)

LogisticRegression(C=1000000000.0)

In [50]:
logistic_regression.coef_

array([[-18.29679129,  29.19813694, -37.89417388, -38.3219437 ],
       [ 13.58611302,  -6.58018128,  -8.87560643,  -2.78717031],
       [  4.71067827, -22.61795566,  46.76978031,  41.10911401]])

In [51]:
logistic_regression.intercept_

array([ 25.9014194 ,  14.7266206 , -40.62803999])

In [52]:
logistic_regression.score(iris_attributes_scaled, iris_classes)

0.9866666666666667

In [53]:
PolynomialFeatures(degree = 7).fit_transform(iris_attributes_scaled)

array([[1.00000000e+00, 2.22222222e-01, 6.25000000e-01, ...,
        5.77244430e-10, 3.54764806e-10, 2.18032537e-10],
       [1.00000000e+00, 1.66666667e-01, 4.16666667e-01, ...,
        5.77244430e-10, 3.54764806e-10, 2.18032537e-10],
       [1.00000000e+00, 1.11111111e-01, 5.00000000e-01, ...,
        3.24699992e-10, 2.66073604e-10, 2.18032537e-10],
       ...,
       [1.00000000e+00, 6.11111111e-01, 4.16666667e-01, ...,
        1.57581908e-01, 1.75247340e-01, 1.94893123e-01],
       [1.00000000e+00, 5.27777778e-01, 5.83333333e-01, ...,
        3.59963549e-01, 4.42455196e-01, 5.43851178e-01],
       [1.00000000e+00, 4.44444444e-01, 4.16666667e-01, ...,
        8.61097024e-02, 8.77723898e-02, 8.94671819e-02]])