# Scikit-learn Intro
Scikit-learn is a very powerful machine learning library in Python.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the housing data set from its CSV file into a DataFrame.
housing_csv = 'data/housing.csv'
housing = pd.read_csv(housing_csv)

In [None]:
# Rank the numeric columns by their correlation with SalePrice, strongest
# positive correlation first.
# Only numeric columns are handed to corrwith: since pandas 2.0 it no longer
# drops non-numeric columns silently, so any text column in the file would
# make the call fail.
numeric_columns = housing.select_dtypes(include='number')
numeric_columns.corrwith(housing['SalePrice']).sort_values(ascending=False)

In [None]:
# Predictors kept for modelling (presumably chosen from the correlation
# ranking computed earlier), plus the target column.
feature_columns = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
sale_price = housing['SalePrice']
housing_best = housing[feature_columns]

In [None]:
# Preview the first rows of the selected feature columns.
housing_best.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
housing_best.info()

In [None]:
# Treat the two discrete columns as categorical rather than numeric,
# then preview the result.
categorical_dtypes = dict.fromkeys(['OverallQual', 'GarageCars'], 'category')
housing_best = housing_best.astype(categorical_dtypes)
housing_best.head()

In [None]:
# Print the frame summary again to check the dtypes after the category cast.
housing_best.info()

In [None]:
# Split the features by dtype: categorical columns on one side,
# numeric columns on the other.
housing_best_cat = housing_best.select_dtypes(include=['category'])
housing_best_numeric = housing_best.select_dtypes(include=['number'])

In [None]:
def standardize(column):
    """Return the column as z-scores: centred on its mean, divided by its standard deviation."""
    return (column - column.mean()) / column.std()


# Standardize every numeric column independently.
housing_best_numeric_scaled = housing_best_numeric.apply(standardize)

In [None]:
# Put the scaled numeric columns and the categorical columns back side by
# side in a single frame (rows are matched on the index).
frames = [housing_best_numeric_scaled, housing_best_cat]
housing_best_concat = pd.concat(frames, axis=1)

In [None]:
# Show the dtype of each column in the combined frame.
housing_best_concat.dtypes

In [None]:
# One-hot encode the categorical columns (numeric columns pass through
# unchanged), then preview the first five rows of the encoded frame.
housing_best_dummy = pd.get_dummies(data=housing_best_concat)
housing_best_dummy.head(n=5)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Feature matrix: force a float array. On pandas >= 2.0 get_dummies yields
# boolean columns, and `.values` on a frame mixing float and bool columns
# returns an object-dtype array instead of a numeric one.
X = housing_best_dummy.to_numpy(dtype=float)
# Target: the natural log of the sale price rather than the raw price.
y = np.log(sale_price.values)

# Ordinary least squares linear regression model.
lm = LinearRegression()

In [None]:
# Fit on the full data set, then show the R^2 of each of the 10
# cross-validation folds (cross_val_score fits its own clones of `lm`,
# so the fit above does not influence these scores).
lm.fit(X, y)
cross_val_score(estimator=lm, X=X, y=y, scoring='r2', cv=10)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression with a small penalty; show the R^2 of each of the
# 10 cross-validation folds.
lasso = Lasso(alpha=1e-3)
cross_val_score(estimator=lasso, X=X, y=y, scoring='r2', cv=10)

In [None]:
# Sweep the Lasso penalty over 50 log-spaced values from 1e-4 to 1 and
# record the mean 10-fold cross-validated R^2 for each one.
alphas = np.logspace(-4, 0, num=50)
scores = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    fold_scores = cross_val_score(lasso, X, y, cv=10, scoring='r2')
    scores.append(fold_scores.mean())

In [None]:
# Mean cross-validated R^2 against the penalty strength. The alphas are
# log-spaced, so a logarithmic x-axis spreads them evenly instead of
# bunching most of them near zero.
plt.semilogx(alphas, scores)
plt.xlabel('alpha')
plt.ylabel('mean cross-validated R^2')

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Sweep the Ridge penalty over 50 log-spaced values from 1e-4 to 1e2 and
# record the mean 10-fold cross-validated R^2 for each one.
alphas = np.logspace(-4, 2)
scores = []
for alpha in alphas:
    ridge = Ridge(alpha=alpha)
    scores.append(cross_val_score(ridge, X, y, cv=10, scoring='r2').mean())

# The alphas span six orders of magnitude, so plot them on a logarithmic
# x-axis; on a linear axis nearly all points collapse against the left edge.
plt.semilogx(alphas, scores)
plt.xlabel('alpha')
plt.ylabel('mean cross-validated R^2')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 20 trees. The seed is fixed so that the cross-validation
# scores are reproducible from one run of the notebook to the next; without
# it every run grows different trees and reports different scores.
rfr = RandomForestRegressor(n_estimators=20, random_state=0)

In [None]:
# R^2 of the random forest on each of the 10 cross-validation folds.
cross_val_score(rfr, X, y, cv=10, scoring='r2')

## Support Vector Machines

In [None]:
from sklearn.svm import SVR

In [None]:
# Support vector regressor with a linear kernel.
# NOTE(review): the next cell creates this same estimator again, so this
# cell looks redundant.
svr = SVR(kernel='linear')

In [None]:
# Linear-kernel support vector regression: mean R^2 over 10 folds.
svr = SVR(kernel='linear')
linear_fold_scores = cross_val_score(svr, X, y, cv=10, scoring='r2')
linear_fold_scores.mean()

In [None]:
# Same evaluation with an RBF kernel: mean R^2 over 10 folds.
svr = SVR(kernel='rbf')
rbf_fold_scores = cross_val_score(estimator=svr, X=X, y=y, scoring='r2', cv=10)
rbf_fold_scores.mean()

## Resources
+ [Scikit-learn docs](http://scikit-learn.org/stable/)