In [1]:
%%html
<style>
.output_wrapper, .output {
    height:auto !important;
    max-height:700px;  /* your desired max-height here */
}
.output_scroll {
    box-shadow:none !important;
    webkit-box-shadow:none !important;
}
</style>

### Import modules

In [48]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import numpy as np
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import load_boston
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

## Read in the dataset

In [30]:
data = pd.read_csv('data-titanic.csv')

In [31]:
data = data.drop(['name', 'ticket', 'cabin', 'body', 'boat', 'home.dest'], axis=1)
data = data.dropna()
from sklearn import preprocessing
encoded_data = data.copy()
le = preprocessing.LabelEncoder()
encoded_data.sex = le.fit_transform(encoded_data.sex)
encoded_data.embarked = le.fit_transform(encoded_data.embarked)
features = encoded_data.drop(['survived'], axis=1).values
labels = encoded_data['survived'].values

### Using all features

In [5]:
lin_reg = LinearRegression()

In [6]:
cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')

array([-0.16413987, -0.13907219, -0.12241864, -0.12645509, -0.11539907,
       -0.16935641, -0.16038081, -0.19085267, -0.18679015, -0.16801512])

In [7]:
-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')

array([ 0.16413987,  0.13907219,  0.12241864,  0.12645509,  0.11539907,
        0.16935641,  0.16038081,  0.19085267,  0.18679015,  0.16801512])

In [8]:
np.sqrt(-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error'))

array([ 0.40514179,  0.37292384,  0.34988376,  0.35560525,  0.33970439,
        0.41152936,  0.40047574,  0.43686688,  0.43219226,  0.40989647])

In [9]:
np.sqrt(-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')).mean()

0.3914219735723462

### Choosing features manually

In [10]:
encoded_data.columns

Index(['pclass', 'survived', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [11]:
features = encoded_data[['sex', 'pclass']].values

In [13]:
np.sqrt(-cross_val_score(lin_reg, features, labels, cv=10, scoring='neg_mean_squared_error')).mean()

0.39855172858512533

### Feature Selection using Recursive Feature Elimination

In [51]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=1)
rfe.fit(features,labels)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
  n_features_to_select=1, step=1, verbose=0)

In [52]:
sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names))

[(1, 'parch'),
 (2, 'fare'),
 (3, 'sibsp'),
 (8, 'pclass'),
 (9, 'age'),
 (10, 'sex'),
 (13, 'embarked')]

### Feature selection using Randomized Lasso 

In [43]:
boston = load_boston()
features = boston["data"]
labels = boston["target"]

In [45]:
model = RandomizedLasso(alpha=0.025)
model.fit(features, labels)

RandomizedLasso(alpha=0.025, eps=2.2204460492503131e-16, fit_intercept=True,
        max_iter=500, memory=Memory(cachedir=None), n_jobs=1,
        n_resampling=200, normalize=True, pre_dispatch='3*n_jobs',
        precompute='auto', random_state=None, sample_fraction=0.75,
        scaling=0.5, selection_threshold=0.25, verbose=False)

In [47]:
sorted(zip(map(lambda x: round(x, 4), model.scores_), 
                 boston["feature_names"]), reverse=True)

[(1.0, 'RM'),
 (1.0, 'PTRATIO'),
 (1.0, 'LSTAT'),
 (0.57499999999999996, 'B'),
 (0.56999999999999995, 'CHAS'),
 (0.40500000000000003, 'CRIM'),
 (0.33500000000000002, 'TAX'),
 (0.22, 'DIS'),
 (0.17999999999999999, 'NOX'),
 (0.14499999999999999, 'INDUS'),
 (0.070000000000000007, 'ZN'),
 (0.025000000000000001, 'RAD'),
 (0.01, 'AGE')]