###### Similar to 'Assignment Power using Linear Regression'

In [111]:
#Import the libraries
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [112]:
filename = "CoEPrA.csv"
raw_data = open(filename, 'rt')
data = np.loadtxt(raw_data, delimiter=",")

In [113]:
data.shape #What if we had 5788 features

(89, 5788)

In [114]:
#We separate out the independent variable into X 
#and dependent variable into y
#OMIT LAST ROW AS IT HAS NULL VALUE WHICH NEEDS TO BE PREDICTED
X=data[:,0:5787]
y=data[:,5787]

In [115]:
#We split the data into train and test using train_test_split
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=42)

In [116]:
print(X_trn.shape)
print(y_trn.shape)
print(X_tst.shape)
print(y_tst.shape)

(71, 5787)
(71,)
(18, 5787)
(18,)


In [117]:
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X_trn, y_trn)

In [118]:
# Make predictions using the testing set
y_pred = regr.predict(X_trn)

In [119]:
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_trn, y_pred))

Mean squared error: 0.02


###### Low mean Squared Error

In [120]:
#K-Fold Cross validation - Check for Over-Fitting - Using Mean Square error
scores = cross_val_score(regr, X_trn, y_trn, scoring='neg_mean_squared_error', cv=5)

In [121]:
print(np.mean(scores))

-1.7382471776771757e+25


###### Extremely High Mean Squared Error

In [122]:
# Make predictions using the testing set
y_pred_tst = regr.predict(X_tst)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_tst, y_pred_tst))

Mean squared error: 648646434007075088498688.00


###### We get a really high test error

##### L1/Lasso Regularization

In [123]:

regr = linear_model.Lasso(alpha=0.3, max_iter=1000000)
regr.fit(X_trn, y_trn)

##### Checking the Weights

In [124]:

print(regr.coef_)

[-0.  0. -0. ... -0. -0.  0.]


###### Many Coefficients became zero

In [125]:

#Index of all non zero coffecients 
# Only use non zero data
index=np.nonzero(regr.coef_)
print(index[0])

[  64  136  445  451  653  715  760  787  858 1236 1358 1422 1430 1732
 1737 1874 1879 2065 2247 2374 2380 2581 2644 2689 2708 2890 3224 3351
 3666 3931 3994 4002 4221 4303 4510 4573 4574 4637 4645 4819 4952 5153
 5154 5280 5589 5595 5648 5732]


In [126]:
#New feature matrix with only selelcted features
X_trn_filter=X_trn[:,index[0]]

In [127]:
#New shape
X_trn_filter.shape

(71, 48)

In [128]:
# Make predictions using the testing set
y_pred = regr.predict(X_trn)


In [129]:
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_trn, y_pred))

Mean squared error: 0.05


In [130]:
#K-Fold Cross validation 
scores = cross_val_score(regr, X_trn, y_trn, scoring='neg_mean_squared_error', cv=5)

In [131]:
print(np.mean(scores))


-1.1615211159922416


In [132]:
# Make predictions using the testing set
y_pred_tst = regr.predict(X_tst)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_tst, y_pred_tst))
print(X_trn[1],"\n Predicted Value: %.2f" 
      % y_pred[1],"\n Actual Value: %.2f" 
      % y_tst[1])


Mean squared error: 0.69
[ 3.95  2.22  1.45 ... -0.17 -0.39  0.54] 
 Predicted Value: 5.17 
 Actual Value: 5.65


###### Overfitting has reduced

##### L2 Ridge Regularization

In [133]:
#Using the filtered features we obtainied from L1
regr = linear_model.Ridge(alpha=0.8,max_iter=1000000)
regr.fit(X_trn_filter, y_trn)
# Make predictions using the testing set
y_pred = regr.predict(X_trn_filter)

In [134]:
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_trn, y_pred))

Mean squared error: 0.03


In [135]:
scores = cross_val_score(regr, X_trn_filter, y_trn, scoring='neg_mean_squared_error', cv=5)
print(np.mean(scores))

-1.2017669016766912


###### Cross validation values does not change much

In [136]:
#Filtering the test features
X_tst_filter=X_tst[:,index[0]]

In [137]:
# Make predictions using the testing set
y_pred_tst = regr.predict(X_tst_filter)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_tst, y_pred_tst))
print(X_tst_filter[1],"\n Predicted Value: %.2f" 
      % y_pred_tst[1],"\n Actual Value: %.2f" 
      % y_tst[1])

Mean squared error: 1.80
[ 4.1000e+01  2.9200e+02 -1.1000e+01 -7.0000e+00  1.0800e+02  3.3700e+02
  2.3230e+02  1.5600e+01  1.5728e+01  1.9000e+02  2.8400e+02  3.0300e+02
  5.4700e+01  1.0800e+02  1.2000e+02  5.9000e+02  2.3700e+02  3.9700e+02
  1.4630e+02 -9.3000e+01 -9.7000e+01 -1.3000e+01  2.9000e+02  1.2790e+02
  7.0700e+02  5.2600e+01 -2.0000e+01  3.6600e+02 -1.3200e+02  5.6300e+00
  5.5300e+02  2.9500e+01  1.0000e-01  1.0000e+02 -3.8000e+01  2.5300e+02
 -2.8000e+01  4.9000e+02  3.1000e+01  1.4210e+02  1.7400e+02  1.2300e+02
  1.1700e+02  5.5300e+02  1.0000e+02  1.1400e+02  1.3840e+02  4.9000e+02] 
 Predicted Value: 5.06 
 Actual Value: 5.65


In [138]:
print(X_trn.shape)
print(y_trn.shape)
print(X_tst.shape)
print(y_tst.shape)
print(X_tst_filter.shape)
print(y_pred_tst.shape)


(71, 5787)
(71,)
(18, 5787)
(18,)
(18, 48)
(18,)
