In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import time

# 1. Graduate Admission Rate

Graduate admission rate: Multiple Linear Regression

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [32]:
#Import data
admission_rate=pd.read_csv('/Users/mirandidallas-fuge/Desktop/Admission_Predict.csv')

y=admission_rate['Chance of Admit ']
x=admission_rate.iloc[:,1:8]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [34]:
%time
# Fit the Model
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
Multiple Linear Regression score: 0.8034713719824393
Multiple Linear Regression MSE: 0.003986893243246914
Multiple Linear Regression coef: [ 0.00173741  0.00291958  0.00571666 -0.00330517  0.02235313  0.11893945
  0.02452511]


Graduate admission rate: Lasso regression

In [35]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [36]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [37]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.1 µs
lasso score: 0.0
lasso MSE: 0.0202865775
lasso coef: [0. 0. 0. 0. 0. 0. 0.]


Graduate admission rate: Ridge regression

In [38]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [39]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.8034661029324273
ridge MSE: 0.003987000134238337
ridge coef: [ 0.02012161  0.01784985  0.00665018 -0.00311891  0.02005063  0.0701277
  0.01220766]


### Summary for graduate school data: 
- Multiple Linear Regression - MSE 0.003986893243246914
- Lasso Regression - MSE 0.0202865775
- Ridge Regression - MSE 0.003987000134238337

Thus since we want the smallest MSE of these, linear regression performs the best.

##### Data set attributes: 
- small dataset
- low dimension
- all numerical attributes

# 2. Crime incidence in community

In [40]:
#Import data
crime = pd.read_csv('/Users/mirandidallas-fuge/Desktop/Violent_Crime_pred')

In [41]:
#subseting the data
x=crime.iloc[:,2:-1]
y=crime['target']

#check the missing valuings in each column
#preliminary decision: remove the columns with 1675(over 80%) missing values,to be discussed
x.iloc[:,np.sort(x.isna().sum())==0]

x=x.dropna(axis='columns')

Crime: multiple linear regression 

In [42]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs
Multiple Linear Regression score: 0.6752443763457237
Multiple Linear Regression MSE: 122683.07978516712
Multiple Linear Regression coef: [ 3.74445526e-04 -1.58839756e+02  7.68343831e+00 -7.78658010e-01
 -1.70607592e+00 -1.02564150e-01  1.52165078e+01 -2.63289129e+01
  1.34344029e+00 -8.12972136e+00 -1.01180719e-03  1.24852786e+00
 -1.19059838e-02 -1.38626681e+01  2.75088288e+01 -4.32705858e+00
  5.03746403e+00  9.85498925e+00 -1.18362183e+01  8.36688662e-03
 -7.82605464e-03 -5.27346511e-04 -5.37451125e-04 -1.25607890e-04
  1.91200132e-03  2.70297445e-03 -3.56959590e-04 -9.58268155e+00
 -1.79607712e+01  7.86021041e+00  2.76211020e+00 -4.64427961e+00
  9.96105628e+00 -4.64363581e+00 -1.12027574e+00  2.32085805e+00
  3.54829043e+00  1.57527151e+02  1.13562054e+01  1.01557620e+02
 -2.43140198e+02 -4.71190793e+02  4.67589646e+00 -1.51413164e+01
  1.65198009e+00  1.45334088e-01  3.79935929e+00 -1.15612466e+01
 -4.74529478e-03  4

Crime: Lasso regression

In [43]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [44]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs
lasso score: 0.6679159087013914
lasso MSE: 125451.55834327749
lasso coef: [  -0.           -0.          108.76167844  -10.42293264   -2.71448958
   -0.           23.76270866  -90.74356619    0.           -0.
  -39.38124579   44.54170032   -0.          -88.58945375   11.928513
  -40.50956356    3.55909249   15.79905137  -48.22904683   -0.
  -23.32834962   -0.           -4.98273642   -0.79228326   18.37421699
   13.12378838   -0.          -41.83149327 -108.93029885   71.23018038
    0.25681009   -9.36628505   62.73104936  -27.03053208   -0.
    0.            0.           82.65827505   61.90728474  -51.71247592
   -0.           -0.            0.         -124.30284639    0.
    0.            7.83219879  -53.26836367  -46.69694761  133.09193878
   12.81176371    5.01298517   -0.           -0.            1.4593455
   -0.          -20.36449862   -0.            0.            0.
   -5.29595132   -0.          -38.58646713   49.56574

Crime: Ridge regression

In [45]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [46]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.6747242483344726
ridge MSE: 122879.56878075335
ridge coef: [-2.56857091e+01 -4.77202128e+01  1.02102505e+02 -2.03964901e+01
 -1.01604232e+01  7.77758983e-02  5.54510403e+01 -1.58988455e+02
  1.39904822e+01 -3.85992457e+01 -1.05344752e+02  5.16520617e+01
 -1.06962164e+02 -1.12202573e+02  1.85328660e+01 -5.34995574e+01
  3.75105032e+01  4.22942152e+01 -5.31706604e+01  7.70608442e+01
 -4.78036383e+01 -4.57507178e+00 -5.32732623e+00 -1.64362773e+00
  1.90312541e+01  1.53340311e+01 -1.14990980e+01 -8.11960509e+01
 -1.26250898e+02  8.75680641e+01  3.49447165e+01 -1.25916426e+01
  7.88492401e+01 -3.79506654e+01 -9.43864495e+00  1.53785041e+01
  2.74480905e+01  2.25169485e+02  1.04035461e+02  6.67410922e+01
 -2.55209360e+02 -1.16829611e+02  5.49143459e+01 -1.72611384e+02
  1.72619001e+01  9.99623412e-01  2.90136622e+01 -7.62883266e+01
 -6.49534500e+01  1.26380587e+02  7.25947973e+01  1.64683940e+01
 -3.17158284e+00 -9.11127443e+00  1.46518850e+01 -3.22683709e+00
 -6.22548806e+01

### Summary for crime data: 
- Multiple Linear Regression - 122683.07978516712
- Lasso Regression - 125451.55834327749
- Ridge Regression - 122879.56878075335

Thus since we want the smallest MSE of these, none of these perform well.

##### Data set attributes: 
- moderate high in feature
- numeric attributes
- contains irrelevant features

# 3. Automobile

Automobile: multiple linear regression 

In [47]:
import numpy as np
from sklearn.preprocessing import Imputer
cars=pd.read_csv('/Users/mirandidallas-fuge/Desktop/automobile_price')

In [48]:
cars=cars.replace('?',np.nan).iloc[:,3:] #replace ? with NaN
cars['X6'][cars['X6'].isna()]='four' #mode is four, assign four to the missing value
impute=Imputer(strategy='mean') #use column mean to impute missing value
cars[['X19','X20','X22','X23']]=impute.fit_transform(cars[['X19','X20','X22','X23']])
pd.isna(cars).sum() #now no missing values
x=cars.iloc[:,0:-1]
y=cars.iloc[:,-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [49]:
#get categorical column
cate_col=list(x.columns[x.dtypes=='object'])
x=pd.get_dummies(x,columns=cate_col) #create dummy variables for the categorical var

In [50]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 3.81 µs
Multiple Linear Regression score: 0.9661197143457417
Multiple Linear Regression MSE: 2129093.228056125
Multiple Linear Regression coef: [ 2.83172282e+02 -1.31090091e+02  6.01155476e+02 -3.10629298e+02
  6.17918143e+00  8.33002788e+01 -3.22372696e+03 -1.22082324e+03
 -8.99599743e+02  1.15936855e+01  2.25641131e+00 -1.76016213e+01
  1.31340365e+02  2.64630972e+03  2.84721599e+03  7.81369552e+03
 -2.70350725e+03 -3.47145019e+03 -4.31555305e+02 -1.26806954e+03
  1.15103898e+03 -2.23591966e+02  4.20420995e+03 -1.86706063e+03
 -3.96104619e+03 -7.46651168e+02 -3.83165576e+03 -3.40613151e+03
  5.36974272e+03 -1.59909883e+03  3.06628668e+03 -2.33645587e+03
 -1.25705622e+03 -1.96455930e+02  2.01286802e+02  4.05368464e+03
 -4.05368464e+03 -8.26640075e+02  8.26640075e+02  1.17327015e+02
 -1.17327015e+02  2.04942336e+03 -1.98809112e+02 -9.51280472e+02
 -3.39174780e+02 -5.60158992e+02  2.97892440e+02 -2.77338500e+02
 -2.05539408e+01

Automobile: Lasso regression

In [51]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [52]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
lasso score: 0.9660907994695022
lasso MSE: 2130910.2867379715
lasso coef: [ 1.66016226e+03 -1.54939864e+03  1.24682551e+03 -7.23706448e+02
  3.11068558e+03  3.36641453e+03 -8.11753712e+02 -3.90930159e+02
 -3.05876473e+03  5.37803243e+02  1.05024062e+03 -8.53708292e+01
  8.36680321e+02  3.78432719e+02  5.36668707e+02  1.61004914e+03
 -2.67365823e+02 -6.01148614e+02  0.00000000e+00 -6.97327564e+01
  2.37725868e+02  7.35743192e+01  9.01753183e+02 -1.06936028e+02
 -8.38818783e+02 -8.65898586e+01 -1.03900448e+02 -5.24683552e+02
  7.92499262e+02 -1.13384034e+02  5.68551489e+02 -0.00000000e+00
 -2.91761967e+02  4.81335165e+01  1.33384322e+02  3.07818467e+03
 -0.00000000e+00 -6.40968271e+02  2.35292162e-13  1.07962145e+02
 -1.62328967e-11  4.07109762e+02  1.86480426e+01 -2.80780566e+02
  0.00000000e+00 -7.62566053e+01  7.62492050e+01 -1.06848574e+02
  0.00000000e+00 -1.17600182e+03  1.72215763e-10 -1.63043594e+02
 -5.25290878e+02 



Automobile: Ridge regression

In [53]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.34 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [54]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.9654295868239218
ridge MSE: 2172461.9838038264
ridge coef: [ 1.42868961e+03 -1.15099000e+03  1.21610354e+03 -5.72356496e+02
  2.50494847e+03  2.73772741e+03 -5.45251083e+02 -4.28255580e+02
 -1.35050895e+03  1.12257977e+03  9.06462274e+02 -1.55074368e+02
  7.37188070e+02  3.46137154e+02  4.15053449e+02  1.50124771e+03
 -2.70706491e+02 -5.53609265e+02 -3.12187983e+01 -8.52366695e+01
  4.72962627e+02  1.07472686e+01  8.64450591e+02 -1.79228138e+02
 -7.94922431e+02 -1.86765386e+02 -2.99264248e+02 -4.98458249e+02
  6.67744262e+02 -1.44019413e+02  4.00055544e+02 -1.79527906e+02
 -3.92433565e+02 -5.60797922e+01  5.74758776e+00  4.88070954e+02
 -4.88070954e+02 -3.14260343e+02  3.14260343e+02  4.63060487e+01
 -4.63060487e+01  4.57384894e+02  2.40173071e+01 -1.77308144e+02
  3.86766570e+01 -5.43560343e+01  1.38925158e+02 -8.07800301e+01
  2.60821424e+01 -5.29443319e+02  5.29443319e+02 -1.96870297e+02
 -1.60126958e+02  4.19220312e+02  8.24283410e+01 -6.90733252e+02
  3.21283119e+02

### Summary for automobile data: 
- Multiple Linear Regression - 2129093.228056125
- Lasso Regression - 2130910.2867379715
- Ridge Regression - 2172461.9838038264

Thus since we want the smallest MSE of these, none of these perform well. 

##### Data set attributes: 
- categorical attributes 
- numerical attributes

# 4. Breast Cancer 

Breast Cancer: Multiple Linear Regression 

In [55]:
Breast_cancer_wisconsin = pd.read_csv('/Users/mirandidallas-fuge/Desktop/breast-cancer-wisconsin.data')

In [56]:
x=Breast_cancer_wisconsin.iloc[:,2:]

In [57]:
y=Breast_cancer_wisconsin.iloc[:,1]

In [58]:
import pandas as pd
X = pd.DataFrame(x)

In [59]:
X = X.replace('?', np.NaN)

In [60]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
X_fixed= imp.transform(X)

In [61]:
# Fit the Model
%time
reg = LinearRegression().fit(X_fixed, y)
# Print the Score
reg.score(X_fixed, y)
print("Multiple Linear Regression score:", reg.score(X_fixed, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(X_fixed)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 2.86 µs
Multiple Linear Regression score: 0.5330725624458194
Multiple Linear Regression MSE: 3.701758257116035
Multiple Linear Regression coef: [ 0.08716186  0.16076712 -0.09082431 -0.01948014  0.00987729 -0.00819105
 -0.00554684  0.07202413  1.64568907]


Breast Cancer: Lasso

In [62]:
# Scale data 
scaler = StandardScaler()
scaler.fit(X_fixed)
X = scaler.transform(X_fixed)

In [63]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X_fixed, y)
print("lasso score:", lasso.score(X_fixed, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X_fixed)))
print("lasso coef:", lasso.coef_)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs
lasso score: 0.4551149414498669
lasso MSE: 4.3197991859989155
lasso coef: [0.17375289 0.1993394  0.         0.         0.15732223 0.
 0.01514637 0.         0.        ]


Breast Cancer: Ridge Regression

In [64]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X_fixed, y)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 12.2 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [65]:
print("ridge score:",ridge.score(X_fixed, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X_fixed)))
print("ridge coef:", ridge.coef_)

ridge score: 0.5330684615436043
ridge MSE: 3.701790768695812
ridge coef: [ 0.0879629   0.16133473 -0.09066844 -0.01920239  0.01132331 -0.00753354
 -0.00499077  0.07213587  1.63168369]


### Summary for breast cancer data: 
- Multiple Linear Regression - 3.701758257116035
- Lasso Regression - 4.3197991859989155
- Ridge Regression - 3.701790768695812

Thus since we want the smallest MSE of these, multiple linear regression performs the best.

##### Data set attributes: 
- small dimensions

# 5. Parkinsons Data 

In [66]:
Parkinsons_data = pd.read_csv('/Users/mirandidallas-fuge/Desktop/parkinsons_updrs.data')

In [67]:
y = Parkinsons_data['total_UPDRS']

In [68]:
x = Parkinsons_data.drop(['total_UPDRS', 'motor_UPDRS'], axis=1)

Parkinsons: Multiple Linear Regression 

In [69]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs
Multiple Linear Regression score: 0.25291808513025216
Multiple Linear Regression MSE: 85.52337735564949
Multiple Linear Regression coef: [ 2.63599144e-01  3.18670952e-01 -4.81201100e+00  1.59401035e-02
 -2.55549336e+02 -4.46094581e+04 -2.60297943e+04 -1.66919255e+02
  9.06235190e+03  1.41261449e+01 -5.88583107e-01 -1.49979833e+04
  4.96070754e+01  9.72737839e+00  4.94934363e+03 -2.37172174e+01
 -4.85772182e-01  1.69271943e+00 -3.63399992e+01  1.54854842e+01]


Parkinsons: Lasso Regression 

In [70]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [71]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs
lasso score: 0.17087504234391826
lasso MSE: 94.91538373134212
lasso coef: [ 1.9300962   2.30221711 -0.53213254  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.11427589  0.06575668
 -0.34042658  0.04626113]


Parkinsons: Ridge Regression 

In [72]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [73]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.25285758941088754
ridge MSE: 85.5303026982828
ridge coef: [ 3.26107376  2.81074917 -2.2407178   0.85276759 -1.39971816 -1.60425689
  1.60078879 -0.61393149  1.97116337  0.33954156 -0.12361482 -1.00346174
  0.80148824  0.20283382 -0.95398511 -1.41326791 -2.08745998  0.16700561
 -2.5740666   1.4119164 ]


### Summary for parkinson data: 
- Multiple Linear Regression - 10.30124083758564
- Lasso Regression - 12.762792705087232
- Ridge Regression - 10.303097064425131

Thus since we want the smallest MSE of these, linear performs the best but due to the large MSE values non of them perform that well.

##### Data set attributes: 
- small data set

# 6. Mercedes-Benz Greener Manufacturing

In [74]:
Mercedes = pd.read_csv('/Users/mirandidallas-fuge/Desktop/train_mercedes.csv')
y = Mercedes['y']
x = Mercedes.drop(['y'], axis=1)

cate_col=list(x.columns[x.dtypes=='object'])
x=pd.get_dummies(x,columns=cate_col) #change categorical variables to dummy var

Mercedes-Benz Greener Manufacturing: Multiple Linear Regression

In [75]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.29 µs
Multiple Linear Regression score: 0.6201092675543345
Multiple Linear Regression MSE: 61.05927281061774
Multiple Linear Regression coef: [ 1.22650732e-03 -2.85213807e+00  7.55333573e-11  2.40511832e+00
  2.37282897e-01  7.94977063e-01  3.25719285e+00  5.57692485e-02
 -7.52309158e-01  4.19347527e+00  7.99439447e-01  1.15854791e+00
  2.85544617e-01  5.28697920e-01 -4.03985314e-01 -5.02202737e+00
 -6.92738650e-02 -6.67002657e-02 -7.35128459e-01 -4.80594440e+00
 -3.31301054e+00  2.79771243e-01 -1.79107918e+00 -1.82973449e+00
  8.93294928e-01  2.79771243e-01 -3.76314245e-01  2.79771243e-01
  1.62928787e-01 -1.82973449e+00  1.51570151e+00  4.21029399e-01
  6.48579544e-01  1.66768725e+00 -1.22348910e+00 -6.56075964e+00
  1.58345080e-01  5.90532717e+00  1.30001424e+00  6.86854577e+00
 -1.31045948e-01  1.50234680e-02 -4.06390555e-01  3.97004575e-01
  1.27715410e+00 -3.95542855e+00  2.78087799e-01 -4.53651321e-01
  1.60985787e-01 -6.

Mercedes-Benz Greener Manufacturing: Lasso Regression 

In [76]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [77]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.81 µs
lasso score: 0.5205575167814709
lasso MSE: 77.06007775282386
lasso coef: [-0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -1.95334828e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.19220168e-03  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -1.14340067e-01
  0.00000000e+

Mercedes-Benz Greener Manufacturing: Ridge Regression 

In [78]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [79]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.6200545035766019
ridge MSE: 61.068074943365346
ridge coef: [ 2.38716907e+00 -1.28056999e-01  0.00000000e+00  3.29271354e-01
  1.68236721e-01 -1.88278731e-01  3.60088511e-02  3.46866767e-02
 -1.24714668e-01  2.78846200e-01 -1.03512167e-01  1.37764921e-03
  1.32278276e-01 -6.75020065e-02 -3.80003540e-03 -2.19492450e-01
  4.35226209e-02 -3.11989210e-02  1.05906679e-02 -4.07756542e-01
 -7.10079359e-02  1.15010464e-01 -1.28434935e-01 -2.72698911e-02
  1.90431991e-01  1.15010464e-01  5.53978605e-04  1.15010464e-01
  2.72512418e-02 -2.72698911e-02  3.28094249e-02  1.18273697e-02
  1.16806755e-02  2.90462395e-01  2.97500176e-02 -2.80536693e+00
  7.74244950e-02  6.64035048e-01  1.66685479e-01  1.96878611e+00
 -5.14019037e-02  2.48966151e-03  1.13115396e-02  2.32938166e-02
 -3.66841822e-01 -2.86850727e-01  4.12088399e-02 -4.85265119e-02
  1.42881912e-01  2.79469340e-02 -2.51032032e-02 -1.86954021e-02
  1.30378910e-01 -3.35509896e-01 -1.41390487e-01 -1.44804067e-01
  1.14647057e-01

### Summary for Mercedes data: 
- Multiple Linear Regression - 61.05927281061774
- Lasso Regression - 77.06007775282386
- Ridge Regression - 61.068074943365346

Thus since we want the smallest MSE of these, multiple linear regression performs the best

##### Data set attributes: 
- categorical data

# 7. Song Year Prediction 

In [80]:
song = pd.read_csv('/Users/mirandidallas-fuge/Desktop/YearPredictionMSD.txt')

In [81]:
y = song.iloc[:,0]
x = song.iloc[:,1:]

Song: Multiple Linear Regression 

In [82]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs
Multiple Linear Regression score: 0.23700077314993973
Multiple Linear Regression MSE: 91.1690694545467
Multiple Linear Regression coef: [ 8.75417828e-01 -5.63271810e-02 -4.36495594e-02  3.35266425e-03
 -1.47466963e-02 -2.20070428e-01 -6.73789497e-03 -1.00895620e-01
 -7.04729895e-02  2.50706569e-02 -1.65701122e-01 -1.85435368e-03
  4.70143401e-02  3.55103260e-04 -4.22581347e-04  5.99183918e-04
  4.76576350e-04  1.46662665e-03  1.92446773e-03  2.12835187e-03
  7.69875469e-04 -4.02598883e-04  7.53936242e-03  2.81199236e-03
 -3.55559424e-03  7.11398144e-05  1.58942049e-03  5.29413275e-04
  8.74563154e-04 -3.04158379e-04 -1.40496765e-03 -1.40135040e-03
 -5.55962706e-03  2.47233882e-03  1.84962263e-03 -5.29414513e-03
 -2.77269168e-04  6.79196361e-04  1.36514213e-03 -1.71044631e-03
 -1.99135793e-03 -7.64136852e-04 -1.40254468e-03 -2.35902050e-03
 -3.17994565e-03  6.81260459e-03  4.56073617e-04 -2.07499004e-03
  2.75161566e-04  1.

Song: Lasso Regression

In [83]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [84]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 20.3 µs
lasso score: 0.09764067500655949
lasso MSE: 107.82089558978214
lasso coef: [ 1.68710504 -0.         -0.63795672  0.          0.         -0.86141556
  0.         -0.         -0.          0.          0.         -0.
  0.          0.29605591  0.          0.          0.          0.
  0.          0.52770142  0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
 -0.          0.         -0.          0.         -0.         -0.
 -0.          0.          0.         -0.00232873 -0.         -0.
 -0.         -0.         -0.         -0.          0.         -0.
 -0.         -0.          0.         -0.          0.         -0.
 -0.          0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.         -0.          0.         -0.         -0.
  0.          0.          0.         -0.          0.         -0.
 -0.

Song: Ridge Regression

In [85]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.05 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [86]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.23700077314129467
ridge MSE: 91.16906945557967
ridge coef: [ 5.31160293e+00 -2.90534331e+00 -1.53943734e+00  5.46923690e-02
 -3.37113347e-01 -2.82957692e+00 -9.81646368e-02 -8.03506918e-01
 -7.45796459e-01  1.63720177e-01 -7.24224333e-01 -1.54414092e-02
  1.04651115e+00  6.21202788e-01 -5.33077829e-01  6.54810843e-01
  2.26711663e-01  8.46037751e-01  6.11013061e-01  6.58441345e-01
  1.64762301e-01 -6.67066205e-02  1.40954825e+00  4.31568378e-01
 -4.31745114e-01  5.09175936e-02  8.66034495e-01  1.15605873e-01
  1.42742202e-01 -4.10266604e-02 -1.39216946e-01 -1.00630539e-01
 -4.13699128e-01  1.32311792e-01  7.85886032e-02 -5.71301946e-01
 -1.15074828e-01  3.07948300e-01  3.55821050e-01 -3.51905638e-01
 -2.38634044e-01 -9.15784860e-02 -1.03259693e-01 -9.05238537e-02
 -1.32111584e-01  3.74513507e-01  2.14627667e-01 -5.44409368e-01
  5.76530964e-02  2.37123064e-01  2.05938188e-02 -1.20442028e-01
  1.37923342e-01  4.08665428e-02 -4.87585561e-03  4.45998036e-02
 -5.90419691e-01

### Summary for Song data: 
- Multiple Linear Regression -  91.1690694545467
- Lasso Regression - 107.82089558978214
- Ridge Regression - 91.16906945557967

Thus since we want the smallest MSE of these, none of these models perform well 

##### Data set attributes: 
- lots of columns

# 8. Solar Flares

In [87]:
solar = pd.read_csv('/Users/mirandidallas-fuge/Desktop/flare.data2', sep = " ")

In [88]:
x = solar.iloc[:,1:6]
y = solar.iloc[:,7]

Solar: Multiple Linear Regression

In [89]:
# Fit the Model
%time
reg = LinearRegression().fit(x, y)
# Print the Score
reg.score(x, y)
print("Multiple Linear Regression score:", reg.score(x, y))
print("Multiple Linear Regression MSE:", mean_squared_error(y, reg.predict(x)))
print("Multiple Linear Regression coef:", reg.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
Multiple Linear Regression score: 0.07079476730209577
Multiple Linear Regression MSE: 0.0851232049849271
Multiple Linear Regression coef: [0.06338076 0.02650041 0.02537617 0.42973347 0.        ]


Solar: Lasso Regression

In [90]:
# Scale data 
scaler = StandardScaler()
scaler.fit(x)
X = scaler.transform(x)

In [91]:
# Fit the Model and Print Score
%time
lasso = linear_model.Lasso()
lasso.fit(X, y)
print("lasso score:", lasso.score(X, y))
print("lasso MSE:", mean_squared_error(y, lasso.predict(X)))
print("lasso coef:", lasso.coef_)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
lasso score: 0.0
lasso MSE: 0.0916086156098969
lasso coef: [0. 0. 0. 0. 0.]


Solar: Ridge Regression 

In [92]:
# Fit the Model and Print Score
%time
ridge = linear_model.Ridge()
ridge.fit(X, y)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [93]:
print("ridge score:",ridge.score(X, y))
print("ridge MSE:", mean_squared_error(y, ridge.predict(X)))
print("ridge coef:", ridge.coef_)

ridge score: 0.07079472336923387
ridge MSE: 0.08512320900955576
ridge coef: [0.02029078 0.013006   0.00838167 0.06745725 0.        ]


### Summary for Solar data: 
- Multiple Linear Regression - 0.0851232049849271
- Lasso Regression - 0.0916086156098969
- Ridge Regression - 0.08512320900955576

Thus since we want the smallest MSE of these, multiple linear regression performs the best 

##### Data set attributes: 
- small data set