# DAP 4

# Imports

In [32]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

from sklearn import metrics

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [33]:
# grab data and split
df = pd.read_csv("../DAP2/processeddata/2018_Financial_Data.csv", index_col=0)
target_cols = ['priceCashFlowRatio', 
               'priceEarningsRatio', 
               'priceEarningsToGrowthRatio', 
               'priceBookValueRatio', 
               'currentRatio', 
               'quickRatio',
               'payoutRatio']
X = df[target_cols]
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# scale data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Polynomial data

kf = KFold(n_splits=10, random_state=None, shuffle=True) 

## Hyper-Parameter Finding

### Ridge Regression Model with Feature Normalization

In [34]:
alpha_range = [0, 1, 10, 20, 50, 100, 1000]

for alpha in alpha_range:
    # Preform K-Fold cross validation
    # Get average of R2 train and test, and RMSE
    
    r2_train_scores = []
    r2_val_scores = []
    rmse_scores = []
    
    for train_index, val_index in kf.split(X_train_scaled):    
        cv_X_train = X_train_scaled[train_index]
        cv_X_val   = X_train_scaled[val_index]
        cv_y_train = y_train[train_index]
        cv_y_val   = y_train[val_index]
        
        linridge = Ridge(alpha = alpha).fit(cv_X_train, cv_y_train)
        r2_train = linridge.score(cv_X_train, cv_y_train)
        r2_val = linridge.score(cv_X_val, cv_y_val)
         
        r2_train_scores.append(r2_train)
        r2_val_scores.append(r2_val)
        
        y_pred = linridge.predict(cv_X_val)
        rmse_scores.append(np.sqrt(metrics.mean_squared_error(cv_y_val, y_pred)))
    
    np.mean(r2_train_scores)
    np.mean(r2_val_scores)
    np.mean(rmse_scores)

### Lasso Regression Model with Feature Normalization

In [35]:
alpha_range = [0.5, 1, 2, 3, 5, 10, 20, 50]

for alpha in alpha_range:
    # Preform K-Fold cross validation
    # Get average of R2 train and test, and RMSE
    
    r2_train_scores = []
    r2_val_scores = []
    rmse_scores = []
    
    for train_index, val_index in kf.split(X_train_scaled):    
        cv_X_train = X_train_scaled[train_index]
        cv_X_val   = X_train_scaled[val_index]
        cv_y_train = y_train[train_index]
        cv_y_val   = y_train[val_index]
        
        linlasso = Lasso(alpha, max_iter = 10000).fit(cv_X_train, cv_y_train)
        r2_train = linlasso.score(cv_X_train, cv_y_train)
        r2_val = linlasso.score(cv_X_val, cv_y_val)
         
        r2_train_scores.append(r2_train)
        r2_val_scores.append(r2_val)
        
        y_pred = linlasso.predict(cv_X_val)
        rmse_scores.append(np.sqrt(metrics.mean_squared_error(cv_y_val, y_pred)))
    
    np.mean(r2_train_scores)
    np.mean(r2_val_scores)
    np.mean(rmse_scores)

### Polynomial Regression Model

In [36]:
degree_range = range(2, 5)

for degree in degree_range:
    
    r2_train_scores = []
    r2_val_scores = []
    rmse_scores = []
    
    for train_index, val_index in kf.split(X_train_scaled): # Scaled?
        cv_X_train = X_train_scaled[train_index]
        cv_X_val   = X_train_scaled[val_index]
        cv_y_train = y_train[train_index]
        cv_y_val   = y_train[val_index]
        
        poly_features = PolynomialFeatures(degree=degree, include_bias=False)
        cv_X_train_poly = poly_features.fit_transform(cv_X_train)
        cv_X_val_poly = poly_features.fit_transform(cv_X_val)
        
        polyreg = LinearRegression().fit(cv_X_train_poly, cv_y_train)
        
        r2_train = polyreg.score(cv_X_train_poly, cv_y_train)
        r2_val = polyreg.score(cv_X_val_poly, cv_y_val)
         
        r2_train_scores.append(r2_train)
        r2_val_scores.append(r2_val)
        
        y_pred = polyreg.predict(cv_X_val_poly)
        rmse_scores.append(np.sqrt(metrics.mean_squared_error(cv_y_val, y_pred)))
    
    np.mean(r2_train_scores)
    np.mean(r2_val_scores)
    np.mean(rmse_scores)

## Model Evaluation

In [37]:
# Split X, y again

### Multivariate Linear Regression Model

In [49]:
linreg = LinearRegression().fit(X_train, y_train)

r2_train_score = linreg.score(X_train, y_train)
r2_test_score = linreg.score(X_test, y_test)

y_pred = linreg.predict(X_test)
rmse_score = np.sqrt(metrics.mean_squared_error(y_test, y_pred))

### Multivariate Linear Regression Model with Gradient Descent

In [95]:
learning_rate = 0.05
n_iterations = 10
n = len(cv_X_train)
theta = np.random.randn(8,1) # Replace 8 with features number
        
X_b = np.c_[np.ones((len(X_train), 1)), X_train] # Training
X_new_b = np.c_[np.ones((len(X_test), 1)), X_test] # Test
        
for iteration in range(n_iterations): # Train Model
    gradients = 2/n * X_b.T.dot(X_b.dot(theta) - y_train)
    theta = theta - learning_rate * gradients

y_pred = X_new_b.dot(theta) # Prediction of theta for each feature

#y_test_b = y_test
#y_test_b.reshape(409, 1)

rmse_score = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
#print(rmse_score)

ValueError: y_true and y_pred have different number of output (1!=1227)

In [64]:
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new] # add x0 = 1 to each instance
y_predict = X_new_b.dot(theta)
y_predict

TypeError: 'builtin_function_or_method' object is not subscriptable

### Ridge Regression Model with Feature Normalization

In [None]:
linridge = Ridge(alpha = alpha).fit(X_train_scaled, y_train)

### Lasso Regression Model with Feature Normalization

In [None]:
linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, y_train)

### Polynomial Regression Model

In [None]:
poly_features = PolynomialFeatures(degree=degree, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.fit_transform(X_test)
        
polyreg = LinearRegression().fit(X_train_poly, y_train)

In [None]:



# R^2

print('R-squared score (training): {:.3f}'.format(linreg.score(X_train, y_train)))
# Do testing too


# RMSE

print('RMSE: {:.3f}'.format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

# Plot learning curves (training=validation RMSE plots)
