# Template Regression

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Fixed seed: it seeds NumPy's global RNG here and is passed as `random_state`
# to the train/test split and to the tree-based estimators further down,
# so that repeated runs of the notebook give the same results
random_state = 67
np.random.seed(random_state)

## Data Exploration

In [None]:
# Read the data
# `url` is a template placeholder: set it to the location of the CSV file
# before running this cell (an empty string cannot be read)
url = ''
df = pd.read_csv(url)

In [None]:
# Check the first 5 rows of the dataset to get a feel for the columns and their values
df.head()

In [None]:
# Explore the distribution of every numeric column (target included) through summary statistics.
# The `count` row helps to spot missing values: a count lower than the number of rows
# means that the column contains nulls.
df.describe()

In [None]:
# Number of rows that contain at least one missing value:
# total rows minus the rows that survive `dropna()`
len(df) - len(df.dropna())

In [None]:
# Count the number of missing values in each column
df.isna().sum()

In [None]:
# Visualize the distribution of the features with box plots:
# useful to check for outliers and for features that live on different scales
df.boxplot(figsize=(15,10))
plt.show()

In [None]:
# Show the feature correlation matrix; it is especially useful to look at the
# correlation of each feature with the target feature.
import seaborn as sns
plt.figure(figsize=(10,7))
# numeric_only=True: categorical/string columns are only encoded later, in the
# Preprocessing section, and `DataFrame.corr()` raises on non-numeric columns
# in pandas >= 2.0 unless they are explicitly excluded.
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

## Preprocessing

In [None]:
# Drop every row that contains at least one null value
# NOTE: the surviving rows keep their original index labels, so the index
# may contain gaps after this step
df = df.dropna()

In [None]:
# (OPTIONAL) If there is a string variable, we need to encode it to numerical values
# `column_to_transform` is a placeholder: set it to the name of the column to encode.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target labels;
# for input features the OneHotEncoder / OrdinalEncoder cells below are the usual choice.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
column_to_transform = ''
df[column_to_transform] = le.fit_transform(df[column_to_transform].values)

In [None]:
# (OPTIONAL) use this to convert nominal labels to numerical values (one-hot encoding):
# the column is replaced by one 0/1 column per category
from sklearn.preprocessing import OneHotEncoder
one = OneHotEncoder()
column_to_transform = 'exemple_column'  # placeholder: name of the nominal column
# The encoder expects a 2D input (n_samples, n_features): select the column with
# double brackets to get a one-column DataFrame instead of a 1D array.
enc_data = one.fit_transform(df[[column_to_transform]])
category_names = list(one.categories_[0])
# Reuse df's index: `join` aligns on index labels, and after `dropna()` those
# labels are no longer 0..n-1, so a default RangeIndex would misalign the rows.
enc_df = pd.DataFrame(enc_data.toarray(), columns=category_names, index=df.index)
df = df.join(enc_df)
df = df.drop([column_to_transform], axis=1)
df.head()

In [None]:
# (OPTIONAL) use this to convert ordinal labels to numerical values
from sklearn.preprocessing import OrdinalEncoder
categories = ['bad','good','very good'] # example of ordinal categories, listed from lowest to highest
# `categories` must hold one list of categories per encoded column,
# hence the extra wrapping list for our single column.
oe = OrdinalEncoder(categories=[categories], dtype=int)
column_to_transform = 'col_name'  # placeholder: name of the ordinal column
# The encoder expects a 2D input and returns an (n_samples, 1) array:
# select with double brackets and flatten the result before assigning it back.
df[column_to_transform] = oe.fit_transform(df[[column_to_transform]]).ravel()


In [None]:
# Change the ranges of the features to be between 0 and 1
# NOTE: this is an alternative to the standardization cell below; both cells
# write their output to `df_processed`, so the last one executed wins.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Keep df's index (it has gaps after `dropna()`), so that the rows of
# `df_processed` can still be matched with the rows of `df`.
df_processed = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Data standardization
from sklearn.preprocessing import PowerTransformer,StandardScaler
from sklearn.pipeline import make_pipeline
preprocessor = make_pipeline(PowerTransformer(),StandardScaler())
df_processed = pd.DataFrame(preprocessor.fit_transform(df),columns=df.columns)

In [None]:
# Separate the feature matrix X from the target vector y
# `target` is a placeholder: set it to the name of the column to predict.
# NOTE(review): this reads `df`, not the scaled `df_processed` built in the
# preprocessing cells - confirm which of the two the models should be trained on.
target = ''
X = df.drop(target,axis=1)
y = df[target]

## Training

In [None]:
# Split the data: 70% of the rows for training, the rest for testing.
# The shared seed makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=random_state,
)
print(f'train size {len(X_train)}, test size {len(X_test)}')

In [None]:
# Comparison table: every model trained below appends one row with its
# name and its metrics computed on the test set
results = pd.DataFrame(columns=['Model','RMSE','R_square'])

### Univariate Linear Regressor

In [None]:
# Choose the feature that has the highest correlation with the target
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,root_mean_squared_error

feature_selected = 'F1'

# scikit-learn estimators want a 2D feature matrix: turn the single column
# into an array with one column and as many rows as samples
X_train_univariate = X_train[feature_selected].values.reshape(-1, 1)
X_test_univariate = X_test[feature_selected].values.reshape(-1, 1)

lr = LinearRegression()
lr.fit(X_train_univariate, y_train)
y_pred_univariate = lr.predict(X_test_univariate)

# regression function: y = coeff_univariate * x + intercept_univariate
coeff_univariate, intercept_univariate = lr.coef_[0], lr.intercept_

# Evaluate on the test set and append the row to the comparison table
rmse_univariate = root_mean_squared_error(y_test, y_pred_univariate)
r2_univariate = r2_score(y_test, y_pred_univariate)
results.loc[len(results)] = [
    f'Linear Univariate on {feature_selected}',
    rmse_univariate,
    r2_univariate,
]

### Multivariate Linear Regressor

In [None]:
# Linear regression fitted on all the features at once
from sklearn.linear_model import LinearRegression
# `mean_squared_error` stays imported because the following cells use it
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error

lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred_multivariate = lr.predict(X_test)

# regression function
coeff_multivariate = lr.coef_ # Coefficients, one per feature
intercept_multivariate = lr.intercept_ # Bias

# The second column of `results` is 'RMSE': store the root mean squared error
# (not the MSE) so that this row is comparable with the univariate one.
results.loc[len(results)] = [
    'Linear Multivariate',
    root_mean_squared_error(y_test,y_pred_multivariate),
    r2_score(y_test,y_pred_multivariate)
]

### Decision Tree Regressor

In [None]:
# Find the maximum depth of the tree by fitting it without any depth limit:
# this value is the upper bound of the depth search done in the next cells

from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=random_state)
dt.fit(X_train,y_train)
# The attribute of the fitted tree is `max_depth` (`max_dept` raises AttributeError)
max_depth = dt.tree_.max_depth
print("The maximum depth of the full Decision Tree Regressor is {}".format(max_depth))

In [None]:
# Now search the best tree depth with cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score,root_mean_squared_error

# Candidate depths: from a single split up to the depth of the unconstrained tree
paramGrid = {'max_depth':[*range(1,max_depth+1)]}

dt_gscv = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=random_state),
    param_grid=paramGrid,
    scoring='neg_mean_squared_error'
)
dt_gscv.fit(X_train,y_train)
# Best model found by the search, already refitted on the whole training set
dt_best = dt_gscv.best_estimator_
# The attribute of the fitted tree is `max_depth` (`max_dept` raises AttributeError)
best_max_depth = dt_best.tree_.max_depth
print("The optimal maximum depth for the decision tree is {}".format(best_max_depth))

y_pred_dt = dt_best.predict(X_test)
# The second column of `results` is 'RMSE': store the root mean squared error, not the MSE
results.loc[len(results)] = [
    'Decision Tree',
    root_mean_squared_error(y_test,y_pred_dt),
    r2_score(y_test,y_pred_dt)
]

### Random Forest

In [None]:
# Random forest whose maximum tree depth is tuned with cross validation
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,root_mean_squared_error

rf = RandomForestRegressor(random_state=random_state)
# Same depth range used for the single decision tree (`max_depth` is the depth
# of the unconstrained tree computed in the Decision Tree section)
param_grid_rf = {'max_depth':list(range(1,max_depth+1))}

rf_gscv = GridSearchCV(
    rf,
    param_grid=param_grid_rf,
    scoring = 'neg_mean_squared_error'
)
rf_gscv.fit(X_train,y_train)
# From here on `rf` is the best forest found by the search
rf = rf_gscv.best_estimator_

y_pred_rf = rf.predict(X_test)
# The second column of `results` is 'RMSE': store the root mean squared error, not the MSE
results.loc[len(results)] = [
    'Random Forest',
    root_mean_squared_error(y_test,y_pred_rf),
    r2_score(y_test,y_pred_rf)
]

### Polynomial

In [None]:
# Polynomial regression: a linear model fitted on polynomial expansions of the
# features, one model for each degree from 2 to 4
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score,root_mean_squared_error

degrees = list(range(2,5))
for degree in degrees:
    poly = PolynomialFeatures(degree=degree)
    # Fit the expansion on the training set only, then apply it to the test set
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    lr_poly = LinearRegression()
    lr_poly.fit(X_train_poly,y_train)
    y_pred_poly = lr_poly.predict(X_test_poly)

    # The second column of `results` is 'RMSE': store the root mean squared error, not the MSE
    results.loc[len(results)] = [
        f'Polynomial Regression degree {degree}',
        root_mean_squared_error(y_test,y_pred_poly),
        r2_score(y_test,y_pred_poly)
    ]

## Display Result

In [None]:
# Plot the Decision Tree selected by the grid search (`dt_best`)

from sklearn.tree import plot_tree
from matplotlib.pyplot import figure

# Large canvas, so that the nodes of the tree stay readable
figure(figsize = (20,15))
plot_tree(dt_best,
          feature_names=X.columns.to_list(),
          filled=True
          )

In [None]:
# Plot
# Only one feature fits on the x axis, so we choose the one with the highest
# correlation with the target
feature_to_plot = 'F1'

# Gather the chosen feature, the true values and every model's predictions in
# one table, then sort it by increasing feature value: the lines are drawn
# left to right and each prediction stays paired with its own sample.
plot_df = pd.DataFrame(
    {
        feature_to_plot: X_test[feature_to_plot],
        'y_test': y_test,
        'y_pred_univariate': y_pred_univariate,
        'y_pred_multivariate': y_pred_multivariate,
        'y_pred_dt': y_pred_dt,
        'y_pred_rf': y_pred_rf,
    },
    index=X_test.index,
).sort_values(by=feature_to_plot)

# (column of plot_df, legend label, line color) for every curve, in drawing order
curves = [
    ('y_test', 'True Values', 'black'),
    ('y_pred_univariate', 'Linear Univariate', 'blue'),
    ('y_pred_multivariate', 'Linear Multivariate', 'orange'),
    ('y_pred_dt', 'Decision Tree', 'green'),
    ('y_pred_rf', 'Random Forest', 'red'),
]

# Plot the true values and the predictions of the different models
plt.figure(figsize=(10,7))
for column, label, color in curves:
    plt.plot(plot_df[feature_to_plot], plot_df[column], label=label, color=color)
plt.xlabel(feature_to_plot)
plt.ylabel('Target')
plt.legend()
plt.title('True Values vs Predictions')
plt.show()


Other goodness-of-fit measures: F-test and p-value

In [None]:
# The professor did this in the lab but never explained the F-test in theory, only the p-value
import scipy.stats

# Example of calling f_test ->  f_test(y, y_pred, X.shape[1], X.shape[0])
def f_test(y_true, y_pred, n_var, n_obs):
    """Compute the F-statistic and its p-value for a fitted regression.

    The statistic compares the variance explained by the model with the
    residual variance (overall significance test of the regression).
    Reference: http://facweb.cs.depaul.edu/sjost/csc423/documents/f-test-reg.htm
    Requires: np (numpy) and scipy.stats

    Arguments:
    y_true: ground truth
    y_pred: predictions
    n_var: number of predicting variables
    n_obs: number of observations (the length of y_true and y_pred)

    Returns:
    F: F statistic
    p: p-value, i.e. the probability of observing a value at least as large
       as F under an F distribution with (n_var, n_obs - n_var - 1)
       degrees of freedom

    Raises:
    ValueError: if n_var < 1 or n_obs <= n_var + 1, because one of the two
       degrees of freedom would not be positive and the statistic is undefined
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    n_params = n_var + 1  # regression parameters: coefficients + intercept
    dof_model = n_params - 1  # degrees of freedom of the model (numerator)
    dof_error = n_obs - n_params  # degrees of freedom of the error (denominator)
    if dof_model < 1:
        raise ValueError("n_var must be at least 1")
    if dof_error < 1:
        raise ValueError("n_obs must be greater than n_var + 1")

    # Sum of squares explained by the model and sum of squared residuals
    ss_model = np.sum((y_pred - np.mean(y_true)) ** 2)
    ss_error = np.sum((y_true - y_pred) ** 2)

    ms_model = ss_model / dof_model
    ms_error = ss_error / dof_error
    F = ms_model / ms_error

    # Survival function = 1 - cdf, but evaluated directly: `1 - cdf` rounds to
    # exactly 0 as soon as the p-value drops below machine precision.
    p_value = scipy.stats.f.sf(F, dof_model, dof_error)
    return F, p_value