In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import required library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the possum dataset from the Kaggle input directory and display the frame.
df = pd.read_csv('../input/openintro-possum/possum.csv')
df

In [None]:
# Print dtypes and non-null counts per column to spot missing values.
df.info()
# some columns contain null values

## Basic EDA

- See age distribution per values in each categorical column
- See correlation of different numerical columns with age column

In [None]:
# Categorical columns: print the distinct values found in each one.
cat = ['sex', 'Pop', 'site']

for col in cat:
    levels = df[col].unique()
    print(f'In {col}: {levels}')

In [None]:
# One horizontal box plot of age per categorical column, stacked in three rows.
fig, ax = plt.subplots(3, figsize=(10, 10))
ax = ax.ravel()

for axis, col in zip(ax, cat):
    sns.boxplot(x='age', y=col, data=df, ax=axis)

## Categorical findings

- Females have a higher median age than males

- Vic shows more variation in age

- site 2 has significantly lower median age than other sites

In [None]:
# Store 'site' as strings so it is handled as a categorical column (it is listed in `cat`).
df['site'] = df['site'].map(str)
df.info()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds string columns ('Pop', 'sex' and the string-typed 'site');
# DataFrame.corr() raises on those in pandas >= 2.0, so select numeric columns explicitly.
fig, ax = plt.subplots(figsize=(15, 15))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True, ax=ax)  # draw on the axes created above
plt.show()
# body dimensions are correlated with age only to a limited degree

## Numerical findings

- Body dimensions are correlated with age, but only to a limited degree

## Distribution, skewness & log-transform

- Some regression techniques don't work well with skewed data, so we check here whether that is the case.  We will inspect each numerical column and see whether a log-transform is appropriate

- We can also experiment with log-transforming the outcome (age)

In [None]:
# Numerical (body-measurement) columns.
num = ['hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth',
       'earconch', 'eye', 'chest', 'belly']

# Histogram + KDE of every numerical column on a 3x3 grid; each title shows the skewness.
fig, ax = plt.subplots(3, 3, figsize=(10, 10), constrained_layout=True)
ax = ax.ravel()

for axis, col in zip(ax, num):
    sns.histplot(x=col, data=df, ax=axis, kde=True)
    skewness = df[col].skew(axis=0)
    axis.set_title(f'Skewness:{skewness}')

# Some regression techniques don't work well with skewed data, so we check for skew here.

In [None]:
# Numerical (body-measurement) columns.
num = ['hdlngth', 'skullw', 'totlngth', 'taill', 'footlgth',
       'earconch', 'eye', 'chest', 'belly']

# Add a log(x + 1) copy of every numerical column (suffix '_log') to df and plot its
# histogram + KDE on a 3x3 grid; each title shows the skewness after the transform.
fig, ax = plt.subplots(3, 3, figsize=(10, 10), constrained_layout=True)
ax = ax.ravel()

for axis, col in zip(ax, num):
    log_col = f'{col}_log'
    df[log_col] = np.log(df[col] + 1)
    sns.histplot(x=log_col, data=df, ax=axis, kde=True)
    axis.set_title(f'Skewness:{df[log_col].skew(axis=0)}')
    


In [None]:
# Target variable: distribution of age, with its skewness in the title.
age_ax = sns.histplot(x='age', data=df, kde=True)
age_ax.set_title(f'Skewness:{df.age.skew(axis=0)}')
plt.show()

In [None]:
# Log-transform the target as log(age + 1) and plot the transformed distribution.
df['age_log'] = np.log(df['age'] + 1)
age_log_ax = sns.histplot(x='age_log', data=df, kde=True)
age_log_ax.set_title(f'Skewness:{df.age_log.skew(axis=0)}')
plt.show()

## Log-transform findings

- Skewness improves the best for skullw, taill, earconch, eye, chest

- We can inverse this later post-prediction with **np.exp(x) - 1**

In [None]:
# Preview the first rows, which now include the '*_log' columns added above.
df.head()

## Handling missing data

In [None]:
# Handling missing data: drop every row with a missing value, since only a few rows are affected.
# The surviving rows keep their original index labels (the index is not reset).
df = df.dropna(axis=0)
df.info()

## All about Linear Regression

In the book "Introduction to Machine Learning with Python", there are many techniques to improve a basic Linear Regression model, including the following:

- comparing results with just log, no log and mix of log and no log on Linear Regression

- trying binning and discretization

- trying polynomial features

In the next section of the notebook, we will try each of these strategies and see how the Linear Regression performs

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# List every column name (original and log-transformed) to pick feature sets from.
df.columns

In [None]:
# Feature-set variations used to compare log-transformed and untransformed inputs.
# Every list holds the three categorical columns plus one version of each body measurement.

# All measurements log-transformed.
X_log = ['site', 'Pop', 'sex', 'hdlngth_log', 'skullw_log', 'totlngth_log', 'taill_log',
         'footlgth_log', 'earconch_log', 'eye_log', 'chest_log', 'belly_log']

# All measurements on their original scale.
X_regular = ['site', 'Pop', 'sex', 'hdlngth', 'skullw', 'totlngth', 'taill',
             'footlgth', 'earconch', 'eye', 'chest', 'belly']

# Mix: log version for the columns whose skewness improved most under the transform
# (skullw, taill, earconch, eye, chest); original scale for the rest.
# Fix: this list used to contain 'taill' alongside 'taill_log' and left out 'hdlngth',
# so tail length entered the model twice and head length never did.
X_mix = ['skullw_log', 'taill_log', 'earconch_log', 'eye_log', 'chest_log',
         'hdlngth', 'totlngth', 'footlgth', 'belly',
         'site', 'Pop', 'sex']

# Target column names: raw age and its log(x + 1) transform.
y_regular = 'age'
y_log = 'age_log'

In [None]:
# Base case: untransformed features and untransformed target.
X, y = df[X_regular], df[y_regular]

In [None]:
# One-hot encode the string-valued columns of X, then display the encoded frame.
X = pd.get_dummies(X)
X

In [None]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# Fit on the training split, then report the test RMSE next to the standard deviation
# of age so the error can be judged against the spread of the target.
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE:{rmse}')
print(f'Standard Deviation of Age:{df.age.std()}')

## Base Case results

- RMSE = 1.89

- We will explore how each strategy worsens or improves the model

In [None]:
# Log case: log-transformed measurements and log-transformed target.
X = pd.get_dummies(df[X_log].copy())
y = df[y_log].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Undo the log(x + 1) transform on both truth and prediction so the RMSE is on the
# original age scale and comparable with the base case.
y_test = np.exp(y_test) - 1
y_pred = np.exp(y_pred) - 1

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE:{rmse}')
print(f'Standard Deviation of Age:{df.age.std()}')

In [None]:
# Mix case: mixed log / raw measurements with the log-transformed target.
X = pd.get_dummies(df[X_mix].copy())
y = df[y_log].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Undo the log(x + 1) transform so the RMSE is reported on the original age scale.
y_test = np.exp(y_test) - 1
y_pred = np.exp(y_pred) - 1

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE:{rmse}')
print(f'Standard Deviation of Age:{df.age.std()}')

In [None]:
# Mix v2: mixed log / raw measurements with the untransformed target,
# so no inverse transform is needed before scoring.
X = pd.get_dummies(df[X_mix].copy())
y = df[y_regular].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE:{rmse}')
print(f'Standard Deviation of Age:{df.age.std()}')

## log vs not-log X

- So far, in this dataset, log does not help at all.  However, that may not be the case in other datasets.  

- Data science, IMO, is all about trial and error.  Considering the computing resources required by Linear Regression, feel free to recommend or try more combinations!

In [None]:
#recall that distribution of 'footlgth' & 'earconch' are like 2 bell curves joined together, let's see if binning helps with performance

## Trying binning

- recall that distribution of 'footlgth' & 'earconch' are like 2 bell curves joined together, let's see if binning helps with performance

In [None]:
# Display the list of numerical column names defined earlier.
num #columns of numerical values

In [None]:
# Start again from copies of the untransformed features and target;
# 'footlgth' and 'earconch' are the columns to be binned.
X=df[X_regular].copy()
y=df[y_regular].copy()

In [None]:
# Base case: bin each column with 2 equally spaced edges between its min and max,
# storing the bin label as a string in '<col>_binned'.
# NOTE(review): np.linspace(lo, hi, n) returns n edges (n - 1 intervals), and
# np.digitize with its default right=False gives label n only to values equal to the
# last edge. With n = 2 every row except the column maximum therefore lands in bin '1'.

to_bin = ['footlgth', 'earconch']

for col in to_bin:
    edges = np.linspace(X[col].min(), X[col].max(), 2)
    labels = np.digitize(X[col], bins=edges)
    X[f'{col}_binned'] = pd.Series(labels, index=X.index).map(str)
    
    

In [None]:
# One-hot encode the string-valued columns (including the new '*_binned' labels) and display X.
X = pd.get_dummies(X)
X

In [None]:
# Drop the original (unbinned) columns so only their bin indicators remain,
# then split, fit and score exactly as in the base case.
X = X.drop(columns=to_bin)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE:{rmse}')
print(f'Standard Deviation of Age:{df.age.std()}')

## Base-case binning results

- The base case of 2 bins does not make much difference in performance

- Let's see if more bins can help.

In [None]:
# Sweep the binning resolution and report the test RMSE for each setting.
# NOTE(review): bin_n is the number of linspace edges, so it yields bin_n - 1 intervals
# plus a label bin_n that np.digitize assigns only to values equal to the column maximum.
bin_num = list(range(2, 16))  # testing number of bins
to_bin = ['footlgth', 'earconch']  # columns to bin

for bin_n in bin_num:
    # Rebuild the features from scratch on every pass so runs do not affect each other.
    X = df[X_regular].copy()
    y = df[y_regular].copy()

    for col in to_bin:
        edges = np.linspace(X[col].min(), X[col].max(), bin_n)
        labels = np.digitize(X[col], bins=edges)
        X[f'{col}_binned'] = pd.Series(labels, index=X.index).map(str)

    # Replace the original columns with their bin indicators.
    X = pd.get_dummies(X.drop(columns=to_bin))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'At bin = {bin_n}, RMSE:{rmse}')
    print(f'Standard Deviation of Age:{df.age.std()}\n')

# default RMSE:1.8887116003974755
# seems at bin = 4, we have the most improvements

## Binning conclusion

- Recall base-case default RMSE = 1.8887116003974755

- At bin = 4, we have the most improvements

- Binning is a powerful tool to improve performance of Linear Regression

## Binning and interactions

- In the above, we dropped the original data when we binned their columns

- However, we can include the original data back in after getting dummies

- Without adding back, we predict a value for each bin.  However, we may also want to capture the slope for each bin

- Besides adding back, we can compute a product of the bin dummies and the original data so we can capture a unique slope for each of the bin

In [None]:
# Adding back: keep the original 'footlgth' / 'earconch' columns next to their bin indicators.
to_bin = ['footlgth', 'earconch']

X = df[X_regular].copy()
y = df[y_regular].copy()

for col in to_bin:
    # 4 linspace edges, the setting that scored best in the sweep above.
    edges = np.linspace(X[col].min(), X[col].max(), 4)
    labels = np.digitize(X[col], bins=edges)
    X[f'{col}_binned'] = pd.Series(labels, index=X.index).map(str)

# The to_bin columns are deliberately not dropped this time.
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
print(f'Standard Deviation of Age: {df.age.std()}\n')

# seems to perform worse

In [None]:
# Product of the bin dummies and the original data: first rebuild the features with
# 4-edge bin indicators, keeping the original columns for the multiplication that follows.
to_bin = ['footlgth', 'earconch']

X = df[X_regular].copy()
y = df[y_regular].copy()

for col in to_bin:
    edges = np.linspace(X[col].min(), X[col].max(), 4)
    labels = np.digitize(X[col], bins=edges)
    X[f'{col}_binned'] = pd.Series(labels, index=X.index).map(str)

X = pd.get_dummies(X)

# Show the eight bin-indicator columns produced for the two binned features.
bin_dummy_cols = [f'{name}_binned_{label}' for name in to_bin for label in range(1, 5)]
X[bin_dummy_cols]

In [None]:
# Interaction features: multiply each original column by each of its bin indicators,
# so that a linear model can learn a separate slope inside every bin.
# dummy[i] holds the indicator columns that belong to original[i].
dummy = [['footlgth_binned_1', 'footlgth_binned_2', 'footlgth_binned_3',
          'footlgth_binned_4'],
         ['earconch_binned_1', 'earconch_binned_2',
          'earconch_binned_3', 'earconch_binned_4']]

original = ['footlgth', 'earconch']

for source_col, dummy_cols in zip(original, dummy):
    for dummy_col in dummy_cols:
        # Fix: the name used to interpolate a one-element list, producing labels such as
        # "footlgth*['footlgth_binned_1']"; interpolate the column name itself.
        X[f'{source_col}*{dummy_col}'] = X[source_col] * X[dummy_col]

In [None]:
# Remove the raw 'footlgth' / 'earconch' columns now that their products with the
# bin indicators exist, then display the resulting feature frame.
# Note: this mutates X in place, so re-running the cell raises KeyError.
X.drop(original,axis=1,inplace=True)
X

In [None]:
# Split, fit and score the model that uses bin indicators plus per-bin interaction terms.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
print(f'Standard Deviation of Age: {df.age.std()}\n')



## Interaction Findings

- Just adding back does not improve performance, worsens it in fact

- However, using a product of the original data and the dummy variables does improve upon the baseline model, although not to the degree of just binning

## Polynomial Features

- we can also use polynomial to expand the features we have.  Let's see if that improves the performance

In [None]:
# Given what we've learned so far, go forward with 4-edge binning and drop the original
# binned columns. The bin labels stay as strings here; they are one-hot encoded later.
to_bin = ['footlgth', 'earconch']

X = df[X_regular].copy()
y = df[y_regular].copy()

for col in to_bin:
    edges = np.linspace(X[col].min(), X[col].max(), 4)
    labels = np.digitize(X[col], bins=edges)
    X[f'{col}_binned'] = pd.Series(labels, index=X.index).map(str)

X = X.drop(columns=to_bin)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Baseline: degree-2 polynomial expansion; include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# Inspect the feature frame after binning.
X

In [None]:
# Continuous measurements to expand with polynomial terms
# (the binned columns and the categoricals are left out).
to_transform = ['hdlngth',
                'skullw',
                'totlngth',
                'taill',
                'eye',
                'chest',
                'belly']

poly.fit(X[to_transform])

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old name on releases that predate it.
if hasattr(poly, 'get_feature_names_out'):
    poly_columns = poly.get_feature_names_out(to_transform)
else:
    poly_columns = poly.get_feature_names(to_transform)

# Keep X's index: rows were dropped earlier without resetting the index, so a default
# RangeIndex would not line up with X (or y) in any later label-aligned operation.
X_poly = pd.DataFrame(poly.transform(X[to_transform]),
                      columns=poly_columns,
                      index=X.index)
X_poly

In [None]:
# Adding back the categorical and binned columns from before.
add_back = ['site','Pop','sex','footlgth_binned','earconch_binned']

# Attach by row position rather than by index label. X keeps the gapped index left by the
# earlier dropna, so a label-aligned assignment onto a frame with a different index would
# shift rows and introduce NaN. Copying X_poly's index onto the slice first is correct
# whether or not X_poly already shares X's index.
categorical = X[add_back].copy()
categorical.index = X_poly.index
X_poly[add_back] = categorical
X_poly

In [None]:
# One-hot encode the string-valued columns of the polynomial frame, then split, fit and score.
X_poly = pd.get_dummies(X_poly)

X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, test_size=0.33, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')
print(f'Standard Deviation of Age: {df.age.std()}\n')

In [None]:
# Sweep the polynomial degree and report the test RMSE for each setting.
degrees = [2,3,4,5,6,7,8,9,10]

# Continuous measurements that receive polynomial terms.
to_transform = ['hdlngth',
                'skullw',
                'totlngth',
                'taill',
                'eye',
                'chest',
                'belly']

# Categorical / binned columns that are re-attached unchanged after the expansion.
add_back = ['site','Pop','sex','footlgth_binned','earconch_binned']

for d in degrees:

    X_preprocessed = X.copy() #with the bins included

    poly = PolynomialFeatures(degree=d, include_bias=False)
    poly.fit(X_preprocessed[to_transform])

    # get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
    # and fall back to the old name on releases that predate it.
    if hasattr(poly, 'get_feature_names_out'):
        poly_columns = poly.get_feature_names_out(to_transform)
    else:
        poly_columns = poly.get_feature_names(to_transform)

    # Reuse the source index so the add-back below (label-aligned) matches row for row.
    X_poly = pd.DataFrame(poly.transform(X_preprocessed[to_transform]),
                          columns=poly_columns,
                          index=X_preprocessed.index)

    # Fix: add_back was defined but never applied, so every degree was scored without
    # the categorical and binned columns and get_dummies below had nothing to encode.
    X_poly[add_back] = X_preprocessed[add_back]

    X_poly = pd.get_dummies(X_poly)

    X_train, X_test, y_train, y_test = train_test_split(X_poly, y,
                                                        test_size=0.33, random_state=42)
    lr = LinearRegression()

    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)

    print(f'At degree = {d}, RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    print(f'Standard Deviation of Age: {df.age.std()}\n')

## Polynomial Findings

- In this case, adding features that raise the original data to higher degrees does not seem to help with performance.  However, that may not be the case for other datasets

## Conclusion

**In this notebook, we have done:**

- Implementing a basic Linear Regression to predict the age of possums

- Using binning, binning-interaction and polynomial techniques to improve upon a basic Linear Regression model

- We learned that binning is a powerful technique to boost performance for Linear Regression in this dataset

- Most of the techniques in the notebook have been inspired by the book *"Introduction to Machine Learning with Python" by Andreas C. Müller and Sarah Guido* 