## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style("whitegrid")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Remove the column display limit so every column of a DataFrame is shown.
pd.set_option('display.max_columns', None)

In [None]:
# Load the partially processed dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer='unit4_partially_processed.csv')
data.head()

### Feature engineering - a few more examples

Add your notes here : 



In [None]:
# Number of rows whose DOB is recorded as 0.
data.loc[data['DOB'] == 0].shape[0]

NOTE: In the DOB column the format is YYMM, so the first two digits give the year of birth. Values whose text is shorter than four characters (for example 0) cannot be parsed and are treated as missing.

In [None]:
def year(x):
    """Return the first two characters of ``str(x)`` as an int.

    Values whose string form has fewer than four characters yield ``np.nan``.
    """
    text = str(x)
    if len(text) >= 4:
        return int(text[:2])
    return np.nan

In [None]:
import re


def year(x):
    """Extract a two-digit year from a ``YYMM``-style value.

    Parameters
    ----------
    x : any
        Value to parse; it is converted with ``str`` first (e.g. ``3712``).

    Returns
    -------
    int or float
        The first pair of consecutive digits found in ``str(x)`` as an int.
        ``np.nan`` is returned when the string is shorter than four
        characters or contains no pair of consecutive digits.
    """
    text = str(x)
    if len(text) < 4:
        return np.nan
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(r'\d\d', text)
    if match is None:
        # No digit pair to parse: report as missing instead of raising.
        return np.nan
    return int(match.group())

NOTE : our reference year is 1997 because the data is from a 1997 study.

In [None]:
# Two-digit birth year for every row, then the age relative to reference year 97.
data['year'] = data['DOB'].map(year)
data['age'] = 97 - data['year']

# Rows where no year could be extracted get the mean of the known ages.
mean_age = data['age'].mean()
data['age'] = data['age'].fillna(mean_age)

In [None]:
# DOB and the helper column 'year' have served their purpose: the information
# we need from them now lives in 'age', so both columns are removed together.
data = data.drop(columns=['DOB', 'year'])

In [None]:
# Preview the first five rows after the feature-engineering steps.
data.head(n=5)

### Feature selection- the challenge : which columns should we keep for our model?

- tip 1 - check null values to drop a column 
- tip 2 - sense check which columns are likely to be useful to answer the question at hand... 
- tip 3 - check multicollinearity for numerical variables using correlation/ heat maps etc 

Things you can do : (your notes here)

- 
- 
- 





In [None]:
# Correlation matrix of the numerical candidate features.
numeric_columns = ['INCOME', 'HV1', 'HV2', 'IC1', 'IC2', 'IC3', 'IC4',
                   'NUMPROM', 'CARDPROM', 'NGIFTALL', 'TIMELAG', 'AVGGIFT']
# .copy() makes data_corr an independent frame rather than a slice of `data`,
# so modifying it later does not raise SettingWithCopyWarning.
data_corr = data[numeric_columns].copy()
corr_matrix = data_corr.corr(method='pearson')

fig, ax = plt.subplots(figsize=(10, 8))
# Draw explicitly on the axes created above instead of relying on the
# implicit "current axes".
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

### Why is multicollinearity a problem for linear regression?

Your notes here : 

- 
- 
- 


### How does the correlation matrix help ? 

Your notes here : 

- 
- 
- 



In [None]:
# Display the Pearson correlation matrix itself as a plain numeric table.
corr_matrix # numeric less colourful version

### what can you see? do we need to investigate further ? 

what can we do to confirm the correlation? 

your notes here : 

- 
- 
- 


In [None]:
# Joint plot of HV2 against HV1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="HV1",
    y="HV2",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)


In [None]:
# Joint plot of IC1 against HV1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="HV1",
    y="IC1",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)

In [None]:
# Joint plot of IC2 against HV1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="HV1",
    y="IC2",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)

In [None]:
# Joint plot of IC3 against HV1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="HV1",
    y="IC3",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)

In [None]:
# Joint plot of IC4 against HV1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="HV1",
    y="IC4",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)

In [None]:
# Joint plot of IC2 against IC1 with a regression fit drawn in black.
sns.jointplot(
    data=data_corr,
    x="IC1",
    y="IC2",
    kind="reg",
    joint_kws={"line_kws": {"color": "black"}},
)

### what if anything has this revealed ? Can we drop some features at this stage? 

your notes here : 

- 
- 
- 



#### what about calculating a line of best fit using linear regression, between two variables ? 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



### what is r2 score? and can I 100% rely on it?
https://blog.minitab.com/blog/adventures-in-statistics-2/regression-analysis-how-do-i-interpret-r-squared-and-assess-the-goodness-of-fit

your notes here : 

- 
- 
- 



### what is VIF? 
https://www.statisticshowto.com/variance-inflation-factor/

your notes here : 

- 
- 
- 



In [None]:
# Regress HV2 on HV1 alone, then turn the fit's R2 into a VIF via 1 / (1 - R2).
predictor = data_corr[['HV1']]
target = data_corr[['HV2']]

model = LinearRegression()
model.fit(predictor, target)

model_r2 = model.score(predictor, target)
model_vif = 1 / (1 - model_r2)

print("The R2 of the model is: %5.3f" % (model_r2))
print("The VIF of the model is: %5.3f" % (model_vif))

In [None]:
# Regress IC2 on IC1 alone, then turn the fit's R2 into a VIF via 1 / (1 - R2).
predictor = data_corr[['IC1']]
target = data_corr[['IC2']]

model = LinearRegression()
model.fit(predictor, target)

model_r2 = model.score(predictor, target)
model_vif = 1 / (1 - model_r2)

print("The R2 of the model is: %5.3f" % (model_r2))
print("The VIF of the model is: %5.3f" % (model_vif))

#### Let's see how we can automate some feature selection based on VIF.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [None]:
# (rows, columns) of the numerical feature subset.
data_corr.shape

NOTE: before using these techniques you must ensure there are no nulls or NaNs in your data, as these functions cannot handle missing values.

In [None]:
# Missing values per column, largest count first.
data_corr.isna().sum().sort_values(ascending=False)

As we can see, there are many missing values in TIMELAG. We have already been through various methods to interpolate, predict, or drop such values.
Let's use the simple replace-with-mean technique for this exercise.

In [None]:
# Replace the missing values in each column with that column's mean.
# The result is assigned back instead of using inplace=True: data_corr was
# built by selecting columns from `data`, and modifying such a frame in place
# triggers pandas' SettingWithCopyWarning.
data_corr = data_corr.fillna(data_corr.mean())


In [None]:
# Remember to check this has worked as expected (e.g. count the remaining nulls per column again).

In [None]:
# VIF of every column in the design matrix, keyed by column name.
# The VIF routine runs OLS but does not add an intercept term itself, so the
# constant column is appended explicitly with add_constant first.
data_corr = add_constant(data_corr)

vif = {}
for position, column_name in enumerate(data_corr.columns):
    vif[column_name] = variance_inflation_factor(np.array(data_corr), position)


In [None]:
# Code to use the variance_inflation_factor technique to remove highly correlated columns.
# On each pass the feature with the largest VIF is dropped, until no feature's
# VIF exceeds the threshold.

threshold = 50
data_corr = add_constant(data_corr)
flag = True
while flag is True:
    flag = False
    design = np.array(data_corr)
    # VIF of every feature column. The intercept column ('const') stays in the
    # design matrix but is never a removal candidate: dropping it would leave
    # the remaining VIFs computed without an intercept.
    values = {
        column: variance_inflation_factor(design, position)
        for position, column in enumerate(data_corr.columns)
        if column != 'const'
    }
    if values:
        column_name = max(values, key=values.get)
        if values[column_name] > threshold:
            data_corr = data_corr.drop([column_name], axis=1)
            flag = True

#### Summary re effects of Multicollinearity:

Your Notes here 

-

-

-



# Activity

Comparing the VIF method and the correlation matrix to perform feature selection.

Do you think it is better to use VIF or correlation matrix threshold for feature selection? 

In [None]:
# using corr
# Repeatedly remove one column of a strongly correlated pair until no pair of
# remaining columns has an absolute correlation above the threshold.

corr_threshold = 0.9
flag = True
while flag is True:
    flag = False
    for i in range(1, corr_matrix.shape[1]):
        # Correlations of column i with every earlier column. Absolute values
        # are used so that strong negative correlations are caught as well.
        earlier = corr_matrix.iloc[i, :i].abs()
        if earlier.max() > corr_threshold:
            col_name = earlier.idxmax()
            print(col_name)
            # Remove the column from both axes so the matrix stays square.
            corr_matrix = corr_matrix.drop(index=col_name, columns=col_name)
            flag = True
            # Positions have shifted after the drop: restart the scan.
            break

print(corr_matrix.columns)

In [None]:
# using VIF
# On each pass the feature with the largest VIF is dropped, until no feature's
# VIF exceeds the threshold.

threshold = 50
data_corr = add_constant(data_corr)
flag = True
while flag is True:
    flag = False
    design = np.array(data_corr)
    # VIF of every feature column. The intercept column ('const') stays in the
    # design matrix but is never a removal candidate: dropping it would leave
    # the remaining VIFs computed without an intercept.
    values = {
        column: variance_inflation_factor(design, position)
        for position, column in enumerate(data_corr.columns)
        if column != 'const'
    }
    if values:
        column_name = max(values, key=values.get)
        if values[column_name] > threshold:
            data_corr = data_corr.drop([column_name], axis=1)
            flag = True

print(data_corr.columns)

### Chi-square tests for independence of categorical variables

Your notes here : 

- 
-
-
 

https://towardsdatascience.com/chi-square-test-for-feature-selection-in-machine-learning-206b1f0b8223

In [None]:
# Contingency table of DOMAIN (rows) against RFA_2 (columns), without totals.
data_crosstab = pd.crosstab(
    index=data['DOMAIN'],
    columns=data['RFA_2'],
    margins=False,
)
data_crosstab

What did the contingency table show ? 

your notes here 

- 
-
- 

H0 (Null Hypothesis) - assumes that there is no association between the two variables.

Ha (Alternate Hypothesis) - assumes that there is an association between the two variables. 

#### What are the three important values that we measure in order to calculate the Chi-square test statistic?

Your notes here: 
    
    - 
    - 
    - 

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test of independence on the contingency table, with the
# continuity correction switched off.
chi2_contingency(observed=data_crosstab, correction=False)


#### What did you learn using the chi squared test and are there any further columns/features we could drop at this stage ? 

Your notes here : 

-
-
-