# Feature Engineering

In [202]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [203]:
# Read the dementia dataset CSV into `df` and preview its first five rows.
df = pd.read_csv(
    '../input/dementia-prediction-dataset/dementia_dataset.csv'
)
df.head(5)

In [204]:
df.shape

#### ENCODING CATEGORICAL DATA

#### Two categorical columns: M/F and Group.

In [205]:
# from sklearn.preprocessing import OneHotEncoder

# # Create instance of OneHotEncoder.
# ohe = OneHotEncoder()

# # Perform OneHotEncoding on the Group column.
# df_ohe1 = pd.DataFrame(ohe.fit_transform(df[['Group']]).toarray(), columns=ohe.categories_)

# # Merge OneHotEncoded column with df dataframe.
# df2 = df.join(df_ohe1)

# # View dataframe.
# df2.head()

In [206]:
# # Perform OneHotEncoding on the M/F column.
# df_ohe2 = pd.DataFrame(ohe.fit_transform(df[['M/F']]).toarray(), columns=ohe.categories_)

# # Merge OneHotEncoded column with dataframe.
# df_final = df2.join(df_ohe2)

# # View final dataframe.
# # df_final

In [207]:
# df = df_final

In [208]:
# One-hot encode the two categorical columns and append the indicator
# columns to the right of the existing frame (Group first, then M/F).
group_dummies = pd.get_dummies(df['Group'])
sex_dummies = pd.get_dummies(df['M/F'])
df = pd.concat([df, group_dummies, sex_dummies], axis=1)

#### We can decide to drop the Group and M/F columns since they have been encoded.

In [209]:
# The raw categorical columns are redundant once their indicator columns exist.
df.drop(columns=['Group', 'M/F'], inplace=True)
df

For __Feature Engineering__, every statistic (for example an imputation median) should be computed from the __train data__ only, and those values should then be applied unchanged to both the train and the test set. This keeps information from the test set out of the fitted transformations and is the best way to prevent overfitting later in the model-building part.

In [210]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing: CDR is the target, every other column is a feature.
# random_state pins the split; shuffle=True was added explicitly by the author.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='CDR'),
    df['CDR'],
    test_size=0.2,
    random_state=2,
    shuffle=True,
)

In [211]:
X_train.shape , X_test.shape

### 1) Missing Value Imputation
- by Median Imputation method

In [212]:
# Columns that contain at least one null anywhere in the full frame ...
missing_variables = df.columns[df.isnull().any()].tolist()
# ... and, for those columns, the fraction of nulls within the training split.
X_train[missing_variables].isnull().mean()

So, in the training split, `SES` has 5% missing values and `MMSE` has .05% missing values.

In [213]:
# The SES median is learned on the training split only and reused for both splits.
median_SES = X_train['SES'].median()
for frame in (X_train, X_test):
    frame['SES_imputed'] = frame['SES'].fillna(median_SES)

In [214]:
# The MMSE median is learned on the training split only (never on the full `df`,
# which would leak test rows into the statistic) and reused for both splits.
median_MMSE = X_train['MMSE'].median()
X_train['MMSE_imputed'] = X_train['MMSE'].fillna(median_MMSE)
# Fill test rows with the MMSE median; the SES median does not belong here.
X_test['MMSE_imputed'] = X_test['MMSE'].fillna(median_MMSE)

Two new variables (`SES_imputed` and `MMSE_imputed`) are created for the analysis; the original `SES` and `MMSE` columns are removed later.

In [215]:
X_train.head()

In [216]:
X_test.head()

__Checking the distribution after imputation__

In [217]:
# Compare the SES density before and after median imputation on a single axes.
# No twin axes is created: an unused twin would become the current axes, receive
# both curves and leave the primary axes empty with an unrelated 0-1 scale.
fig, ax = plt.subplots(figsize=(8, 5))

X_train['SES'].plot.density(ax=ax, color='red', label='Original Distribution')
X_train['SES_imputed'].plot.density(ax=ax, color='blue', label='Imputed Distribution')

ax.set_xlabel('SES')
ax.set_title('SES')
ax.legend()
plt.show()

In [218]:
# Compare the MMSE density before and after median imputation on a single axes.
# No twin axes is created: an unused twin would become the current axes, receive
# both curves and leave the primary axes empty with an unrelated 0-1 scale.
fig, ax = plt.subplots(figsize=(8, 5))

X_train['MMSE'].plot.density(ax=ax, color='red', label='Original Distribution')
X_train['MMSE_imputed'].plot.density(ax=ax, color='blue', label='Imputed Distribution')

ax.set_xlabel('MMSE')
ax.set_title('MMSE')
ax.legend()
plt.show()

We can see that the original distribution of the variable `SES` is distorted by the imputation, as 5% of its values are missing.

This is not the case for the `MMSE` variable.

__Variance__

In [219]:
X_train['SES'].var() , X_train['SES_imputed'].var()  
# so we can also see the change in the variance

In [220]:
X_train['MMSE'].var() , X_train['MMSE_imputed'].var()  
# almost same 

__Covariance__

In [221]:
# X_train still holds string columns (Subject ID, MRI ID, Hand) at this point, and
# recent pandas versions raise when cov() meets non-numeric data. Restrict the
# covariance to numeric and boolean columns explicitly.
X_train.select_dtypes(include=['number', 'bool']).cov()

We can see that the covariance also changes for the `SES` variable, whereas for `MMSE` it is almost the same.

Given all the observations made after the imputation, we conclude that an additional missing indicator should be created for the variable `SES`.

In [222]:
# Flag rows whose original SES value is null: 1 for missing and 0 for not missing.
X_train['missing_indicator_SES'] = X_train['SES'].isnull().astype(int)
X_test['missing_indicator_SES'] = X_test['SES'].isnull().astype(int)

In [223]:
X_train.head()

In [224]:
X_test.head()

In [225]:
# The raw SES / MMSE columns are replaced by their *_imputed counterparts in both splits.
for frame in (X_train, X_test):
    frame.drop(columns=['SES', 'MMSE'], inplace=True)

In [226]:
X_train.head()

In [227]:
X_test.head()

In [228]:
X_train.isnull().sum()

In [229]:
X_test.isnull().sum()

__So there are now no missing data in the dataset__

In [230]:
# Multi-class target: CDR 0.0 -> 0, 0.5 -> 1, and both 1.0 and 2.0 -> 2, cast to int
# so the models do not see CDR as a continuous value.
# NOTE(review): the earlier comment spoke of four classes (0-3), but this mapping
# merges CDR 1.0 and 2.0, so only labels 0, 1 and 2 are produced - confirm intent.
cdr_to_multiclass = {0.0: 0, 0.5: 1, 1.0: 2, 2.0: 2}
y_trainn = y_train.replace(cdr_to_multiclass).astype('int')
y_testt = y_test.replace(cdr_to_multiclass).astype('int')

# Binary target: CDR 0.0 -> 0; CDR 0.5, 1.0 and 2.0 -> 1.
cdr_to_binary = {0.0: 0, 0.5: 1, 1.0: 1, 2.0: 1}
y_train2 = y_train.replace(cdr_to_binary).astype('int')
y_test2 = y_test.replace(cdr_to_binary).astype('int')

In [231]:
# Remove the columns that are not useful to the model from both splits.
for frame in (X_train, X_test):
    frame.drop(columns=['Subject ID', 'MRI ID', 'Hand'], inplace=True)

In [232]:
# # We can use this binary classification to predict the probability of 
# # a patient have dementia

# def proba(n):
#     if n == 0:
#         return 0
#     else:
#         return 1


# df['yy'] = df['CDR'].apply(lambda x: proba(x))

## Modeling

In [233]:
# Build a dataframe of features ranked by their importance in a fitted model
def feat_imp_sorter(model, x):
    """Return the model's feature importances sorted from highest to lowest.

    Parameters
    ----------
    model : object exposing ``feature_importances_``
        The importances are paired positionally with ``x.columns``.
    x : pandas.DataFrame
        Frame whose column names label the importances.

    Returns
    -------
    pandas.DataFrame
        Columns ``Value`` (importance) and ``Features`` (column name), ordered
        by descending ``Value`` with a fresh 0..n-1 index.

    Notes
    -----
    Calling this function installs a process-wide filter that ignores
    ``FutureWarning``; the filter stays active after the call returns.
    """
    warnings.simplefilter(action='ignore', category=FutureWarning)
    pairs = zip(model.feature_importances_, x.columns)
    ranking = pd.DataFrame(pairs, columns=['Value', 'Features'])
    return ranking.sort_values(by='Value', ascending=False, ignore_index=True)
    
# Function for plotting a barplot of the feature importances of any model
def feat_imp_ploter(model,x):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object exposing ``feature_importances_``
        Passed to ``feat_imp_sorter`` and used (via ``str(model)``) in the title.
    x : pandas.DataFrame
        Frame whose column names label the importances.

    Returns
    -------
    None
        The chart is drawn on a new 5x5 inch figure.
    """
    sorted_feat = feat_imp_sorter(model, x)

    # Explicit figure/axes so the plot does not depend on pyplot's "current axes" state.
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.barplot(x="Value", y="Features", data=sorted_feat, ax=ax)
    # Separate the model repr from the suffix; previously they ran together.
    ax.set_title(str(model) + ' feature importance')
    fig.tight_layout()


In [234]:
from sklearn.metrics import f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

### Base_Model: LogisticRegression

In [235]:
# Baseline: logistic regression (liblinear solver) fitted on the multi-class target,
# scored with the weighted F1 on the held-out split.
base_model = LogisticRegression(solver='liblinear')
base_model.fit(X_train, y_trainn)

base_f1 = f1_score(y_testt, base_model.predict(X_test), average='weighted')
print(base_f1)

In [236]:
print(classification_report(y_testt, base_model.predict(X_test)))

### GradientBoostingClassifier

Fitting the model and evaluating the model using *F1 score*. F1 score was chosen as the metrics because it takes into consideration *PRECISION* and *RECALL*

In [237]:
# Gradient boosting on the multi-class target; random_state pins the row subsampling.
gbt = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.5,
    max_depth=4,
    random_state=7,
)
gbt.fit(X_train, y_trainn)

gbt_f1 = f1_score(y_testt, gbt.predict(X_test), average='weighted')
print(gbt_f1)

An overall report of the performance of the model

In [238]:
print(classification_report(y_testt, gbt.predict(X_test)))

In [239]:
gbt.predict(X_test)

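From the result above, the Gradient Boosting classifier did well, with an F1 score of 93. It is also noticed that no class 3 was predicted by the model. This can be a result of the size of the data and the small representation of that class compared to the others. Note also that the target encoding above maps CDR 1.0 and 2.0 to the same label (2), so the encoded labels only take the values 0, 1 and 2.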

In [240]:
feat_imp_ploter(gbt, X_train)

According to the barplot of feature importance, *Nondemented* has the highest influence on the model's predictions, which is to be expected. However, this might also be considered data leakage.

### RandomForestClassifier

In [241]:
# Random forest on the multi-class target, scored with the weighted F1 on the held-out split.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=7,
)
rfc.fit(X_train, y_trainn)

rfc_f1 = f1_score(y_testt, rfc.predict(X_test), average='weighted')
print(rfc_f1)

An overall report of the performance of the model

In [242]:
print(classification_report(y_testt, rfc.predict(X_test)))

In [243]:
rfc.predict(X_test)

From the report above, the RandomForest classifier has an F1 score of 89, and it is also noticed that no class 3 was predicted by this model either.

In [244]:
# Plot the importances of the random forest fitted in this section (not the gradient-boosting model).
feat_imp_ploter(rfc, X_train)

According to the barplot of feature importance, *Nondemented* and *Demented* have the highest influence on the model's predictions. It can also be seen that the gender columns *M* and *F* barely have any influence on the model's decisions. Therefore, dropping them can be considered to see how the model performs.

### DecisionTreeClassifier

In [245]:
# Decision tree on the multi-class target.
# I tried different max_depth values: 10, 9, 7, 3
dtc = DecisionTreeClassifier(
    max_depth=3,
    random_state=7,
)
dtc.fit(X_train, y_trainn)

dtc_f1 = f1_score(y_testt, dtc.predict(X_test), average='weighted')
print(dtc_f1)

An overall report of the performance of the model

In [246]:
print(classification_report(y_testt, dtc.predict(X_test)))

In [247]:
dtc.predict(X_test)

From the report above, the DecisionTreeClassifier has an F1 score of 91 and no class 3 was predicted. Its result is almost the same as that of GradientBoosting.

In [248]:
feat_imp_ploter(dtc, X_train)

Wow, Here the barplot speaks for itself concerning the *Feature Importance* of the model

Other things we might consider:
   - normalising the data, although conventionally it has little or no effect on tree models
   - dropping some features, e.g. M and F
   - doing a little hyperparameter tuning
   - using different models such as AdaBoost, CatBoost or XGBoost, although I feel they might be too heavy for our small dataset
   - if time permits, trying some feature combinations

## Predicting the probability of dementia

We will use binary classification with `predict_proba`, going with the best-performing model (gradient boosting, `gbt`).

In [249]:
# Refit the gradient-boosting model on y_train2, the second (binary) target, and show
# the per-class probabilities for the test rows.
# Note: refitting replaces the multi-class fit previously held in `gbt`.
gbt.fit(X_train, y_train2).predict_proba(X_test)

Since our target is (0,1), then the classifier output a probability matrix of dimension (N,2). The first index refers to the probability that the data belong to class 0 (Normal), and the second refers to the probability that the data belong to class 1 (Dementia).

These two would sum to 1.

We can then output the result by:

In [250]:
gbt.predict_proba(X_test)[:,1] # probability that each patient has dementia (class 1)