**PACKAGES**
> 
We will first import the packages necessary to develop and run our models.

In [39]:
# Basic data manipulation 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 

# Data processing packages
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

# Random Forest packages
from sklearn.ensemble import RandomForestClassifier

# Boosting packages
from sklearn.ensemble import GradientBoostingClassifier


**IMPORTING DATA**
> 
We can now begin to import our data from the CSV files.

In [40]:
DATA_PATH = "../input/icr-identify-age-related-conditions/"

# Load the labelled training set from the competition input directory
train_data = pd.read_csv(f"{DATA_PATH}train.csv")

# Separate the feature columns from the target column
X = train_data.drop(columns="Class")  # Data inputs (readings)
y = train_data["Class"]  # Data outputs (class labels)

**INITIAL OBSERVATIONS**

We can look at the data to check what its data types and format are.

In [41]:
# Preview the first rows: all data other than Id and EJ is numerical
print(X.head())

# Tally the missing entries in the features and in the labels up front
x_null_count = X.isnull().sum().sum()
y_null_count = y.isnull().sum()

# There are a few null values in X which we will have to check out
print(f"Null values in X: {x_null_count}")
# No null values in our labels, which is good
print(f"Null values in y: {y_null_count}")

             Id        AB          AF          AH         AM        AR  \
0  000ff2bfdfe9  0.209377  3109.03329   85.200147  22.394407  8.138688   
1  007255e47698  0.145282   978.76416   85.200147  36.968889  8.138688   
2  013f2bd269f5  0.470030  2635.10654   85.200147  32.360553  8.138688   
3  043ac50845d5  0.252107  3819.65177  120.201618  77.112203  8.138688   
4  044fb8a146ec  0.380297  3733.04844   85.200147  14.103738  8.138688   

         AX        AY         AZ          BC  ...         FI        FL  \
0  0.699861  0.025578   9.812214    5.555634  ...   3.583450  7.298162   
1  3.632190  0.025578  13.517790    1.229900  ...  10.358927  0.173229   
2  6.732840  0.025578  12.824570    1.229900  ...  11.626917  7.709560   
3  3.685344  0.025578  11.053708    1.229900  ...  14.852022  6.122162   
4  3.942255  0.054810   3.396778  102.151980  ...  13.666727  8.153058   

         FR        FS         GB          GE            GF         GH  \
0   1.73855  0.094822  11.339138   72

**PRE-PROCESSING DATA**
> 
We will first remove the Id column, as it is only a row identifier, is not related to the task, and will not help any of our predictions.

We will next impute the null values in each column with the mean of that column.


In [42]:
def process_data(data, fill_values=None):
    """Prepare a raw feature frame for modelling.

    Steps:
      1. Drop the "Id" column (a row identifier, not a feature) if present.
      2. Encode the categorical "EJ" column as numbers ('A' -> 1, 'B' -> 0).
         Any other value becomes NaN and is handled by the imputation step.
      3. Fill the remaining null values.

    The function is safe to call on an already-processed frame: a missing
    "Id" column is ignored and an already-numeric "EJ" column is left alone.
    The frame passed in is never modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame containing an "EJ" column and, optionally, "Id".
    fill_values : scalar, dict or pandas.Series, optional
        Values used to fill nulls, in any form accepted by
        ``DataFrame.fillna`` (e.g. the column means of the training set).
        When omitted, the numeric column means of ``data`` itself are used.

    Returns
    -------
    pandas.DataFrame
        The processed copy of ``data``.

    Raises
    ------
    KeyError
        If ``data`` has no "EJ" column.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched
    data = data.drop(columns="Id", errors="ignore")

    # EJ is categorical but only takes two values, A or B, so WLOG we set
    # A to 1 and B to 0. Skip the mapping when the column is already numeric,
    # otherwise re-processing a frame would turn every entry into NaN.
    if not pd.api.types.is_numeric_dtype(data["EJ"]):
        data["EJ"] = data["EJ"].map({'A': 1, 'B': 0})

    # Impute null values, by default with the mean of each numeric column
    if fill_values is None:
        fill_values = data.mean(numeric_only=True)
    data = data.fillna(fill_values)

    return data

# Replace the raw feature frame with its processed version. Note that X is
# rebound here, so re-running this cell passes the already-processed frame
# back into process_data rather than the raw data.
X = process_data(X)

**TRAIN TEST SPLIT**

We will split the data into a training set and a hold-out set for future validation.

In [43]:
# Split the data into a training set and a hold-out set.
# stratify=y keeps the class proportions the same in both parts, and the fixed
# random_state makes the split (and every score derived from it) reproducible.
X_train, X_ho, y_train, y_ho = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

**CREATE RANDOM FOREST MODEL**

We will create a simple random forest model to predict the data, then compare our predictions with the hold-out data set.

In [44]:
# Construct the model (fixed seed so the reported accuracy is reproducible)
model = RandomForestClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# predict_proba returns one column per entry of model.classes_, so translate
# the winning column index back into the actual class label before scoring
y_pred_probs = model.predict_proba(X_ho)
y_pred = model.classes_[np.argmax(y_pred_probs, axis=1)]

accuracy = accuracy_score(y_ho, y_pred)
print("Accuracy against Hold Out set:", accuracy)

Accuracy against Hold Out set: 0.9086021505376344


**CREATE GRADIENT BOOST MODEL**

We will create a simple boosting model

In [45]:
# Construct the model (fixed seed so the reported accuracy is reproducible)
model = GradientBoostingClassifier(n_estimators=300, random_state=42)
model.fit(X_train, y_train)

# predict_proba returns one column per entry of model.classes_, so translate
# the winning column index back into the actual class label before scoring
y_pred_probs = model.predict_proba(X_ho)
y_pred = model.classes_[np.argmax(y_pred_probs, axis=1)]

accuracy = accuracy_score(y_ho, y_pred)
print("Accuracy against Hold Out set:", accuracy)

Accuracy against Hold Out set: 0.8924731182795699


**K FOLD CROSS VALIDATION**

We will do the same as we just did, but apply further testing to see which model performs best overall, not just on the given split.

In [46]:
# Import k fold cross validation modules
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

k = 10  # Number of folds
# With shuffle=True and no random_state, every cross_val_score call would draw
# a different set of folds. A fixed seed makes each call reuse the same folds,
# so both models below are scored on identical splits.
kfold = KFold(n_splits=k, shuffle=True, random_state=42)  # Create KFold object

# Test gradient boosting model
model = GradientBoostingClassifier(n_estimators=300, random_state=42)
scores = cross_val_score(model, X, y, cv=kfold)
print("Accuracy with Boositing:", scores.mean())

# Test random forest model
model = RandomForestClassifier(n_estimators=300, random_state=42)
scores = cross_val_score(model, X, y, cv=kfold)
print("Accuracy with Random Forests:", scores.mean())

Accuracy with Boositing: 0.9352987837123216
Accuracy with Random Forests: 0.923823373876256


**REFINING RANDOM FORESTS**

We take random forests as our best model (although in the run shown above gradient boosting scored slightly higher in cross-validation), so we will refine the hyperparameters, namely the depth of the trees.


In [47]:
def best_rf_model(depths=range(8, 18), random_state=42):
    """Pick the random forest tree depth with the best cross-validated accuracy.

    Every candidate depth is scored with ``cross_val_score`` on the
    notebook-level ``X`` and ``y`` using the notebook-level ``kfold``
    splitter, so those three names must already be defined.

    Parameters
    ----------
    depths : iterable of int, optional
        Candidate values for ``max_depth``. Defaults to 8 through 17.
    random_state : int or None, optional
        Seed passed to every forest so repeated runs give the same scores.

    Returns
    -------
    RandomForestClassifier
        An *unfitted* classifier configured with the best depth found.
        The search uses 300 trees per candidate; the returned model uses 1000.
    """
    # Stays None if no candidate scores above 0; max_depth=None is a valid
    # setting (no depth limit), so the return below still works in that case
    best_depth = None
    max_score = 0
    for depth in depths:
        model = RandomForestClassifier(
            n_estimators=300, max_depth=depth, random_state=random_state
        )
        mean_score = cross_val_score(model, X, y, cv=kfold).mean()

        # If found new best
        if mean_score > max_score:
            max_score = mean_score
            best_depth = depth
            print(f"NEW Best Depth: {best_depth}, with Accuracy: {max_score}")

    print(f"FINAL Best Depth: {best_depth}, with Accuracy: {max_score}")
    return RandomForestClassifier(
        n_estimators=1000, max_depth=best_depth, random_state=random_state
    )

**PREDICT TEST DATA**

We will now predict the unknown test data with this model

In [48]:
# Find best model to use
model = best_rf_model()
model.fit(X, y)

# Import the data to predict
X_test = pd.read_csv(DATA_PATH + "test.csv")

# Keep the ids so the predictions can be checked against the submission rows
ids = X_test.loc[:, "Id"]
# Do data processing
# NOTE(review): as called here, process_data fills nulls using the means of the
# frame it is given, so the test set is imputed with its own column means
# rather than the training means - confirm this is intended.
X_test = process_data(X_test)
# Present the features in the same column order the model was fitted on
X_test = X_test[X.columns]

# predict results (one column per class, in the order of model.classes_)
y_test_probs = model.predict_proba(X_test)
print(y_test_probs)

submissions = pd.read_csv(DATA_PATH + "sample_submission.csv")
# The probabilities are written positionally, so make sure the submission rows
# really correspond to the test rows before overwriting them
if "Id" in submissions.columns and list(submissions["Id"]) != list(ids):
    raise ValueError("sample_submission.csv rows do not line up with test.csv rows")

# We will change the Sample Submission value
submissions[['class_0', 'class_1']] = y_test_probs

submissions.to_csv("submission.csv", index=False)

NEW Best Depth: 8, with Accuracy: 0.9142252776308831
NEW Best Depth: 9, with Accuracy: 0.9204918032786885
NEW Best Depth: 12, with Accuracy: 0.9286620835536754
FINAL Best Depth: 12, with Accuracy: 0.9286620835536754
[[0.523 0.477]
 [0.523 0.477]
 [0.523 0.477]
 [0.523 0.477]
 [0.523 0.477]]
