**ACCOUNTING FOR META DATA**

We will now account for the second data file, which contains more information on which condition each patient has

**PACKAGES**
> 
We will first import the packages necessary to develop and run our models

In [297]:
# Basic data manipulation 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 

# Data processing packages
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Random Forest packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer

# Boosting packages
from sklearn.ensemble import GradientBoostingClassifier


**IMPORTING DATA**
> 
We can now begin to import our data from the csv files

In [298]:
# Directory holding the competition CSV files
DATA_PATH = "../input/icr-identify-age-related-conditions/"

# Load the readings and the extra per-patient file, then join the two
# tables on the shared "Id" column
init_data = pd.read_csv(DATA_PATH + "train.csv")
extra_data = pd.read_csv(DATA_PATH + "greeks.csv")
train_data = init_data.merge(extra_data, on='Id')

# Data Outputs: the binary class label plus the "Alpha" column
y = train_data.loc[:, ["Class", "Alpha"]]
# Data Inputs (Readings): everything except the outputs and "Epsilon"
X = train_data.drop(columns=["Class", "Alpha", "Epsilon"])

print(X.head())

             Id        AB          AF          AH         AM        AR  \
0  000ff2bfdfe9  0.209377  3109.03329   85.200147  22.394407  8.138688   
1  007255e47698  0.145282   978.76416   85.200147  36.968889  8.138688   
2  013f2bd269f5  0.470030  2635.10654   85.200147  32.360553  8.138688   
3  043ac50845d5  0.252107  3819.65177  120.201618  77.112203  8.138688   
4  044fb8a146ec  0.380297  3733.04844   85.200147  14.103738  8.138688   

         AX        AY         AZ          BC  ...        FS         GB  \
0  0.699861  0.025578   9.812214    5.555634  ...  0.094822  11.339138   
1  3.632190  0.025578  13.517790    1.229900  ...  0.568932   9.292698   
2  6.732840  0.025578  12.824570    1.229900  ...  1.198821  37.077772   
3  3.685344  0.025578  11.053708    1.229900  ...  0.284466  18.529584   
4  3.942255  0.054810   3.396778  102.151980  ...  0.121914  16.408728   

           GE            GF         GH         GI         GL  Beta  Gamma  \
0   72.611063   2003.810319  22.1

**ONE HOT ENCODING**

We need to convert the Alpha ("type of positive class") column to a one-hot encoding, since it is currently categorical

In [299]:
# Flip the binary label so that 1 marks the "negative" class and 0 the
# positive one; the flipped column then acts as the one-hot indicator of
# the negative class
y = y.assign(Class=y["Class"].map({0: 1, 1: 0}))

def one_hot_encode(col: str, data):
    """Replace a categorical column with one indicator column per category.

    Args:
        col: Name of the categorical column in ``data``.
        data: DataFrame containing ``col``.

    Returns:
        A new DataFrame without ``col`` and with the indicator columns,
        named ``<col>_<category>``, appended on the right.
    """
    # One indicator column for each category found in the column
    indicators = pd.get_dummies(data[col], prefix=col, prefix_sep='_')

    # Remove the original column, then append the indicators
    remaining = data.drop(columns=col)
    return pd.concat([remaining, indicators], axis=1)

# One hot encode the classes in the output. The result is cast to int so the
# indicator columns share a dtype with the integer "Class" column: depending
# on the pandas version get_dummies returns bool or uint8 columns, and a
# row-wise idxmax over mixed bool/int columns can fail.
y = one_hot_encode("Alpha", y).astype(int)
# Drop Alpha_A since it is the same as the flipped "Class" (negative) column
y = y.drop("Alpha_A", axis=1)
# Relabel the indicator columns with integer class ids
y = y.rename(columns={"Class": 0, "Alpha_B": 1, "Alpha_D": 2, "Alpha_G": 3})

# Collapse the indicators into a single label per row: the id of the column
# that holds the 1
y = y.idxmax(axis=1)


# The categorical columns Beta, Gamma and Delta are removed from X rather
# than one hot encoded
X = X.drop(["Beta", "Gamma", "Delta"], axis=1)

print(X.head())

             Id        AB          AF          AH         AM        AR  \
0  000ff2bfdfe9  0.209377  3109.03329   85.200147  22.394407  8.138688   
1  007255e47698  0.145282   978.76416   85.200147  36.968889  8.138688   
2  013f2bd269f5  0.470030  2635.10654   85.200147  32.360553  8.138688   
3  043ac50845d5  0.252107  3819.65177  120.201618  77.112203  8.138688   
4  044fb8a146ec  0.380297  3733.04844   85.200147  14.103738  8.138688   

         AX        AY         AZ          BC  ...         FI        FL  \
0  0.699861  0.025578   9.812214    5.555634  ...   3.583450  7.298162   
1  3.632190  0.025578  13.517790    1.229900  ...  10.358927  0.173229   
2  6.732840  0.025578  12.824570    1.229900  ...  11.626917  7.709560   
3  3.685344  0.025578  11.053708    1.229900  ...  14.852022  6.122162   
4  3.942255  0.054810   3.396778  102.151980  ...  13.666727  8.153058   

         FR        FS         GB          GE            GF         GH  \
0   1.73855  0.094822  11.339138   72

**INITIAL OBSERVATIONS**

We can look at the data to check its data types and format.

In [300]:
# Preview the first rows of the features and of the labels
print(X.head(), y.head(), sep="\n")
# All data other than Id and EJ is numerical

# Total number of missing entries in the features
print(f"Null values in X: {X.isna().sum().sum()}")
# There are a few null values which we will have to check out

# Total number of missing labels
print(f"Null values in y: {y.isna().sum().sum()}")
# No null values in our labels which is good

             Id        AB          AF          AH         AM        AR  \
0  000ff2bfdfe9  0.209377  3109.03329   85.200147  22.394407  8.138688   
1  007255e47698  0.145282   978.76416   85.200147  36.968889  8.138688   
2  013f2bd269f5  0.470030  2635.10654   85.200147  32.360553  8.138688   
3  043ac50845d5  0.252107  3819.65177  120.201618  77.112203  8.138688   
4  044fb8a146ec  0.380297  3733.04844   85.200147  14.103738  8.138688   

         AX        AY         AZ          BC  ...         FI        FL  \
0  0.699861  0.025578   9.812214    5.555634  ...   3.583450  7.298162   
1  3.632190  0.025578  13.517790    1.229900  ...  10.358927  0.173229   
2  6.732840  0.025578  12.824570    1.229900  ...  11.626917  7.709560   
3  3.685344  0.025578  11.053708    1.229900  ...  14.852022  6.122162   
4  3.942255  0.054810   3.396778  102.151980  ...  13.666727  8.153058   

         FR        FS         GB          GE            GF         GH  \
0   1.73855  0.094822  11.339138   72

**PRE-PROCESSING DATA**
> 
We will first remove the ID column as that will not help us in any of our predictions as it is not related to the task

We will next impute the null values with the mean of that column


In [301]:
def process_data(data, fill_values=None):
    """Prepare a raw feature frame for modelling.

    Drops the "Id" column, encodes the two-valued categorical "EJ" column
    as 1 ('A') / 0 ('B'), and imputes missing values.

    Args:
        data: DataFrame containing an "Id" column and an "EJ" column
            holding 'A' / 'B'.
        fill_values: Optional per-column fill values (for example a Series
            of training-set means indexed by column name) used to impute
            missing entries. When None (the default), each numeric column
            is filled with its own mean computed from ``data``.

    Returns:
        A new, processed DataFrame; ``data`` itself is not modified.
    """
    # Drop ID column: it is an identifier, not a reading
    data = data.drop("Id", axis=1)

    # EJ is categorical but only takes two values, A or B, so WLOG we set
    # A to 1 and B to 0
    data["EJ"] = data["EJ"].map({'A': 1, 'B': 0})

    # Default to the frame's own column means; passing fill_values lets a
    # caller impute unseen data with statistics taken from another frame
    if fill_values is None:
        fill_values = data.mean(numeric_only=True)

    return data.fillna(fill_values)

# Run the feature frame through the shared pre-processing step
X = X.pipe(process_data)


## Since the testing data doesn't contain the metadata:


**TRAIN TEST SPLIT**

We will split the data into a training set and a hold-out set for future validation

In [302]:
# Split the data into a training and hold out set. The split is stratified
# on the multi-class label so every class appears in both parts in
# proportion, and seeded so that reruns produce the same split.
X_train, X_ho, y_train, y_ho = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

**CREATE RANDOM FOREST MODEL**

We will create a simple random forest model to predict the data, then compare our predictions with the hold-out data set

In [303]:
# Construct the model; the fixed seed makes the fitted forest reproducible
model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    criterion="log_loss",
    random_state=42,
)
model.fit(X_train, y_train)

# predict_proba columns follow model.classes_; the two-class conversion
# further down reads column 0 as the probability of class 0, so check it
assert model.classes_[0] == 0, "expected class 0 in the first probability column"

# Per-class probabilities for the hold out rows
y_pred_probs = model.predict_proba(X_ho)

def conv_to_2_class(y_p_probs):
    """Collapse multi-class probability rows into two-class rows.

    Args:
        y_p_probs: Sequence of per-sample probability rows; only the first
            entry of each row is read.

    Returns:
        A list with one ``[p0, 1 - p0]`` pair per input row, where ``p0``
        is the first entry of that row.
    """
    return [[row[0], 1 - row[0]] for row in y_p_probs]
# Reduce the per-class probabilities to [first class, everything else]
y_final = conv_to_2_class(y_pred_probs)
# Hard prediction: position of the larger of the two probabilities
y_pred = np.asarray(y_final).argmax(axis=1)

print(y_final[:5])

# Fold hold out labels 2 and 3 into 1 so they line up with the two-class
# predictions
y_ho = y_ho.replace({2: 1, 3: 1})
print(y_ho)

accuracy = accuracy_score(y_ho, y_pred)
print("Accuracy against Hold Out set:", accuracy)

[[0.94, 0.06000000000000005], [0.76, 0.24], [0.91, 0.08999999999999997], [0.98, 0.020000000000000018], [0.93, 0.06999999999999995]]
375    0
575    0
313    1
187    0
429    0
      ..
436    1
314    0
98     0
482    0
382    0
Length: 186, dtype: int64
Accuracy against Hold Out set: 0.9139784946236559


**K FOLD CROSS VALIDATION**

We will do the same as we just did, but apply further testing to see which model will perform best overall, not just on the given split

In [304]:
# Import k fold cross validation modules
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

k = 10  # Number of folds
# Stratified folds keep the class proportions of y in every split, and the
# fixed seed makes the fold assignment reproducible
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Evaluate the random forest model
model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    criterion="log_loss",
    random_state=42,
)

# Define the log loss scorer. greater_is_better=False makes the scorer return
# the negated log loss, so scores closer to 0 are better.
# NOTE(review): newer scikit-learn releases replace needs_proba with
# response_method="predict_proba" - confirm against the installed version
log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2, 3])
scores = cross_val_score(model, X, y, cv=kfold, scoring=log_loss_scorer)
print("Mean negative log loss with Random Forests:", scores.mean())

Accuracy with Random Forests: -0.31123111912494483


**PREDICT TEST DATA**

We will now predict the unknown test data with this model

In [305]:
# Construct the model and fit it on all labelled data; the fixed seed makes
# the fitted forest (and therefore the submission) reproducible
model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    criterion="log_loss",
    random_state=42,
)
model.fit(X, y)

# Import the data to predict
X_test = pd.read_csv(DATA_PATH + "test.csv")

ids = X_test.loc[:, "Id"]
# Impute missing test readings with the training column means, so test rows
# are filled with the same statistics the model was fitted on. EJ is left
# out because it is still a raw 'A'/'B' string at this point.
train_means = X.mean(numeric_only=True).drop("EJ", errors="ignore")
X_test = X_test.fillna(train_means)
# Do data processing
X_test = process_data(X_test)
# Present the features in the same column order that was used for fitting
X_test = X_test[X.columns]

print(X_test.head())

# predict results
y_test_init = model.predict_proba(X_test)
y_test_probs = conv_to_2_class(y_test_init)
# Show only the first few rows instead of dumping every prediction
print(y_test_probs[:5])

submissions = pd.read_csv(DATA_PATH + "sample_submission.csv")
# We will change the Sample Submission value
submissions[['class_0', 'class_1']] = y_test_probs

submissions.to_csv("submission.csv", index=False)

    AB   AF   AH   AM   AR   AX   AY   AZ   BC  BD   ...   FI   FL   FR   FS  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    GB   GE   GF   GH   GI   GL  
0  0.0  0.0  0.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  0.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 56 columns]
[[0.62, 0.38], [0.62, 0.38], [0.62, 0.38], [0.62, 0.38], [0.62, 0.38]]
