# Predicting a Biological Response

Predict a biological response of molecules from their chemical properties.

## Importing the data

In [92]:
import os
import pandas as pd
import numpy as np 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Read the training and test tables from the local data/ directory.
train, test = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in ('train.csv', 'test.csv')
)

def correlated_features(df, threshold):
    """Return a copy of ``df`` without columns that duplicate an earlier column.

    A column is dropped when its absolute correlation with any column to its
    left exceeds ``threshold``; the left-most column of each correlated group
    is kept. Every column of ``df`` takes part in the correlation matrix,
    including a target column if one is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise via ``df.corr()``.
    threshold : float
        Absolute correlation above which the later column is removed.

    Returns
    -------
    pandas.DataFrame
        New frame with the redundant columns removed; ``df`` is not modified.
    """
    correlation_matrix = df.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (self-correlation of 1.0) is ignored. The builtin ``bool``
    # is used because the ``np.bool`` alias no longer exists in current NumPy.
    triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper = correlation_matrix.where(triangle)
    # Masked-out cells are NaN, and NaN > threshold is False, so they never
    # trigger a drop.
    to_drop = [c for c in upper.columns if (upper[c] > threshold).any()]
    return df.drop(columns=to_drop)

def k_best(df, k, n):
    """Rank features by chi-squared score against the first column of ``df``.

    The first column is treated as the target and every remaining column as a
    feature. ``k`` is only forwarded to ``SelectKBest``; the returned list is
    determined by ``n``, the number of top-scoring names taken from the
    score table.

    NOTE(review): scikit-learn documents ``chi2`` as requiring non-negative
    feature values - confirm the input satisfies this.

    Parameters
    ----------
    df : pandas.DataFrame
        Target in column 0, features in the remaining columns.
    k : int
        ``k`` argument given to ``SelectKBest``.
    n : int
        Number of feature names to return.

    Returns
    -------
    list
        Names of the ``n`` features with the largest chi-squared scores,
        highest first.
    """
    target = df.iloc[:, 0]
    features = df.iloc[:, 1:]

    selector = SelectKBest(score_func=chi2, k=k).fit(features, target)

    # One row per feature: its name next to its chi-squared score.
    ranking = pd.concat(
        [pd.DataFrame(features.columns), pd.DataFrame(selector.scores_)],
        axis=1,
    )
    ranking.columns = ['Name', 'Score']

    return ranking.nlargest(n, 'Score')['Name'].tolist()

def best_features(df, threshold=0.95, n_features=10):
    """Select feature names: drop correlated columns, then rank by chi-squared.

    Parameters
    ----------
    df : pandas.DataFrame
        Target in column 0, features in the remaining columns.
    threshold : float, default 0.95
        Absolute correlation above which ``correlated_features`` removes the
        later column of a pair.
    n_features : int, default 10
        Number of feature names to return; passed to ``k_best`` as both its
        ``k`` and ``n`` arguments.

    Returns
    -------
    list
        Names of the selected features (a list of names, not a DataFrame).
    """
    decorrelated = correlated_features(df, threshold)
    return k_best(decorrelated, n_features, n_features)

## Selecting features


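We first drop features whose absolute correlation with an earlier column exceeds 0.95, then rank the remaining features with the chi-squared (chi2) test and keep the 10 highest-scoring ones.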

In [93]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, log_loss

def train_rfc():
    """Fit a random forest on the features chosen by ``best_features``.

    Reads the module-level ``train`` frame, selects feature names from it,
    sorts them, and fits a 100-tree ``RandomForestClassifier`` (fixed
    ``random_state=0``) to predict the ``'Activity'`` column from those
    features.

    Returns
    -------
    tuple
        ``(feature_names, model)``: the sorted list of selected feature names
        and the fitted classifier.
    """
    feature_names = sorted(best_features(train))

    # Target first, then the chosen features, so positional slicing below
    # separates them.
    subset = train[['Activity'] + feature_names]
    y = subset.iloc[:, 0]
    X = subset.iloc[:, 1:]

    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X, y)
    return feature_names, model
    
# Train once at cell level; the fitted model and its feature list are reused
# by the prediction cell that follows.
selected_features, rfc = train_rfc()

# Bare last expression so the notebook displays the selected feature names.
selected_features

['D1036',
 'D1061',
 'D1087',
 'D1169',
 'D1196',
 'D1281',
 'D1309',
 'D27',
 'D469',
 'D959']

In [94]:
import os
from pathlib import Path

# Score the test set using only the columns the forest was trained on.
X_filtered = test[selected_features]

# Column 1 of predict_proba is the probability of the second class in
# rfc.classes_ (presumably Activity == 1 - confirm against the training labels).
probabilities = rfc.predict_proba(X_filtered)[:, 1]

# Molecule ids are 1-based row positions. Derive the count from the
# predictions instead of a hard-coded length, so no row is silently dropped
# when the test set size differs.
indexes = list(range(1, len(probabilities) + 1))

df = pd.DataFrame({
    "MoleculeId": indexes,
    "PredictedProbability": probabilities,
})

# Write the submission file to the current working directory.
df.to_csv(Path().absolute() / "predicted.csv", index=False)

In [84]:
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# 
# X = train.iloc[:, 1:len(train.columns) + 1]
# y = train.iloc[:, 0]
# 
# rfc = GradientBoostingClassifier(n_estimators=20, random_state=0)
# rfc.fit(X, y)
# probabilities = [ x[1] for x in rfc.predict_proba(test)]
# 
# df = pd.DataFrame(
#     list(zip(indexes, probabilities)), 
#     columns=["MoleculeId", "PredictedProbability"]
# )
# 
# df

Unnamed: 0,MoleculeId,PredictedProbability
0,1,0.539991
1,2,0.823434
2,3,0.318652
3,4,0.798110
4,5,0.247078
...,...,...
2496,2497,0.722432
2497,2498,0.216729
2498,2499,0.789620
2499,2500,0.769816


In [91]:
# Write the current `df` to predicted.csv in the working directory.
# Relies on `df` and `Path` from an earlier cell having been run.
df.to_csv(Path().absolute() / "predicted.csv", index=False)
