### Prospeção de Dados 2023/2024 - Course Project
#### Professor: André Falcão
#### Alunos: 

Catherine Prokhorov (62608) - XX Hours <br>
Guilherme Cepeda (62931) - XX Hours <br>
Jorge Aleluia (54549) - XX Hours <br>
Rómulo Nogueira (56935) - XX Hours <br>

#### Library

In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [5]:
train_columns = ['mol_id', 'chembl_id', 'activity']
test_columns = ['mol_id', 'chembl_id', 'activity']

## import the data
activity_train = pd.read_csv("activity_train.csv", names = train_columns, header = None)
activity_test_blanked = pd.read_csv("activity_test_blanked.csv", names = test_columns, header = None)

with open("mol_bits.pkl", "rb") as f:
    mol_bits = pickle.load(f)

In [9]:
print('Data Exploration')
print(activity_train.head())
print(activity_train.describe())

print('Check for missing values')
print(activity_train.isnull().sum())

Data Exploration
   mol_id       chembl_id  activity
0  O14842   CHEMBL2022243         4
1  O14842   CHEMBL2022244         6
2  O14842   CHEMBL2022245         2
3  O14842   CHEMBL2022246         1
4  O14842   CHEMBL2022247         4
            activity
count  135711.000000
mean        4.708793
std         2.869907
min         1.000000
25%         2.000000
50%         5.000000
75%         7.000000
max        10.000000
Check for missing values
mol_id       0
chembl_id    0
activity     0
dtype: int64


In [10]:
print('Data Exploration')
print(activity_test_blanked.head())
print(activity_test_blanked.describe())

print('Check for missing values')
print(activity_test_blanked.isnull().sum())

Data Exploration
   mol_id       chembl_id  activity
0  O14842   CHEMBL2022258         0
1  O14842   CHEMBL2047161         0
2  O14842   CHEMBL2047163         0
3  O14842   CHEMBL2047168         0
4  O14842   CHEMBL2047169         0
       activity
count    4628.0
mean        0.0
std         0.0
min         0.0
25%         0.0
50%         0.0
75%         0.0
max         0.0
Check for missing values
mol_id       0
chembl_id    0
activity     0
dtype: int64


In [None]:
mol_bits

In [4]:
X_train_activity = activity_train.drop(columns=['activity'])
y_train_activity = activity_train['activity']

# print(X_train.head())
# print(y_train.head())

In [5]:
# Converter fingerprints para uma representação binária
vectorizer = DictVectorizer(sparse = False)

# dados ausentes
def map_mol_bits(mol_id):
    if mol_id in mol_bits:
        return mol_bits[mol_id]
    else:
        return []

X_train_bits = [dict(zip(bits, [1]*len(bits))) for bits in activity_train['mol_id'].map(map_mol_bits)]
X_test_bits = [dict(zip(bits, [1]*len(bits))) for bits in activity_test_blanked['mol_id'].map(map_mol_bits)]

# transformar em matriz binária 
X_train_matrix = vectorizer.fit_transform(X_train_bits)
X_test_matrix = vectorizer.fit_transform(X_test_bits)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_train_matrix, y_train_activity, test_size=0.2, random_state=42)

In [11]:
# Convert mol_bits to DataFrame
mol_bits_df = pd.DataFrame.from_dict(mol_bits, orient='index').reset_index()
mol_bits_df.columns = ['chembl_id'] + [f'bit_{i}' for i in range(mol_bits_df.shape[1] - 1)]

# Merge with activity data
train_data = activity_train.merge(mol_bits_df, on='chembl_id', how='left')
test_data = activity_test_blanked.merge(mol_bits_df, on='chembl_id', how='left')

# Drop unnecessary columns
X_train = train_data.drop(columns=['mol_id', 'chembl_id', 'activity'])
y_train = train_data['activity']
X_test = test_data.drop(columns=['mol_id', 'chembl_id', 'activity'])


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Split the training data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

# Evaluate the model
y_pred_val = model.predict(X_val_split)
mse = mean_squared_error(y_val_split, y_pred_val)
print(f'Mean Squared Error on Validation Set: {mse}')

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Predict on test data
y_pred_test = model.predict(X_test)
test_data['activity'] = y_pred_test

# Save predictions to file
test_data[['mol_id', 'chembl_id', 'activity']].to_csv('preds_xx.txt', index=False, header=False)
