# EDA of raw data
Notebook for exploring the raw data and highlighting its notable properties.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

def change_directory_to_repo():
    """Change the working directory to the root of the enclosing repository.

    The repository root is taken to be the nearest directory that contains a
    ``.git`` entry, searching the current working directory first and then
    each of its ancestors in turn. If no such directory exists, the working
    directory is left unchanged.

    Returns:
        None. The only effect is the call to ``os.chdir``.
    """
    current_dir = Path.cwd()
    # Check the current directory too: `Path.parents` does not include it, and
    # the kernel may already have been started from the repository root.
    for candidate in (current_dir, *current_dir.parents):
        # A `.git` entry marks the root, whether it is a directory or a file.
        if (candidate / ".git").exists():
            os.chdir(str(candidate))
            # Stop at the nearest match; continuing would walk on into any
            # outer repository that happens to contain this one.
            return

# Switch to the repository root so that the relative paths used in later cells
# (e.g. 'data/raw/...') resolve regardless of where the notebook was started.
change_directory_to_repo()

In [21]:
# Load both raw tables (gzip-compressed, tab-separated) with identical reader
# settings; the unpacking order matches the order of the paths.
smiles, activities = (
    pd.read_csv(raw_path, sep='\t', compression='gzip')
    for raw_path in ('data/raw/smiles.tsv.gz', 'data/raw/activities.tsv.gz')
)

Some molecules have several affinity values, probably because they come from different experiments. Since we have no prior reason to prefer one measurement over another, we will use the mean affinity value.

In [22]:
# Number of rows in `activities` for each molregno value
# (value_counts orders the result by count, largest first, by default).
activities['molregno'].value_counts()

2214      46
2261      37
3683      34
7714      21
34197     20
          ..
296630     1
296837     1
296817     1
296660     1
9988       1
Name: molregno, Length: 6524, dtype: int64

Let's see how big our dataset is.

In [23]:
# Distinct molregno ids found in each raw table
all_id = set(smiles["molregno"].tolist())
id_with_affinity = set(activities["molregno"].tolist())

# Ids that occur in both tables, i.e. in `smiles` and in `activities`
smiles_with_affinity = all_id & id_with_affinity
len(smiles_with_affinity)

6492

Let's do the first pipeline step: filter the values and binarize affinity into 1 and 0 (active and non-active molecules).

In [25]:
# NOTE(review): this binds the module to the name `filter`, which shadows the
# built-in `filter()` in every cell executed after this one. It is also a
# mid-notebook import; consider moving it to the import cell at the top.
from scripts.preprocessing import filter

# Minimal value to consider a ligand to be active
# (presumably on the same scale as the affinity values in `activities` -- TODO confirm)
threshold = 8.0  

# Keep molecules with null affinity value. Consider null molecules as not active, if true.
keep_null = True
# First pipeline step: passes both raw tables, the set of ids present in both
# tables, and the two settings above. Arguments are positional, so their order
# must match the signature in `scripts.preprocessing.filter` -- verify there.
filtered = filter.filter_dataset(smiles, activities, smiles_with_affinity, threshold, keep_null)