# 0. Libraries

In [1]:
import re
import pandas as pd
import numpy as np
import random
import seaborn as sns
import os
from scipy.stats import zscore
from scipy.stats import norm

import time

%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.axes_grid1 import make_axes_locatable
from ipywidgets import interact


from itertools import product

from sklearn.manifold import TSNE

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries
# used in the cells below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
def save_list_to_txt(lst, file_path):
    """Write the elements of ``lst`` to a text file, one element per line.

    Parameters
    ----------
    lst : iterable
        Items to write; each one is converted with ``str``.
    file_path : str
        Destination file. It is opened in write mode, so any previous
        content is replaced.
    """
    with open(file_path, 'w') as handle:
        # One line per element, each terminated by a newline
        handle.writelines(str(entry) + '\n' for entry in lst)

# 1. Load and verify data

In [4]:
def f_load(path,parameter):
    """Load the CSV file of one parameter.

    Parameters
    ----------
    path : str
        Location prefix of the file; it is concatenated as-is with the
        file name, so it must already end with a separator when needed.
    parameter : str
        Base name of the file (without the ``.csv`` extension).

    Returns
    -------
    pandas.DataFrame
        The file content, with the ``Operateur_Resultat`` column cast to float.
    """
    csv_location = path + parameter + ".csv"
    loaded = pd.read_csv(csv_location)

    # The cast fails loudly if a result is not a number (e.g. special characters)
    return loaded.astype({"Operateur_Resultat": float})

# 5. Generate vector

In [5]:
def f_no_dup(df_base,df_produit,duplicated_sys,label):
    """Lay out the results of one product on the column layout of ``df_base``.

    Parameters
    ----------
    df_base : pandas.DataFrame
        Frame whose columns define the layout of the vector.
        NOTE(review): only the first row of the concatenation is returned,
        so the product's row is returned only when ``df_base`` has no rows.
    df_produit : pandas.DataFrame
        Rows of a single product; must contain ``Operateur_Resultat`` and
        the ``label`` column.
    duplicated_sys : any
        Not used by this function; kept for the existing call signature.
    label : str
        Column of ``df_produit`` whose values become the column names.

    Returns
    -------
    tuple
        ``(vector, large, ones)``: the first row of the concatenated frame as
        a NumPy array, the number of results of the product, and how many of
        those results are equal to 1.
    """
    results_row = np.asarray(df_produit['Operateur_Resultat']).reshape(1, -1)
    labelled_row = pd.DataFrame(data=results_row, columns=df_produit[label])

    n_tested = len(labelled_row.columns)
    n_ones = (results_row == 1).sum()

    # Concatenating onto the base aligns the results on the base columns
    aligned = pd.concat([df_base, labelled_row])
    return aligned.to_numpy()[0], n_tested, n_ones

In [6]:
def f_vector(df_group,unique_sys,label):
    """Build one result vector per product.

    Every product found in ``df_group['Produit']`` yields one output row.
    Vectors are produced by ``f_no_dup`` against an empty frame whose columns
    are ``unique_sys``, so all vectors share the same layout.

    Parameters
    ----------
    df_group : pandas.DataFrame
        Test results; must contain ``Produit``, ``Operateur_Resultat`` and
        the ``label`` column.
    unique_sys : list
        Column layout (one entry per system) shared by every vector.
    label : str
        Column identifying the system, e.g. ``'Systeme_Etudie'``.

    Returns
    -------
    tuple
        ``(df_vec, dic_dup_2)``.
        ``df_vec`` has the columns ``Product``, ``Dup``, ``Vector``,
        ``Nb_Test`` and ``Toxic``, one row per product, with a 0..n-1 index.
        ``Dup`` is 1 when the product has repeated ``label`` values; in that
        case ``Vector`` is an empty list and both counters are 0.
        ``dic_dup_2`` maps ``str(product)`` to the list of repeated ``label``
        values of that product (empty dict when there are no duplicates).
    """
    output_cols=['Product','Dup','Vector','Nb_Test','Toxic']
    # Empty template: only its columns matter, they fix the vector layout
    df_base = pd.DataFrame(columns=unique_sys)
    rows = []
    dic_dup_2 = {}

    for product in df_group['Produit'].unique():
        df_produit = df_group[df_group['Produit']==product]
        # Labels that appear more than once for this product
        duplicated_sys = df_produit[label][df_produit[label].duplicated(keep=False)].unique()

        if len(duplicated_sys)==0: # Case 1, no duplicated values in the label column
            vec,large,ones = f_no_dup(df_base,df_produit,duplicated_sys,label)
            rows.append([product,0,vec,large,ones])
        else: # Case 2, duplicated values in the label column -> not expected
            rows.append([product,1,[],0,0])
            # Keep track of which systems are duplicated so they can be inspected
            dic_dup_2[str(product)] = list(duplicated_sys)

    # Build the frame once instead of concatenating inside the loop.
    # dtype=object keeps every column as object, as the row-by-row
    # construction did, and lets each Vector cell hold a whole array.
    df_vec = pd.DataFrame(rows, columns=output_cols, dtype=object)
    return df_vec,dic_dup_2

In [7]:
def fv_large(vector):
    """Show a 200-bin histogram of the number of studied systems per product.

    Parameters
    ----------
    vector : array-like
        Values to bin (one value per product).
    """
    fig, ax = plt.subplots(figsize=(16, 5))
    ax.hist(vector, bins=200, edgecolor='black')
    ax.set(
        title='Histogram of Number of Studied Systems by Product',
        xlabel='Number of Studied Systems Tested',
        ylabel='Frequency',
    )
    fig.tight_layout()
    plt.show()

In [8]:
def fv_ones(vector):
    """Show a 200-bin histogram of the number of toxic tests per product.

    Parameters
    ----------
    vector : array-like
        Values to bin (one value per product).
    """
    fig, ax = plt.subplots(figsize=(16, 5))
    ax.hist(vector, bins=200, edgecolor='black')
    ax.set(
        title='Histogram of Toxic Tests per Product',
        xlabel='Number of Toxic Tests',
        ylabel='Frequency',
    )
    fig.tight_layout()
    plt.show()

In [9]:
# Get all systems 
def f_get_unique_ids(label,clean):
    """Save the sorted unique values of one column to a text file.

    Parameters
    ----------
    label : str
        Column of ``clean`` to read; it is also used to build the file name.
    clean : pandas.DataFrame
        Data containing the ``label`` column.

    Returns
    -------
    str
        Name of the written file, ``'unique' + label + '.txt'``, created
        relative to the current working directory.
    """
    # Use the column requested by the caller instead of a hard-coded one
    unique_sys = sorted(clean[label].unique())
    file_name = 'unique' + label + '.txt'
    save_list_to_txt(unique_sys, file_name)
    return file_name

In [10]:
# Main

# 1. Load data
path=""
parameter='fixed_optimized'
label= 'Systeme_Etudie' #'Systeme_Etudie' is better as we don't have 'Reference_Societe' for all the tests, BE CONSISTENT WITH CODE 02
df_ready=f_load(path,parameter)

file_name=f_get_unique_ids(label,df_ready)
# Read the list back through a context manager so the file handle is closed.
# After this round trip through the text file every entry is a string.
with open(file_name, "r") as systems_file:
    unique_sys=systems_file.read().splitlines()

# 2. Create a new dataframe that will contain one column with a vector of all the systems tested on each products
print('Start')
df_vec,dic_dup_2=f_vector(df_ready,unique_sys,label)

# 6. Save & Read
name ='vec_'+parameter+'.pkl'
df_vec.to_pickle(name)


# Verify that we keep a vector: reload the pickle and show the first vector
df_vec2=pd.read_pickle(name)
df_vec2['Vector'][0]


Start


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [12]:
# Show the name of the text file returned by f_get_unique_ids
file_name

'uniqueSysteme_Etudie.txt'

In [11]:
# Display the per-product table returned by f_vector
df_vec

Unnamed: 0,Product,Dup,Vector,Nb_Test,Toxic
0,AGI-0074986_1E-05M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,4
1,AGI-0074986_1E-07M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
2,AGI-0075104_1E-05M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
3,AGI-0075104_1E-07M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
4,AGI-0075107_1E-05M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",54,1
...,...,...,...,...,...
299,WEHI-1655027_1E-07M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
300,WEHI-1655863_1E-05M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
301,WEHI-1655863_1E-07M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0
302,WEHI-1655870_1E-05M,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",54,0


### About this part

1. We generated a vector for each product and dose using all the available tests. All vectors were generated on the same base, so each position in a vector always corresponds to the same "Systeme_Etudie".
