In [1]:
import os
import numpy as np
import pandas as pd
import concurrent.futures
from scripts.sequence_stuff import *
from scripts.plots import *
from scripts.graph_stuff import *
from matplotlib import pyplot as plt
from scripts.collector import *


In [2]:
# Path of the reads CSV this notebook works on; the other file names used
# below are derived from it via str.replace on the '.csv' suffix.
specific_file = 'data/long_reads/GRIA-CNS-RESUB.C0x1291.aligned.sorted.MinRQ998.reads.degenerate.csv'
# Collector checkpoint path: same name with '.csv' swapped for
# '_part_1_collector.pkl' (presumably written by an earlier "part 1" step — confirm).
saved_collector = specific_file.replace('.csv', '_part_1_collector.pkl')
# Create an empty collector and restore its state from the checkpoint.
# NOTE(review): the '.pkl' suffix suggests a pickle file; if so, only load
# checkpoints from a trusted source — confirm against SequenceCollector.load.
collector = SequenceCollector()
collector.load(saved_collector)
# Print the collector's current counts (see the cell output).
collector.print_sizes()

Picked sequences: 2126
Working sequences: 31233
Removed sequences: 41.02%
Removed X: 43.97%


In [3]:
# Ask the helper for the "only possible" assignments among the current working
# sequences. up_to_k=7 is passed through to the helper; what k bounds is
# defined there — TODO confirm and document the choice of 7.
only_possible_assignment = get_only_possible_assignments(collector.get_working_sequences(), up_to_k=7)
# Register the result with the collector as picked sequences.
collector.collect_picked_sequences(only_possible_assignment)
# Refresh the collector after the change, then print the new counts.
collector.update()
collector.print_sizes()

Picked sequences: 2245
Working sequences: 30648
Removed sequences: 42.12%
Removed X: 44.49%


In [4]:
# Build the graph over the collector's current working sequences and split it
# into connected components (each component is a collection of graph nodes).
# NOTE(review): `nx` is never imported explicitly in the visible cells; it is
# presumably re-exported by one of the star-imports — confirm.
working_sequences = collector.get_working_sequences()
graph = build_graph(working_sequences)
connected_components = [*nx.connected_components(graph)]

# Report the component count first, then one line per component with its size.
print(f"Number of connected components: {len(connected_components)}")
for i, component in enumerate(connected_components):
    print(f"Component {i} has {len(component)} sequences")

Adding edges for sequence 0/30647
Adding edges for sequence 100/30647
Adding edges for sequence 200/30647
Adding edges for sequence 300/30647
Adding edges for sequence 400/30647
Adding edges for sequence 500/30647
Adding edges for sequence 600/30647
Adding edges for sequence 700/30647
Adding edges for sequence 800/30647
Adding edges for sequence 900/30647
Adding edges for sequence 1000/30647
Adding edges for sequence 1100/30647
Adding edges for sequence 1200/30647
Adding edges for sequence 1300/30647
Adding edges for sequence 1400/30647
Adding edges for sequence 1500/30647
Adding edges for sequence 1600/30647
Adding edges for sequence 1700/30647
Adding edges for sequence 1800/30647
Adding edges for sequence 1900/30647
Adding edges for sequence 2000/30647
Adding edges for sequence 2100/30647
Adding edges for sequence 2200/30647
Adding edges for sequence 2300/30647
Adding edges for sequence 2400/30647
Adding edges for sequence 2500/30647
Adding edges for sequence 2600/30647
Adding edges 

In [5]:
# Pick the connected component with the most nodes. On ties max() returns the
# first such component in the list; it raises ValueError if the list is empty.
biggest_component = max(connected_components, key=len)
print(f"Biggest component has {len(biggest_component)} sequences")

# Hand only that component's nodes to the collector as its working sequences.
collector.set_working_sequences(biggest_component)


Biggest component has 30364 sequences


In [6]:
# Collect the nodes of the biggest component whose neighbor iterator yields
# exactly one entry, in the component's iteration order.
sequences_with_1_neighbor = []
for candidate in biggest_component:
    neighbor_total = sum(1 for _ in graph.neighbors(candidate))
    if neighbor_total == 1:
        sequences_with_1_neighbor.append(candidate)
print(f"Number of sequences in the biggest component with 1 neighbor: {len(sequences_with_1_neighbor)}")

Number of sequences in the biggest component with 1 neighbor: 1864


In [7]:
# A "lonely" sequence is a node of `graph` with exactly one neighbor.
#
# For every lonely sequence this cell:
#   * replaces it with merge_sequences(lonely, neighbor), and
#   * drops that single neighbor (the "hub") from the working set.
#
# Edge case (previously a TODO): in a connected component of exactly 2 nodes
# both nodes are lonely and each is the other's hub. Unguarded, both end up in
# `sequences_to_remove` and the pair disappears from the result entirely. The
# guard at the top of the loop keeps exactly one merged sequence for such a pair.

lonely_sequences_mapping = {}  # lonely sequence -> its merged replacement
sequences_to_remove = []       # hubs that were merged into a lonely neighbor

print("Finding lonely sequences")

for node in biggest_component:
    if node in lonely_sequences_mapping:
        # `node` has already been merged as the lonely partner of a hub seen
        # earlier. That is only possible in a 2-node component, so do not let
        # it remove its partner in turn.
        continue

    remove_seq = False
    for neighbor in graph.neighbors(node):
        # len(graph.adj[n]) is the number of distinct neighbors of n, without
        # building a throw-away list.
        if len(graph.adj[neighbor]) == 1:
            lonely_sequences_mapping[neighbor] = merge_sequences(neighbor, node)
            remove_seq = True

    if remove_seq:
        sequences_to_remove.append(node)

all_seqs = biggest_component

print(f"Number of lonely sequences: {len(lonely_sequences_mapping)}")
print(f"Number of sequences to remove: {len(sequences_to_remove)}")

# Set lookup instead of scanning the list for every sequence (the list scan is
# quadratic in the number of sequences).
removed_lookup = set(sequences_to_remove)

# Keep every sequence that was not removed; lonely ones are swapped for their
# merged replacement, all others are kept unchanged.
new_all_seqs = [
    lonely_sequences_mapping.get(seq, seq)
    for seq in all_seqs
    if seq not in removed_lookup
]

collector.set_working_sequences(new_all_seqs)
collector.update()
collector.print_sizes()


Picked sequences: 2293
Working sequences: 29119
Removed sequences: 45.01%
Removed X: 49.11%


In [8]:
# Let the collector perform its random assignment for isolated sequences, then
# print the updated counts.
# NOTE(review): no random seed is set in the visible cells, so the counts below
# may differ between runs — confirm whether the collector seeds internally.
collector.random_assignment_for_isolated()
collector.print_sizes()

Picked sequences: 4388
Working sequences: 27024
Removed sequences: 48.97%
Removed X: 51.72%


In [9]:
# Export the current working sequences as a one-column CSV ('Sequence'),
# without the DataFrame index.
# NOTE(review): the replacement string has no separator, so the output name
# ends in '...degeneratebiggest_cc.csv'; confirm this is the name downstream
# code expects before changing it.
df = pd.DataFrame(collector.get_working_sequences(), columns=['Sequence'])
df.to_csv(specific_file.replace('.csv', 'biggest_cc.csv'), index=False)

# Rebuild the graph from the working sequences and write it as GraphML to
# "graph.graphml" (relative to the current working directory; overwritten on
# every run).
graph = build_graph(collector.get_working_sequences())
nx.write_graphml(graph, "graph.graphml")


Adding edges for sequence 0/27023
Adding edges for sequence 100/27023
Adding edges for sequence 200/27023
Adding edges for sequence 300/27023
Adding edges for sequence 400/27023
Adding edges for sequence 500/27023
Adding edges for sequence 600/27023
Adding edges for sequence 700/27023
Adding edges for sequence 800/27023
Adding edges for sequence 900/27023
Adding edges for sequence 1000/27023
Adding edges for sequence 1100/27023
Adding edges for sequence 1200/27023
Adding edges for sequence 1300/27023
Adding edges for sequence 1400/27023
Adding edges for sequence 1500/27023
Adding edges for sequence 1600/27023
Adding edges for sequence 1700/27023
Adding edges for sequence 1800/27023
Adding edges for sequence 1900/27023
Adding edges for sequence 2000/27023
Adding edges for sequence 2100/27023
Adding edges for sequence 2200/27023
Adding edges for sequence 2300/27023
Adding edges for sequence 2400/27023
Adding edges for sequence 2500/27023
Adding edges for sequence 2600/27023
Adding edges 

In [10]:
# Reload the graph from "graph.graphml" (the file written by the export cell),
# replacing the in-memory `graph`.
# NOTE(review): a GraphML round-trip may not preserve node/attribute types
# (read_graphml reads node ids as str by default) — confirm this is acceptable.
graph = nx.read_graphml("graph.graphml")
