In [None]:
import pandas as pd
import numpy as np
import re
import os

### Data Wrangling

In [2]:
# Column labels assigned to the headerless, tab-separated input file
column_names = ['V1', 'V2', 'V3', 'V4', 'V5']  # adjust if the file layout differs
df = pd.read_csv(
    '/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_DNA.txt',
    sep='\t',
    header=None,
    names=column_names,
)

# Show the loaded table
print(df)

                 V1         V2          V3         V4        V5
0         chr1(mat)  169520000   chr1(mat)  135520000  0.630130
1         chr1(mat)  169520000   chr1(mat)  135540000  0.730330
2         chr1(mat)  169520000   chr1(mat)  170180000  0.879750
3         chr1(mat)  169520000   chr1(mat)  170200000  0.961733
4         chr1(mat)  169520000  chr11(mat)   57780000  0.789122
...             ...        ...         ...        ...       ...
3638154  chr19(pat)    8900000  chr17(mat)   11920000       NaN
3638155  chr19(pat)    8900000  chr19(pat)    9480000  0.266266
3638156  chr19(pat)    8900000  chr19(pat)    9500000  0.266266
3638157  chr19(pat)    8900000   chr8(mat)   17220000       NaN
3638158  chr19(pat)    8900000   chr8(mat)   17240000       NaN

[3638159 rows x 5 columns]


In [None]:
# Axis labels for the matrices: chromosomes 1-19 with an "m" suffix first,
# followed by chromosomes 1-19 with a "p" suffix (38 labels in total).
chromosomes = [
    f"{number}{suffix}"
    for suffix in ("m", "p")
    for number in range(1, 20)
]

# Convert labels such as "chr7(mat)" / "chr7(pat)" into the short form "7m" / "7p"
def map_chr(chr):
    """Return the short label for a ``chr<digits>(mat|pat)`` name.

    The pattern is only anchored at the start of the string. Any input that
    does not begin with that pattern is returned unchanged.
    """
    parsed = re.match(r'chr(\d+)\((mat|pat)\)', chr)
    if parsed is None:
        # Not in the expected format: hand the value back untouched.
        return chr
    number, parent = parsed.groups()
    suffix = 'm' if parent == 'mat' else 'p'
    return number + suffix

# Rewrite the chromosome labels in both chromosome columns of the dataframe
for label_column in ('V1', 'V3'):
    df[label_column] = df[label_column].apply(map_chr)

### Feature Engineering
This section creates the repeat matrices, which are later converted into vectors and added as features to the fourth feature matrix.

In [None]:
# Build the chromosome-by-chromosome count matrix from the V1/V3 label columns.
#
# Counting is vectorised instead of looping over every dataframe row with
# df.iloc[i], which is very slow on millions of rows. The resulting cell
# values are the same as with the row-by-row updates.

# Chromosome label -> position on the matrix axes
axis_position = {name: pos for pos, name in enumerate(chromosomes)}
n_chromosomes = len(chromosomes)

# Labels that are not in `chromosomes` (for example "20m" / "20p") map to NaN
# and are dropped here, so no separate chr20 check is needed.
row_pos = df['V1'].map(axis_position)
col_pos = df['V3'].map(axis_position)
on_axes = row_pos.notna() & col_pos.notna()

# Encode each (row, column) pair as a single flat cell index and count all
# occurrences in one pass.
flat_cells = (
    row_pos[on_axes].astype(np.int64) * n_chromosomes
    + col_pos[on_axes].astype(np.int64)
).to_numpy()
directed_counts = np.bincount(
    flat_cells, minlength=n_chromosomes ** 2
).reshape(n_chromosomes, n_chromosomes)

# Each row contributes one count to (V1, V3) and one to (V3, V1). Rows with
# V1 == V3 therefore add two to the diagonal cell.
interaction_matrix = pd.DataFrame(
    (directed_counts + directed_counts.T).astype(np.int64),
    index=chromosomes,
    columns=chromosomes,
)

# Check if the matrix is symmetric
if interaction_matrix.equals(interaction_matrix.T):
    print("The matrix is symmetric.")
else:
    print("The matrix is not symmetric.")

    # Find discrepancies: positions where a cell differs from its mirror cell
    discrepancies = np.where(interaction_matrix != interaction_matrix.T)
    discrepancy_indices = list(zip(discrepancies[0], discrepancies[1]))
    print(discrepancy_indices)

    for index in discrepancy_indices:
        row_idx, col_idx = index
        print(f"Discrepancy at ({interaction_matrix.index[row_idx]}, {interaction_matrix.columns[col_idx]}): {interaction_matrix.iat[row_idx, col_idx]} vs {interaction_matrix.iat[col_idx, row_idx]}")

The matrix is symmetric.


In [9]:
# Show the populated matrix as this cell's output.
interaction_matrix

Unnamed: 0,1m,2m,3m,4m,5m,6m,7m,8m,9m,10m,...,10p,11p,12p,13p,14p,15p,16p,17p,18p,19p
1m,192346,2981,1811,2253,2233,2252,2772,2076,2755,2429,...,2193,2580,2000,1977,1636,1653,1692,1210,1521,1471
2m,2981,193460,2246,2358,3076,2513,2902,1753,1834,2338,...,2415,3602,1882,2056,1819,1323,1507,1382,1776,1144
3m,1811,2246,131700,1494,1833,1589,1640,1441,1908,1425,...,1846,2166,1543,1463,1458,1286,960,1244,1306,777
4m,2253,2358,1494,143152,1956,2034,2179,1678,2141,1984,...,2214,1881,2209,1852,1219,1562,1421,1050,1416,1208
5m,2233,3076,1833,1956,155792,2272,2490,1672,1852,1966,...,2193,2394,1896,2034,1879,1459,1611,1372,1808,1038
6m,2252,2513,1589,2034,2272,176890,2335,2334,2434,2041,...,2492,2636,1979,1753,1335,1375,1696,1314,1869,1341
7m,2772,2902,1640,2179,2490,2335,154382,1516,2083,1974,...,1912,2443,1921,2125,1095,1319,1735,1201,1412,1035
8m,2076,1753,1441,1678,1672,2334,1516,134300,2007,1631,...,992,1901,1949,1094,1469,1377,1309,747,1316,1429
9m,2755,1834,1908,2141,1852,2434,2083,2007,150610,2006,...,1332,1483,1719,1445,1402,1530,1652,1207,1458,1337
10m,2429,2338,1425,1984,1966,2041,1974,1631,2006,142786,...,1557,2392,1689,1244,1287,1597,1013,1230,1634,1189


In [None]:
def process_chromosome_interactions(input_file, output_dir):
    """Build a symmetric chromosome-pair count matrix from a file and save it as CSV.

    The input is read as a headerless, tab-separated table with five columns
    (named V1..V5 here). Only V1 and V3 are used: their labels are converted
    from the ``chr<digits>(mat|pat)`` form to ``<digits>m`` / ``<digits>p``.
    Every row whose two labels are both among the 38 axis labels
    (1m..19m, 1p..19p) adds one to cell (V1, V3) and one to cell (V3, V1);
    a row with V1 == V3 therefore adds two to the diagonal cell. Rows with
    any other label (for example 20m / 20p) are ignored.

    Parameters
    ----------
    input_file : str
        Path of the tab-separated input file.
    output_dir : str
        Directory that receives the CSV. It is created if it does not exist.
        The output file name is the input's base name with its extension
        replaced by ``.csv``.

    Returns
    -------
    None
        The matrix is written to disk; the symmetry check result and the
        output path are printed.
    """
    # Define chromosome names (axis order of the matrix)
    chromosomes = [
        "1m", "2m", "3m", "4m", "5m", "6m", "7m", "8m", "9m", "10m",
        "11m", "12m", "13m", "14m", "15m", "16m", "17m", "18m", "19m",
        "1p", "2p", "3p", "4p", "5p", "6p", "7p", "8p", "9p", "10p",
        "11p", "12p", "13p", "14p", "15p", "16p", "17p", "18p", "19p"
    ]

    # Function to map original chromosome names to new format
    def map_chr(name):
        match = re.match(r'chr(\d+)\((mat|pat)\)', name)
        if match is None:
            return name
        num, type_ = match.groups()
        return f"{num}{'m' if type_ == 'mat' else 'p'}"

    # Read the tab-separated file into a DataFrame
    df = pd.read_csv(input_file, delimiter='\t', header=None, names=['V1', 'V2', 'V3', 'V4', 'V5'])

    # Apply the mapping to both chromosome label columns
    row_labels = df['V1'].apply(map_chr)
    col_labels = df['V3'].apply(map_chr)

    # Translate labels to axis positions. Labels outside `chromosomes` become
    # NaN and are dropped, which covers the 20m / 20p exclusion as well.
    axis_position = {name: pos for pos, name in enumerate(chromosomes)}
    n_chromosomes = len(chromosomes)
    row_pos = row_labels.map(axis_position)
    col_pos = col_labels.map(axis_position)
    on_axes = row_pos.notna() & col_pos.notna()

    # Count all (row, column) pairs in one vectorised pass instead of looping
    # over the dataframe row by row with df.iloc[i].
    flat_cells = (
        row_pos[on_axes].astype(np.int64) * n_chromosomes
        + col_pos[on_axes].astype(np.int64)
    ).to_numpy()
    directed_counts = np.bincount(
        flat_cells, minlength=n_chromosomes ** 2
    ).reshape(n_chromosomes, n_chromosomes)

    # Symmetric update: each row counts once for (V1, V3) and once for (V3, V1)
    interaction_matrix = pd.DataFrame(
        (directed_counts + directed_counts.T).astype(np.int64),
        index=chromosomes,
        columns=chromosomes,
    )

    # Check if the matrix is symmetric
    if interaction_matrix.equals(interaction_matrix.T):
        print("The matrix is symmetric.")
    else:
        print("The matrix is not symmetric.")

        # Find discrepancies: positions where a cell differs from its mirror cell
        discrepancies = np.where(interaction_matrix != interaction_matrix.T)
        discrepancy_indices = list(zip(discrepancies[0], discrepancies[1]))
        print(discrepancy_indices)

        for index in discrepancy_indices:
            row_idx, col_idx = index
            print(f"Discrepancy at ({interaction_matrix.index[row_idx]}, {interaction_matrix.columns[col_idx]}): {interaction_matrix.iat[row_idx, col_idx]} vs {interaction_matrix.iat[col_idx, row_idx]}")

    # Save the matrix as a CSV file. Swapping only the final extension keeps
    # the output name distinct from the input name even when the input does
    # not end in ".txt".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    stem, _ = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_dir, stem + '.csv')
    interaction_matrix.to_csv(output_file)
    print(f"Matrix saved to {output_file}")

# Example usage:
# process_chromosome_interactions('/path/to/input_file.txt', '/path/to/output_dir')


In [4]:
# Build and save the interaction matrix for topospace_DNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_DNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_DNA.csv


In [5]:
# Build and save the interaction matrix for topospace_LINE.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_LINE.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_LINE.csv


In [6]:
# Build and save the interaction matrix for topospace_LTR.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_LTR.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_LTR.csv


In [7]:
# Build and save the interaction matrix for topospace_Low_complexity.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_Low_complexity.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_Low_complexity.csv


In [8]:
# Build and save the interaction matrix for topospace_Other.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_Other.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_Other.csv


In [9]:
# Build and save the interaction matrix for topospace_RC.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_RC.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_RC.csv


In [10]:
# Build and save the interaction matrix for topospace_RNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_RNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_RNA.csv


In [11]:
# Build and save the interaction matrix for topospace_SINE.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_SINE.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_SINE.csv


In [12]:
# Build and save the interaction matrix for topospace_Satellite.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_Satellite.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_Satellite.csv


In [13]:
# Build and save the interaction matrix for topospace_Simple_repeat.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_Simple_repeat.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_Simple_repeat.csv


In [14]:
# Build and save the interaction matrix for topospace_Unknown.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_Unknown.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_Unknown.csv


In [15]:
# Build and save the interaction matrix for topospace_rRNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_rRNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_rRNA.csv


In [3]:
# Build and save the interaction matrix for topospace_scRNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_scRNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_scRNA.csv


In [4]:
# Build and save the interaction matrix for topospace_snRNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_snRNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_snRNA.csv


In [5]:
# Build and save the interaction matrix for topospace_srpRNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_srpRNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_srpRNA.csv


In [6]:
# Build and save the interaction matrix for topospace_tRNA.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_tRNA.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_tRNA.csv


In [3]:
# Build and save the interaction matrix for topospace_SE160.txt
process_chromosome_interactions(
    input_file='/rds/projects/v/varnaic-schic-ml/Data/esc_3dg/topospace_SE160.txt',
    output_dir='/rds/homes/s/sxc1561/Thesis/repeats_csv',
)

The matrix is symmetric.
Matrix saved to /rds/homes/s/sxc1561/Thesis/repeats_csv/topospace_SE160.csv
