In [1]:
'''
Import Block
'''

import pandas as pd
import csv
import numpy as np
import re
import itertools

In [2]:
'''
This block of code is used to get a list (descriptors) of the desired desriptors that come out of InfoGain. The file that is being used
right now is one that had 477 descriptors coming out of InfoGain.
'''

# Load the CSV file into a DataFrame
df = pd.read_csv(r'C:\Users\vishn\Downloads\REHS\Post_WEKA\Ouput (476) Data.csv')
# Get a list of the column headers (descriptors)
descriptors = df.columns.tolist()

In [3]:
'''
This block is used for calculating the sequence descriptors.
'''

def symmetry(a):
    '''
    Function to calculate the symmetry of a given sequence.
    Symmetry is defined as the number of identical base pairs starting from either end of the sequence.

    Parameters:
    a (str): The sequence whose symmetry is to be calculated.

    Returns:
    int: The calculated symmetry.
    '''
    count = 0
    start = 0
    last = len(a) - 1
    mid = len(a)//2

    while mid > start:
        if a[start] == a[last]:
            count = count + 1
            last = last - 1
            start = start + 1
        else:
            break
  
    return count

def calculate_descriptors(df):
    '''
    Function to calculate different descriptors for the miRNA sequences in the DataFrame.

    Parameters:
    df (pd.DataFrame): DataFrame with miRNA sequences.

    Returns:
    df (pd.DataFrame): DataFrame with calculated descriptors added.
    '''
    # Generate motifs
    base_pairs = ['A', 'U', 'C', 'G']
    two_base_motifs = [''.join(i) for i in itertools.product(base_pairs, repeat=2)]
    three_base_motifs = [''.join(i) for i in itertools.product(base_pairs, repeat=3)]
    four_base_motifs = [''.join(i) for i in itertools.product(base_pairs, repeat=4)]
    all_motifs = two_base_motifs + three_base_motifs + four_base_motifs

    for index, row in df.iterrows():
        if type(row['Sequence']) is str:
            seq = row['Sequence']
            a = seq.count('A')
            u = seq.count('U')
            c = seq.count('C')
            g = seq.count('G')

            N = len(seq)

            fa = a/N
            fu = u/N
            fc = c/N
            fg = g/N

            meanmass = (135.1 * a + 112.1 * u + 111.1 * c + 151.1 * g)/N

            hbonds = 2 * (a + u) + 3 * (c + g)

            sym = symmetry(seq)

            df.at[index, 'N'] = N
            df.at[index, 'A'] = a
            df.at[index, 'U'] = u
            df.at[index, 'C'] = c
            df.at[index, 'G'] = g

            df.at[index, 'fA'] = fa
            df.at[index, 'fU'] = fu
            df.at[index, 'fC'] = fc
            df.at[index, 'fG'] = fg

            df.at[index, 'Mean_Mass'] = meanmass

            df.at[index, 'H_Bonds'] = hbonds

            df.at[index, 'Symmetry'] = sym

            # motif calculations
            for motif in all_motifs:
                df.at[index, motif + "Sequence"] = 1 if motif in seq else 0

            # first 5 and last 5 base pair motifs
            first_five = seq[:5]
            last_five = seq[-5:]
            for motif in two_base_motifs + three_base_motifs + four_base_motifs:
                df.at[index, "'" + motif + " First'"] = 1 if motif in first_five else 0
            
            for motif in two_base_motifs + three_base_motifs + four_base_motifs:
                df.at[index, "'" + motif + " Last'"] = 1 if motif in last_five else 0

    return df


idf = pd.read_csv(r'C:\Users\vishn\Downloads\REHS\ALT TEST\ALT TEST SEQUENCE DATA - Copy.csv')
sequence_calcualted_df = calculate_descriptors(idf)

  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[index, motif + "Sequence"] = 1 if motif in seq else 0
  df.at[

In [17]:
def update_csv(df1):
    '''
    Function to map miRNA to gene targets.

    Parameters:
    df1 (pd.DataFrame): DataFrame with miRNA and gene target information.

    Returns:
    df (pd.DataFrame): Updated DataFrame with mapped miRNA to gene targets.
    '''
    # Store miRNA-gene relationships in a dictionary
    miRNA_gene_dict = {}

    for index, row in df1.iterrows():
        miRNA = row[1]
        gene = row[2]
        if miRNA in miRNA_gene_dict:
            miRNA_gene_dict[miRNA].append(gene)
        else:
            miRNA_gene_dict[miRNA] = [gene]

    # Extract unique gene targets
    unique_genes = list(set(df1.iloc[:, 2]))  # Converted set to list

    # Create new dataframe with miRNA as index and genes as columns
    df = pd.DataFrame(0, index=miRNA_gene_dict.keys(), columns=unique_genes)

    # Update dataframe with the information from miRNA_gene_dict
    for miRNA, gene_list in miRNA_gene_dict.items():
        for gene in gene_list:
            if gene in df.columns:
                df.loc[miRNA, gene] = 1

    return df

path_to_csv = pd.read_csv(r'C:\Users\vishn\Downloads\REHS\ALT TEST\BIG_ALT-RDB_99.csv')
updated_df = update_csv(path_to_csv)
updated_df

Unnamed: 0,SLC2A13,DOCK9,RRAGD,FAM8A1,PFN2,SERINC2,PI15,GLCE,SH3TC2,KLHL42,...,SOS2,AFF4,ZFR,SLIT2,GRIK2,PNRC1,FGF7,MMRN1,OSBPL11,KDM7A
hsa-miR-92a-3p,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-155-3p,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-21-3p,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-27a-3p,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
hsa-miR-200a-3p,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
hsa-miR-20b-5p,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-9-5p,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-183-5p,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hsa-miR-223-3p,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Combine the sequence and gene target dataframes
combined_df = pd.concat([sequence_calcualted_df.set_index('miRNA'), updated_df], axis=1)

# Start with your original DataFrame
original_df = combined_df.copy()

# Reindex your original DataFrame with the additional descriptors, fill missing values with 0
new_df = original_df.reindex(columns=descriptors, fill_value=0)

new_df.to_csv('Alt_Test_Data(476) for 99.csv')