In [7]:
import pandas as pd

# Path to your WIG file
wig_file_path = 'conserved_areas_data'

# Initialize lists to store the positions and scores
positions = []
scores = []
current_chrom = None
current_span = 1  # Default span is 1 bp per line if not otherwise specified

# Read the file line by line
with open(wig_file_path, 'r') as file:
    for line in file:
        # Skip comment lines and track/header lines
        if line.startswith('#') or line.startswith('track'):
            continue
        
        # Check for the variableStep line to update chromosome and span if specified
        if line.startswith('variableStep'):
            # Extract chromosome and span from this line
            parts = line.strip().split()
            for part in parts:
                if part.startswith('chrom='):
                    current_chrom = part.split('=')[1]
                elif part.startswith('span='):
                    current_span = int(part.split('=')[1])
            continue  # Move to the next line
        
        if line.startswith('-------'):
            break

        # Process data lines (position and score)
        position, score = line.strip().split()
        position = int(position)  # Convert position to integer
        score = float(score)      # Convert score to float

        # Append the data to the lists
        positions.append(position)
        scores.append(score)

# Create a DataFrame from the collected data
wig_df = pd.DataFrame({
    'position': positions,
    'score': scores
})

# Display the first few rows to check the DataFrame
print(wig_df.head())


   position     score
0     11702  0.544299
1     11703  0.637228
2     11704  0.670417
3     11705  0.816449
4     11706  0.829724


In [10]:
wig_df

Unnamed: 0,position,score
0,11702,0.544299
1,11703,0.637228
2,11704,0.670417
3,11705,0.816449
4,11706,0.829724
...,...,...
9999995,119147824,1.000000
9999996,119147825,1.000000
9999997,119147826,0.992126
9999998,119147827,0.992126


In [8]:
import pandas as pd

# Path to your BED file
bed_file_path = 'conserved_areas_BED'

# Initialize lists to store the BED columns
chromosomes = []
start_positions = []
end_positions = []
names = []

# Read the file line by line
with open(bed_file_path, 'r') as file:
    for line in file:
        # Skip track line and comment lines
        if line.startswith('track') or line.startswith('#'):
            continue
        
        if line.startswith('-------'):
            break

        # Process the BED data lines
        fields = line.strip().split('\t')
        chromosomes.append(fields[0])
        start_positions.append(int(fields[1]))
        end_positions.append(int(fields[2]))
        names.append(fields[3])

# Create a DataFrame from the collected data
bed_df = pd.DataFrame({
    'chromosome': chromosomes,
    'start': start_positions,
    'end': end_positions,
    'name': names
})

# Display the first few rows to verify the DataFrame
print(bed_df.head())


  chromosome  start    end    name
0       chr1  11701  11710  chr1.1
1       chr1  12007  12020  chr1.2
2       chr1  12021  12035  chr1.3
3       chr1  12039  12050  chr1.4
4       chr1  12064  12072  chr1.5


In [9]:
bed_df

Unnamed: 0,chromosome,start,end,name
0,chr1,11701,11710,chr1.1
1,chr1,12007,12020,chr1.2
2,chr1,12021,12035,chr1.3
3,chr1,12039,12050,chr1.4
4,chr1,12064,12072,chr1.5
...,...,...,...,...
9999995,chr10,7906106,7906110,chr10.29891
9999996,chr10,7906224,7906228,chr10.29892
9999997,chr10,7907427,7907429,chr10.29893
9999998,chr10,7908356,7908371,chr10.29894


In [11]:
# Initialize lists to store expanded positions and corresponding information
expanded_chromosomes = []
expanded_positions = []
expanded_names = []

# Expand each range in bed_df to individual positions
for idx, row in bed_df.iterrows():
    for pos in range(row['start'], row['end']):
        expanded_chromosomes.append(row['chromosome'])
        expanded_positions.append(pos)
        expanded_names.append(row['name'])

# Create a DataFrame with expanded positions
expanded_bed_df = pd.DataFrame({
    'chromosome': expanded_chromosomes,
    'position': expanded_positions,
    'name': expanded_names
})

# Merge the expanded BED positions with wig_df on 'position'
merged_df = pd.merge(expanded_bed_df, wig_df, on='position')

# Filter for only those rows where the positions match between expanded_bed_df and wig_df
# This will include 'chromosome', 'position', 'name', and 'score' columns
final_df = merged_df[['chromosome', 'position', 'score', 'name']]

# Display the first few rows of the result to verify
print(final_df.head())


  chromosome  position     score    name
0       chr1     11702  0.544299  chr1.1
1       chr1     11703  0.637228  chr1.1
2       chr1     11704  0.670417  chr1.1
3       chr1     11705  0.816449  chr1.1
4       chr1     11706  0.829724  chr1.1


In [18]:
final_df.to_csv("conservation_scores.csv", index=False)