# 1. Fickett_Score

In [1]:
# One score per FASTA record, in file order; filled by the loop at the end of this cell.
ficket_arr=[]
def calculate_fickett_score(seq):
    """
    Calculates the composition score this notebook calls the "Fickett score".

    The value is ``(purine / pyrimidine ratio) * (A / T ratio) + GC fraction - 0.5``,
    computed from the counts of upper-case ``A``, ``C``, ``G`` and ``T`` only.
    Every other character (lower-case bases, ``N``, ``U``, ...) is ignored.

    NOTE(review): this formula does not look like the published Fickett
    TESTCODE statistic (which is built from position-specific base frequencies
    and lookup tables) -- confirm that this is the intended feature.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    float or int
        The score. ``0`` when ``seq`` contains no A/C/G/T at all. ``nan`` when
        the sequence contains no ``T``: the A/T ratio (and, if ``C`` is missing
        too, the purine/pyrimidine ratio) would divide by zero.
    """
    num_a = seq.count('A')
    num_c = seq.count('C')
    num_g = seq.count('G')
    num_t = seq.count('T')

    total = num_a + num_c + num_g + num_t
    if total == 0:
        return 0

    # No T makes freq_a / freq_t undefined. It also covers the case where
    # freq_c + freq_t is zero, because that requires num_t == 0 as well.
    if num_t == 0:
        return float('nan')

    freq_a = num_a / total
    freq_c = num_c / total
    freq_g = num_g / total
    freq_t = num_t / total

    r_y_ratio = (freq_a + freq_g) / (freq_c + freq_t)
    a_t_ratio = freq_a / freq_t
    gc_content = freq_g + freq_c
    fickett_score = (r_y_ratio * a_t_ratio) + gc_content - 0.5

    return fickett_score

def read_fasta_file(filename):
    """
    Reads a FASTA file and returns a list of (header, sequence) tuples.

    A line starting with ``>`` opens a new record; its text after the ``>``
    (stripped of surrounding whitespace) becomes the header. All following
    lines up to the next ``>`` are stripped, upper-cased and concatenated into
    the sequence. Lines that appear before the first header are ignored.
    Records with an empty header (a bare ``>`` line) or an empty sequence are
    kept, so the result has exactly one entry per ``>`` line.

    Parameters
    ----------
    filename : str
        Path of the FASTA file.

    Returns
    -------
    list of (str, str)
        ``(header, sequence)`` pairs in file order.
    """
    sequences = []
    # None means "no '>' line seen yet". An empty string cannot be used as the
    # sentinel because a bare '>' line legitimately produces an empty header.
    header = None
    chunks = []
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('>'):
                if header is not None:
                    sequences.append((header, ''.join(chunks)))
                header = line.strip()[1:]
                chunks = []
            else:
                chunks.append(line.strip().upper())
    if header is not None:
        sequences.append((header, ''.join(chunks)))
    return sequences

filename = 'lncRNA.fa'
sequences = read_fasta_file(filename)

# Score every record: print the value rounded to four decimals and keep the
# unrounded value in ficket_arr (same order as the records in the file).
for header, sequence in sequences:
    fickett_score = calculate_fickett_score(sequence)
#     print(f'{header}\tFickett score: {fickett_score:.4f}')
    print(f'{fickett_score:.4f}')
    ficket_arr.append(fickett_score)


0.6505
0.7855
0.6458
0.8385
0.9536
0.5285
1.4686
2.2613
1.1588
1.4634
2.2717
1.4738
1.2076
0.6281
1.3107
0.8314
0.7838
0.7727
0.7763
1.5326
0.7945
0.7188
0.7278
0.6394
1.5655
1.9024
1.5527
2.6883
1.1699
1.2900
2.2517
0.5455
0.8300
0.8449
0.8427
1.0553
0.7166
1.0934
1.1518
0.9435
0.9611
0.8978
0.9332
0.7618
1.1563
1.4313
0.6806
0.9188
0.7261
0.6790
0.7737
1.4122
1.5959
1.2756
1.1780
1.1861
1.4449
1.1218
1.4918
1.0716
1.1217
1.1244
0.4371
1.1331
1.1090
1.1989
1.4894
1.6046
1.1241
1.4189
2.6710
2.2454
0.4953
1.7274
1.7827
1.5488
1.6443
1.6021
1.8982
2.0302
1.7560
1.6940
0.3925
1.3074
1.2746
1.5914
0.9380
0.9095
0.9364
1.1910
1.3274
1.0313
0.9851
1.1057
0.6389
1.0049
1.1060
0.9444
0.7670
1.0858
0.9651
1.1267
0.4198
1.0178
0.9848
0.5649
0.9167
1.0298
1.0375
1.1243
0.9955
0.7147
1.0151
1.1394
1.0098
0.9927
1.0213
0.9973
0.9341
1.0697
1.0212
1.1520
1.0274
1.0761
1.0313
1.0348
1.0028
0.6226
1.0706
1.0538
1.1131
0.9050
0.6260
0.8170
0.9219
0.9449
1.0844
1.3968
1.4643
2.4660
1.2636
1.5329
1.2320

In [2]:
# Echo the collected scores; note that this prints the entire list as the cell output.
ficket_arr

[0.6504675449144326,
 0.7855070943977613,
 0.6457645473026361,
 0.838514565856729,
 0.953617841416333,
 0.5285091804244331,
 1.468576771346222,
 2.2613462606882924,
 1.1588272790596754,
 1.4634357684388009,
 2.2716836734693877,
 1.473784579697663,
 1.2076186093213952,
 0.6281483324017765,
 1.3107217806041334,
 0.8314374082252458,
 0.7837501300514993,
 0.7726853482249165,
 0.7762795593366434,
 1.5325785649263475,
 0.7945074388578401,
 0.7188031011727807,
 0.7277755602440554,
 0.6393622265169037,
 1.565514490821354,
 1.902425149101389,
 1.5527061993945375,
 2.688258648493329,
 1.1699292270406845,
 1.2900107161161067,
 2.2517006802721093,
 0.545517594360418,
 0.8300045358556436,
 0.8449032853849943,
 0.84265896692806,
 1.0552596925873705,
 0.7165648631556003,
 1.0934487521159264,
 1.1517839721751448,
 0.9435068677360485,
 0.9610589237787144,
 0.8977840586656964,
 0.933189759465223,
 0.7618463371054198,
 1.1562854378997516,
 1.431330420895471,
 0.6805729228107427,
 0.9187602945682987,
 0.7

# 2. CpG Islands

In [3]:
# One count per FASTA record, in file order; filled by the SeqIO loop at the end of this cell.
cpg_arr=[]

from Bio import SeqIO

def calculate_cpg_islands(seq):
    """
    Returns one list entry per ``CG`` dinucleotide in ``seq`` (case-insensitive).

    Despite the name, no CpG-island criteria (minimum length, GC fraction,
    observed/expected CpG ratio) are applied. The function records the length
    of every run of consecutive positions at which a ``CG`` dinucleotide
    starts. Two such positions can never be adjacent (the shared base would
    have to be both ``G`` and ``C``), so every entry is ``1`` and
    ``len(result)`` is simply the number of ``CG`` dinucleotides.

    Parameters
    ----------
    seq : str or Bio.Seq.Seq
        Nucleotide sequence.

    Returns
    -------
    list of int
        One ``1`` per ``CG`` dinucleotide, in order of occurrence.
    """
    run_lengths = []
    run = 0
    # Walk over adjacent character pairs once; comparing single characters
    # avoids building and upper-casing a two-character slice per position.
    for first, second in zip(seq, seq[1:]):
        if first in ('C', 'c') and second in ('G', 'g'):
            run += 1
        elif run:
            run_lengths.append(run)
            run = 0

    # Close a run that reaches the end of the sequence.
    if run:
        run_lengths.append(run)

    return run_lengths

# Open the FASTA file and loop over each record
for record in SeqIO.parse("lncRNA.fa", "fasta"):
    seq_id = record.id
    seq = record.seq
    
    # calculate_cpg_islands returns one entry per CG dinucleotide, so this
    # count is the number of CG dinucleotides in the record.
    cpg_islands = calculate_cpg_islands(seq)
    num_islands = len(cpg_islands)
    
    # Print the count (tab-prefixed) and collect it in cpg_arr
#     print(f'{seq_id}\t{num_islands}')
    print(f'\t{num_islands}')
    cpg_arr.append(num_islands)


	23
	32
	11
	20
	9
	14
	30
	3
	5
	12
	9
	16
	4
	40
	2
	29
	17
	6
	34
	0
	6
	17
	14
	22
	7
	11
	10
	11
	13
	10
	9
	4
	19
	13
	14
	17
	12
	11
	7
	11
	10
	13
	10
	5
	2
	2
	3
	1
	15
	10
	189
	36
	2
	52
	47
	46
	43
	41
	38
	35
	40
	37
	31
	36
	39
	40
	36
	23
	20
	28
	4
	5
	3
	2
	6
	9
	9
	11
	11
	10
	6
	14
	5
	23
	32
	14
	49
	78
	9
	51
	46
	169
	109
	211
	48
	100
	153
	36
	78
	152
	75
	113
	24
	60
	70
	29
	64
	70
	69
	55
	27
	74
	58
	59
	70
	99
	87
	96
	61
	53
	57
	57
	30
	64
	98
	60
	59
	25
	53
	17
	48
	25
	64
	39
	21
	19
	4
	24
	82
	13
	134
	47
	22
	52
	7
	18
	52
	41
	30
	35
	45
	43
	56
	37
	19
	15
	33
	18
	82
	39
	163
	51
	15
	18
	8
	59
	59
	101
	96
	94
	95
	52
	52
	59
	28
	66
	67
	65
	16
	33
	60
	28
	21
	22
	33
	30
	23
	70
	92
	81
	132
	17
	209
	103
	41
	94
	44
	37
	23
	42
	52
	64
	37
	11
	25
	134
	33
	21
	11
	193
	280
	23
	5
	14
	31
	109
	83
	55
	21
	136
	35
	136
	26
	54
	12
	19
	7
	4
	19
	11
	25
	15
	26
	34
	33
	55
	118
	7
	27
	261
	16
	50
	26
	51
	53
	12
	108
	78
	40
	78
	1
	66
	84
	5

	31
	10
	87
	0
	2
	19
	14
	8
	3
	11
	13
	5
	6
	26
	25
	39
	48
	36
	24
	35
	3
	31
	3
	5
	6
	5
	101
	55
	49
	10
	41
	9
	4
	2
	5
	1
	5
	5
	14
	37
	9
	24
	12
	93
	12
	21
	51
	47
	23
	60
	41
	168
	43
	54
	36
	61
	25
	16
	19
	13
	4
	4
	10
	37
	29
	14
	31
	5
	1
	28
	4
	19
	35
	32
	23
	18
	18
	7
	3
	12
	27
	3
	14
	16
	16
	16
	17
	16
	16
	4
	24
	20
	21
	39
	44
	37
	40
	50
	30
	36
	27
	64
	34
	30
	23
	16
	16
	8
	32
	4
	13
	7
	4
	4
	9
	11
	2
	56
	9
	3
	7
	24
	10
	4
	7
	5
	2
	9
	9
	17
	14
	14
	46
	31
	17
	45
	43
	43
	62
	31
	120
	9
	10
	16
	13
	10
	19
	21
	18
	25
	20
	39
	4
	4
	9
	4
	16
	13
	16
	13
	36
	46
	85
	84
	28
	60
	37
	35
	50
	20
	5
	12
	23
	7
	16
	20
	14
	17
	17
	11
	14
	16
	7
	11
	17
	16
	9
	7
	17
	16
	14
	17
	3
	67
	42
	9
	21
	9
	62
	58
	43
	35
	27
	8
	9
	5
	76
	49
	54
	47
	76
	44
	2
	8
	7
	57
	181
	40
	72
	42
	10
	50
	79
	15
	8
	5
	7
	32
	10
	29
	19
	29
	21
	18
	15
	12
	12
	113
	28
	66
	72
	71
	26
	11
	5
	85
	86
	18
	13
	72
	63
	52
	6
	27
	47
	26
	25
	40
	29
	33
	41
	29
	20
	18
	33
	33

In [4]:
# Echo the collected counts; note that this prints the entire list as the cell output.
cpg_arr

[23,
 32,
 11,
 20,
 9,
 14,
 30,
 3,
 5,
 12,
 9,
 16,
 4,
 40,
 2,
 29,
 17,
 6,
 34,
 0,
 6,
 17,
 14,
 22,
 7,
 11,
 10,
 11,
 13,
 10,
 9,
 4,
 19,
 13,
 14,
 17,
 12,
 11,
 7,
 11,
 10,
 13,
 10,
 5,
 2,
 2,
 3,
 1,
 15,
 10,
 189,
 36,
 2,
 52,
 47,
 46,
 43,
 41,
 38,
 35,
 40,
 37,
 31,
 36,
 39,
 40,
 36,
 23,
 20,
 28,
 4,
 5,
 3,
 2,
 6,
 9,
 9,
 11,
 11,
 10,
 6,
 14,
 5,
 23,
 32,
 14,
 49,
 78,
 9,
 51,
 46,
 169,
 109,
 211,
 48,
 100,
 153,
 36,
 78,
 152,
 75,
 113,
 24,
 60,
 70,
 29,
 64,
 70,
 69,
 55,
 27,
 74,
 58,
 59,
 70,
 99,
 87,
 96,
 61,
 53,
 57,
 57,
 30,
 64,
 98,
 60,
 59,
 25,
 53,
 17,
 48,
 25,
 64,
 39,
 21,
 19,
 4,
 24,
 82,
 13,
 134,
 47,
 22,
 52,
 7,
 18,
 52,
 41,
 30,
 35,
 45,
 43,
 56,
 37,
 19,
 15,
 33,
 18,
 82,
 39,
 163,
 51,
 15,
 18,
 8,
 59,
 59,
 101,
 96,
 94,
 95,
 52,
 52,
 59,
 28,
 66,
 67,
 65,
 16,
 33,
 60,
 28,
 21,
 22,
 33,
 30,
 23,
 70,
 92,
 81,
 132,
 17,
 209,
 103,
 41,
 94,
 44,
 37,
 23,
 42,
 52,
 64,
 37,
 11

# 3. GC_Content

In [7]:
# One GC percentage per processed sequence; appended to by fasta_to_gc_content below.
gc_arr=[]

def gc_content(seq):
    """
    Returns the GC content of ``seq`` as a percentage (0-100).

    ``G`` and ``C`` are counted case-insensitively, so soft-masked (lower-case)
    sequence is handled. The denominator is the full length of ``seq``, so
    ambiguous characters such as ``N`` lower the percentage.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    float
        Percentage of G/C characters; ``0.0`` for an empty sequence.
    """
    total_count = len(seq)
    if total_count == 0:
        # Avoid dividing by zero; an empty sequence has no G/C at all.
        return 0.0
    upper_seq = seq.upper()
    gc_count = upper_seq.count('G') + upper_seq.count('C')
    return (gc_count / total_count) * 100

def fasta_to_gc_content(fasta_file):
    """
    Prints and collects the GC percentage of every sequence in a FASTA file.

    Lines starting with ``>`` separate records; all other lines are stripped
    and concatenated. For each record with non-empty sequence text, the value
    returned by ``gc_content`` is printed as ``NN.NN%`` followed by a blank
    line and appended to the module-level list ``gc_arr``. Records without
    sequence text are skipped, so ``gc_arr`` can hold fewer entries than the
    file has headers.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file.

    Returns
    -------
    None
    """
    def _flush(parts):
        # Report one finished record; does nothing if no sequence text was collected.
        bases = ''.join(parts)
        if bases:
            pct = gc_content(bases)
            print(f'{pct:.2f}%\n')
            gc_arr.append(pct)

    parts = []
    with open(fasta_file) as handle:
        for raw in handle:
            if raw.startswith('>'):
                _flush(parts)
                parts = []
            else:
                parts.append(raw.strip())
        # The last record has no following header line to trigger the flush.
        _flush(parts)

# Fill gc_arr (and print each percentage) for the lncRNA FASTA file.
fasta_to_gc_content('lncRNA.fa')


55.34%

58.57%

46.73%

53.33%

53.56%

38.19%

46.00%

36.86%

44.52%

49.52%

54.46%

47.46%

53.87%

45.00%

48.65%

48.96%

39.59%

45.68%

47.19%

40.04%

46.33%

48.43%

49.30%

43.76%

39.59%

43.98%

54.11%

50.12%

38.36%

46.46%

54.76%

47.71%

56.24%

58.89%

54.18%

54.12%

54.49%

52.36%

48.55%

55.17%

53.21%

53.40%

52.87%

49.32%

48.18%

48.41%

44.89%

44.30%

48.11%

48.26%

57.86%

54.64%

40.13%

46.55%

52.06%

51.81%

49.88%

52.41%

50.00%

53.36%

51.15%

51.16%

43.41%

52.17%

51.11%

51.12%

47.91%

48.21%

48.28%

47.32%

37.95%

36.18%

40.13%

49.40%

52.54%

51.52%

46.92%

42.97%

51.52%

50.70%

51.89%

49.62%

37.91%

41.83%

42.08%

42.43%

50.28%

44.38%

42.54%

47.35%

47.33%

48.02%

50.16%

50.15%

49.30%

50.23%

48.25%

44.63%

46.61%

47.52%

49.46%

49.07%

49.34%

48.84%

50.90%

53.87%

48.68%

48.54%

48.84%

47.89%

52.47%

47.71%

50.88%

52.03%

48.50%

49.85%

49.47%

49.74%

47.98%

47.84%

47.53%

48.56%

52.99%

48.27%

50.05%



In [9]:
# Echo the collected percentages; note that this prints the entire list as the cell output.
gc_arr

[55.34097767048883,
 58.56741573033708,
 46.728971962616825,
 53.32771693344566,
 53.559322033898304,
 38.189533239038184,
 45.99708879184862,
 36.86354378818738,
 44.51510333863275,
 49.51590594744122,
 54.46428571428571,
 47.460197119029566,
 53.86996904024768,
 45.0,
 48.64864864864865,
 48.962336664104534,
 39.585870889159565,
 45.68345323741007,
 47.192839707078925,
 40.0437636761488,
 46.33152173913043,
 48.43492586490939,
 49.29577464788733,
 43.7625754527163,
 39.588100686498855,
 43.97590361445783,
 54.112554112554115,
 50.12106537530266,
 38.35920177383592,
 46.45669291338583,
 54.761904761904766,
 47.714808043875685,
 56.242118537200504,
 58.88501742160279,
 54.17661097852029,
 54.12087912087912,
 54.48577680525164,
 52.364273204903675,
 48.55305466237942,
 55.172413793103445,
 53.21100917431193,
 53.39966832504146,
 52.8695652173913,
 49.31506849315068,
 48.17708333333333,
 48.41437632135307,
 44.89402697495183,
 44.29824561403509,
 48.105436573311366,
 48.258706467661696,


# 4. Hexamer Score

In [10]:
# One value per processed sequence; appended to by fasta_to_hexamer_score below.
hexa_arr=[]

from collections import Counter

def hexamer_score(seq):
    """
    Returns the number of overlapping 6-character windows in ``seq``.

    This is the same value the earlier implementation produced by listing
    every hexamer, counting them with ``Counter`` and summing the counts: that
    sum is just the number of windows, ``len(seq) - 5`` (or 0 for sequences
    shorter than 6), independent of which hexamers occur.

    NOTE(review): the result is therefore fully determined by sequence length
    and carries no hexamer-usage information. A usage-bias score would need
    reference hexamer frequency tables -- confirm what this feature is meant
    to be.

    Parameters
    ----------
    seq : str
        Nucleotide sequence.

    Returns
    -------
    int
        ``max(len(seq) - 5, 0)``.
    """
    return max(len(seq) - 5, 0)

def fasta_to_hexamer_score(fasta_file):
    """
    Prints and collects the ``hexamer_score`` of every sequence in a FASTA file.

    Lines starting with ``>`` separate records; all other lines are stripped
    and concatenated. For each record with non-empty sequence text, the value
    returned by ``hexamer_score`` is printed followed by a blank line and
    appended to the module-level list ``hexa_arr``. Records without sequence
    text are skipped, so ``hexa_arr`` can hold fewer entries than the file has
    headers.

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file.

    Returns
    -------
    None
    """
    def _flush(parts):
        # Report one finished record; does nothing if no sequence text was collected.
        bases = ''.join(parts)
        if bases:
            value = hexamer_score(bases)
            print(f'{value}\n')
            hexa_arr.append(value)

    parts = []
    with open(fasta_file) as handle:
        for raw in handle:
            if raw.startswith('>'):
                _flush(parts)
                parts = []
            else:
                parts.append(raw.strip())
        # The last record has no following header line to trigger the flush.
        _flush(parts)

# Fill hexa_arr (and print each value) for the lncRNA FASTA file.
fasta_to_hexamer_score('lncRNA.fa')


1652

707

530

1182

585

1409

2743

486

624

718

331

1314

318

4855

513

1296

816

273

1224

452

731

602

421

989

432

493

457

408

897

630

331

542

788

569

833

723

452

566

617

691

540

598

570

433

379

468

514

451

602

397

6464

932

451

1864

1353

1373

1196

1180

839

561

1209

1162

1073

1080

1171

1286

1141

804

664

1770

385

603

593

163

389

721

775

1019

688

709

602

1035

1868

2187

2944

867

1942

3307

672

1598

1341

6611

5436

8384

2060

3089

5328

1747

2252

5481

1851

3698

524

1416

1655

628

1548

1944

1854

1396

906

2376

1243

1371

1927

3092

2928

3087

1558

1427

1596

1455

982

1646

3092

1553

1210

719

2490

889

1195

840

2504

2065

1802

499

441

346

3038

438

4147

1384

625

2081

466

397

1147

952

629

817

1038

977

1283

834

535

560

824

1090

1615

982

3527

1553

263

384

381

1754

1758

2524

2222

2165

2191

1670

1668

2001

510

1831

1838

1863

853

1518

538

505

In [12]:
# Echo the collected values; note that this prints the entire list as the cell output.
hexa_arr

[1652,
 707,
 530,
 1182,
 585,
 1409,
 2743,
 486,
 624,
 718,
 331,
 1314,
 318,
 4855,
 513,
 1296,
 816,
 273,
 1224,
 452,
 731,
 602,
 421,
 989,
 432,
 493,
 457,
 408,
 897,
 630,
 331,
 542,
 788,
 569,
 833,
 723,
 452,
 566,
 617,
 691,
 540,
 598,
 570,
 433,
 379,
 468,
 514,
 451,
 602,
 397,
 6464,
 932,
 451,
 1864,
 1353,
 1373,
 1196,
 1180,
 839,
 561,
 1209,
 1162,
 1073,
 1080,
 1171,
 1286,
 1141,
 804,
 664,
 1770,
 385,
 603,
 593,
 163,
 389,
 721,
 775,
 1019,
 688,
 709,
 602,
 1035,
 1868,
 2187,
 2944,
 867,
 1942,
 3307,
 672,
 1598,
 1341,
 6611,
 5436,
 8384,
 2060,
 3089,
 5328,
 1747,
 2252,
 5481,
 1851,
 3698,
 524,
 1416,
 1655,
 628,
 1548,
 1944,
 1854,
 1396,
 906,
 2376,
 1243,
 1371,
 1927,
 3092,
 2928,
 3087,
 1558,
 1427,
 1596,
 1455,
 982,
 1646,
 3092,
 1553,
 1210,
 719,
 2490,
 889,
 1195,
 840,
 2504,
 2065,
 1802,
 499,
 441,
 346,
 3038,
 438,
 4147,
 1384,
 625,
 2081,
 466,
 397,
 1147,
 952,
 629,
 817,
 1038,
 977,
 1283,
 834,
 

# 5. ORF_Length

In [13]:
# One longest-ORF length per FASTA record, in file order; filled by the loop at the end of this cell.
orf_arr=[]

def read_fasta_file(filename):
    """
    Reads a FASTA file and returns a list of (header, sequence) tuples.

    NOTE: a function with this name is also defined in the Fickett score cell;
    when the notebook is run top to bottom this later definition replaces it.

    A line starting with ``>`` opens a new record; its text after the ``>``
    (stripped of surrounding whitespace) becomes the header. All following
    lines up to the next ``>`` are stripped, upper-cased and concatenated into
    the sequence. Lines that appear before the first header are ignored.
    Records with an empty header (a bare ``>`` line) or an empty sequence are
    kept, so the result has exactly one entry per ``>`` line.

    Parameters
    ----------
    filename : str
        Path of the FASTA file.

    Returns
    -------
    list of (str, str)
        ``(header, sequence)`` pairs in file order.
    """
    sequences = []
    # None means "no '>' line seen yet". An empty string cannot be used as the
    # sentinel because a bare '>' line legitimately produces an empty header.
    header = None
    chunks = []
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('>'):
                if header is not None:
                    sequences.append((header, ''.join(chunks)))
                header = line.strip()[1:]
                chunks = []
            else:
                chunks.append(line.strip().upper())
    if header is not None:
        sequences.append((header, ''.join(chunks)))
    return sequences

def find_longest_orf(sequence):
    """
    Finds the length, in nucleotides, of the longest ORF in a DNA sequence.

    An ORF starts at an in-frame ``ATG`` in one of the three forward reading
    frames and extends up to, but not including, the first in-frame stop codon
    (``TAA``, ``TAG`` or ``TGA``). If no stop codon follows, the ORF runs to
    the last complete codon of that frame. Matching is case-sensitive and the
    reverse strand is not examined.

    Each frame is scanned once. Between two stop codons only the first ``ATG``
    can start the longest ORF, so later ``ATG`` codons inside an open ORF do
    not need a scan of their own (re-scanning from each of them is quadratic
    in the worst case and cannot change the maximum).

    Parameters
    ----------
    sequence : str
        Upper-case DNA sequence.

    Returns
    -------
    int
        Length of the longest ORF (a multiple of 3); ``0`` if there is no ``ATG``.
    """
    stop_codons = ('TAA', 'TAG', 'TGA')
    # Exclusive upper bound for the start index of a complete codon.
    codon_limit = len(sequence) - 2
    longest_orf_length = 0
    for frame in range(3):
        orf_start = None  # index of the ATG that opened the current ORF, if any
        for pos in range(frame, codon_limit, 3):
            codon = sequence[pos:pos+3]
            if orf_start is None:
                if codon == 'ATG':
                    orf_start = pos
            elif codon in stop_codons:
                longest_orf_length = max(longest_orf_length, pos - orf_start)
                orf_start = None
        if orf_start is not None:
            # No stop codon before the end: count the complete codons that remain.
            open_length = 3 * len(range(orf_start, codon_limit, 3))
            longest_orf_length = max(longest_orf_length, open_length)
    return longest_orf_length

filename = 'lncRNA.fa'
sequences = read_fasta_file(filename)

# Print the longest-ORF length of every record and collect it in orf_arr
# (same order as the records in the file).
for header, sequence in sequences:
    orf_length = find_longest_orf(sequence)
#     print(f'{header}\tORF length: {orf_length}')
    print(f'{orf_length}')
    orf_arr.append(orf_length)

228
225
108
255
222
870
237
255
255
213
114
159
180
270
120
390
279
93
279
99
195
186
186
186
153
141
72
168
255
183
114
198
438
426
291
228
309
258
279
237
291
150
180
150
222
135
207
132
186
186
621
171
138
279
258
258
258
363
144
144
258
258
153
354
258
258
258
144
144
258
63
78
123
12
84
258
258
258
258
258
354
258
504
279
279
159
477
489
141
87
87
306
366
366
168
231
273
189
273
372
135
372
72
87
141
273
273
189
321
87
147
174
273
168
141
231
231
372
273
87
273
87
147
273
273
273
153
75
366
147
273
114
414
351
288
168
138
0
369
153
441
384
177
627
48
294
162
96
96
96
96
96
528
96
84
96
309
246
411
483
858
144
87
300
30
264
411
246
246
246
246
357
246
264
186
246
246
246
246
171
87
273
69
312
255
294
102
282
264
411
141
51
1089
738
222
798
903
741
417
360
564
147
141
222
144
489
789
420
243
603
603
288
165
282
255
426
426
354
330
522
195
579
309
432
276
57
135
66
180
174
126
162
324
297
87
405
444
132
261
444
405
261
3
471
60
216
300
252
390
378
351
309
645
309
330
318
234
351
261


216
90
318
318
123
159
204
177
117
150
150
117
177
117
117
75
75
75
117
126
177
315
129
93
219
117
306
315
306
315
168
123
123
312
249
123
123
204
9
123
249
123
249
249
249
189
249
126
249
249
249
249
249
153
222
213
105
153
177
222
207
156
177
201
156
213
84
138
192
129
141
129
108
135
171
96
78
165
171
183
165
117
165
183
567
204
276
720
1053
276
195
231
162
135
135
135
162
138
132
216
54
363
291
126
99
186
279
453
216
147
135
102
318
459
96
84
204
294
54
327
222
102
183
51
558
180
156
180
237
180
237
156
222
237
222
222
126
180
156
177
192
237
237
192
189
177
234
204
15
171
129
222
237
312
180
237
237
360
309
426
348
225
159
159
159
159
159
159
126
120
327
117
117
93
300
186
177
42
129
129
243
744
192
396
204
189
36
177
624
492
375
135
303
120
156
228
264
342
141
240
240
84
528
126
318
207
108
306
204
225
276
375
114
351
177
177
177
276
177
276
300
210
204
150
159
153
507
153
150
153
153
111
225
225
225
225
225
225
225
369
285
369
339
30
156
180
225
156
51
270
51
90
297
138
231
141


In [14]:
# Echo the collected ORF lengths; note that this prints the entire list as the cell output.
orf_arr

[228,
 225,
 108,
 255,
 222,
 870,
 237,
 255,
 255,
 213,
 114,
 159,
 180,
 270,
 120,
 390,
 279,
 93,
 279,
 99,
 195,
 186,
 186,
 186,
 153,
 141,
 72,
 168,
 255,
 183,
 114,
 198,
 438,
 426,
 291,
 228,
 309,
 258,
 279,
 237,
 291,
 150,
 180,
 150,
 222,
 135,
 207,
 132,
 186,
 186,
 621,
 171,
 138,
 279,
 258,
 258,
 258,
 363,
 144,
 144,
 258,
 258,
 153,
 354,
 258,
 258,
 258,
 144,
 144,
 258,
 63,
 78,
 123,
 12,
 84,
 258,
 258,
 258,
 258,
 258,
 354,
 258,
 504,
 279,
 279,
 159,
 477,
 489,
 141,
 87,
 87,
 306,
 366,
 366,
 168,
 231,
 273,
 189,
 273,
 372,
 135,
 372,
 72,
 87,
 141,
 273,
 273,
 189,
 321,
 87,
 147,
 174,
 273,
 168,
 141,
 231,
 231,
 372,
 273,
 87,
 273,
 87,
 147,
 273,
 273,
 273,
 153,
 75,
 366,
 147,
 273,
 114,
 414,
 351,
 288,
 168,
 138,
 0,
 369,
 153,
 441,
 384,
 177,
 627,
 48,
 294,
 162,
 96,
 96,
 96,
 96,
 96,
 528,
 96,
 84,
 96,
 309,
 246,
 411,
 483,
 858,
 144,
 87,
 300,
 30,
 264,
 411,
 246,
 246,
 246,
 246,
 3

# 6. Transcript_length

In [15]:
# One length per processed sequence; appended to by fasta_to_transcript_length below.
trans_arr=[]

def transcript_length(seq):
    """Returns the number of characters in ``seq`` (its ``len``)."""
    n_chars = len(seq)
    return n_chars

def fasta_to_transcript_length(fasta_file):
    """
    Prints and collects the length of every sequence in a FASTA file.

    Lines starting with ``>`` separate records; all other lines are stripped
    and concatenated. For each record with non-empty sequence text, the value
    returned by ``transcript_length`` is printed followed by a blank line and
    appended to the module-level list ``trans_arr``. Records without sequence
    text are skipped, so ``trans_arr`` can hold fewer entries than the file
    has headers.

    Side effect: the module-level name ``length`` is overwritten (kept for
    compatibility with the existing behaviour of this function).

    Parameters
    ----------
    fasta_file : str
        Path of the FASTA file.

    Returns
    -------
    int or None
        Length of the last sequence processed, or ``None`` if the file
        contained no sequence text.
    """
    global length
    # Reset first so that an input without sequences neither raises NameError
    # (fresh kernel) nor returns a stale value left over from an earlier run.
    length = None
    with open(fasta_file) as f:
        seq = ''
        for line in f:
            if line.startswith('>'):
                if seq:
                    length = transcript_length(seq)
                    print(f'{length}\n')
                    trans_arr.append(length)
                    seq = ''
            else:
                seq += line.strip()
        # The last record has no following header line to trigger the flush.
        if seq:
            length = transcript_length(seq)
            print(f'{length}\n')
            trans_arr.append(length)
    return length
# Fill trans_arr for the lncRNA FASTA file. As the last expression of the cell,
# the returned length of the final sequence is also shown as the cell result.
fasta_to_transcript_length('lncRNA.fa')


1657

712

535

1187

590

1414

2748

491

629

723

336

1319

323

4860

518

1301

821

278

1229

457

736

607

426

994

437

498

462

413

902

635

336

547

793

574

838

728

457

571

622

696

545

603

575

438

384

473

519

456

607

402

6469

937

456

1869

1358

1378

1201

1185

844

566

1214

1167

1078

1085

1176

1291

1146

809

669

1775

390

608

598

168

394

726

780

1024

693

714

607

1040

1873

2192

2949

872

1947

3312

677

1603

1346

6616

5441

8389

2065

3094

5333

1752

2257

5486

1856

3703

529

1421

1660

633

1553

1949

1859

1401

911

2381

1248

1376

1932

3097

2933

3092

1563

1432

1601

1460

987

1651

3097

1558

1215

724

2495

894

1200

845

2509

2070

1807

504

446

351

3043

443

4152

1389

630

2086

471

402

1152

957

634

822

1043

982

1288

839

540

565

829

1095

1620

987

3532

1558

268

389

386

1759

1763

2529

2227

2170

2196

1675

1673

2006

515

1836

1843

1868

858

1523

543

510

638

In [16]:
# Echo the collected lengths; note that this prints the entire list as the cell output.
trans_arr

[1657,
 712,
 535,
 1187,
 590,
 1414,
 2748,
 491,
 629,
 723,
 336,
 1319,
 323,
 4860,
 518,
 1301,
 821,
 278,
 1229,
 457,
 736,
 607,
 426,
 994,
 437,
 498,
 462,
 413,
 902,
 635,
 336,
 547,
 793,
 574,
 838,
 728,
 457,
 571,
 622,
 696,
 545,
 603,
 575,
 438,
 384,
 473,
 519,
 456,
 607,
 402,
 6469,
 937,
 456,
 1869,
 1358,
 1378,
 1201,
 1185,
 844,
 566,
 1214,
 1167,
 1078,
 1085,
 1176,
 1291,
 1146,
 809,
 669,
 1775,
 390,
 608,
 598,
 168,
 394,
 726,
 780,
 1024,
 693,
 714,
 607,
 1040,
 1873,
 2192,
 2949,
 872,
 1947,
 3312,
 677,
 1603,
 1346,
 6616,
 5441,
 8389,
 2065,
 3094,
 5333,
 1752,
 2257,
 5486,
 1856,
 3703,
 529,
 1421,
 1660,
 633,
 1553,
 1949,
 1859,
 1401,
 911,
 2381,
 1248,
 1376,
 1932,
 3097,
 2933,
 3092,
 1563,
 1432,
 1601,
 1460,
 987,
 1651,
 3097,
 1558,
 1215,
 724,
 2495,
 894,
 1200,
 845,
 2509,
 2070,
 1807,
 504,
 446,
 351,
 3043,
 443,
 4152,
 1389,
 630,
 2086,
 471,
 402,
 1152,
 957,
 634,
 822,
 1043,
 982,
 1288,
 839,
 

# 7. MFE score

In [25]:
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp
from Bio import SeqIO

In [26]:
# NOTE(review): despite the "MFE" naming, nothing in this cell computes a
# minimum free energy. MeltingTemp.Tm_Wallace is Biopython's Wallace-rule
# melting temperature, documented as 4*(G+C) + 2*(A+T), which is meant for
# short oligos. For whole transcripts it grows almost linearly with length:
# the correlation table at the end of this notebook shows 0.9968 between
# "MFE Score" and "Transcript Length". A real MFE needs an RNA folding tool
# -- confirm which quantity is intended.
mfe_arr=[] 
for seq_record in SeqIO.parse("lncRNA.fa", "fasta"):
    rna_seq = Seq(str(seq_record.seq))

    # The value is computed on the reverse complement of the transcribed sequence.
    mfe = MeltingTemp.Tm_Wallace(rna_seq.transcribe().reverse_complement_rna())
    mfe_arr.append(mfe)
    print(mfe)

5148.0
2258.0
1570.0
3640.0
1812.0
3908.0
8024.0
1344.0
1818.0
2162.0
1038.0
3890.0
994.0
14094.0
1540.0
3876.0
2292.0
810.0
3618.0
1280.0
2154.0
1802.0
1272.0
2858.0
1220.0
1434.0
1424.0
1240.0
2496.0
1860.0
1040.0
1616.0
2478.0
1824.0
2584.0
2244.0
1412.0
1740.0
1848.0
2160.0
1670.0
1850.0
1758.0
1308.0
1138.0
1404.0
1504.0
1316.0
1798.0
1192.0
20424.0
2898.0
1278.0
5478.0
4130.0
4184.0
3600.0
3612.0
2532.0
1736.0
3670.0
3528.0
3092.0
3302.0
3554.0
3902.0
3390.0
2398.0
1984.0
5230.0
1076.0
1656.0
1676.0
502.0
1202.0
2200.0
2292.0
2928.0
2100.0
2152.0
1844.0
3112.0
5166.0
6218.0
8380.0
2484.0
5852.0
9564.0
1930.0
4724.0
3966.0
19586.0
16340.0
25192.0
6166.0
9296.0
15812.0
5068.0
6618.0
16186.0
5548.0
11040.0
1580.0
4230.0
5010.0
1948.0
4618.0
5790.0
5534.0
4144.0
2778.0
7034.0
3766.0
4184.0
5738.0
9282.0
8768.0
9260.0
4626.0
4234.0
4724.0
4338.0
3020.0
4896.0
9294.0
4614.0
3794.0
2154.0
7570.0
2672.0
3570.0
2530.0
7704.0
6258.0
5322.0
1474.0
1300.0
1142.0
9718.0
1418.0
13426.0
4516.0


In [27]:
# Echo the collected values; note that this prints the entire list as the cell output.
mfe_arr

[5148.0,
 2258.0,
 1570.0,
 3640.0,
 1812.0,
 3908.0,
 8024.0,
 1344.0,
 1818.0,
 2162.0,
 1038.0,
 3890.0,
 994.0,
 14094.0,
 1540.0,
 3876.0,
 2292.0,
 810.0,
 3618.0,
 1280.0,
 2154.0,
 1802.0,
 1272.0,
 2858.0,
 1220.0,
 1434.0,
 1424.0,
 1240.0,
 2496.0,
 1860.0,
 1040.0,
 1616.0,
 2478.0,
 1824.0,
 2584.0,
 2244.0,
 1412.0,
 1740.0,
 1848.0,
 2160.0,
 1670.0,
 1850.0,
 1758.0,
 1308.0,
 1138.0,
 1404.0,
 1504.0,
 1316.0,
 1798.0,
 1192.0,
 20424.0,
 2898.0,
 1278.0,
 5478.0,
 4130.0,
 4184.0,
 3600.0,
 3612.0,
 2532.0,
 1736.0,
 3670.0,
 3528.0,
 3092.0,
 3302.0,
 3554.0,
 3902.0,
 3390.0,
 2398.0,
 1984.0,
 5230.0,
 1076.0,
 1656.0,
 1676.0,
 502.0,
 1202.0,
 2200.0,
 2292.0,
 2928.0,
 2100.0,
 2152.0,
 1844.0,
 3112.0,
 5166.0,
 6218.0,
 8380.0,
 2484.0,
 5852.0,
 9564.0,
 1930.0,
 4724.0,
 3966.0,
 19586.0,
 16340.0,
 25192.0,
 6166.0,
 9296.0,
 15812.0,
 5068.0,
 6618.0,
 16186.0,
 5548.0,
 11040.0,
 1580.0,
 4230.0,
 5010.0,
 1948.0,
 4618.0,
 5790.0,
 5534.0,
 4144.0,
 2778

# 8. Protein Coding Potential

# 9. Conservation Score

# Dataset and Correlation

In [28]:
import numpy as np
import pandas as pd

# Stack the per-sequence feature lists and transpose so that each row is one
# sequence and each column one feature.
# NOTE(review): this assumes all five lists have the same length and order.
# orf_arr and ficket_arr come from read_fasta_file (one entry per header),
# cpg_arr comes from SeqIO.parse, while gc_arr and trans_arr skip records
# without sequence text -- verify the lengths match before trusting row alignment.
dataset = [orf_arr, gc_arr, trans_arr, cpg_arr, ficket_arr]
# Collected here but not used anywhere in this cell.
Excluded =[mfe_arr,hexa_arr]
numpy_data= np.array(dataset).T

df = pd.DataFrame(data=numpy_data, columns=["ORF Length", "GC Content", "Transcript Length",
                                            "CpG Island", "Ficket score"])

# NOTE(review): the stored output below shows an "Unnamed: 0" column and an
# "MFE Score" column that this code does not create; the output appears to
# come from a different version of the cell -- re-run to refresh it.
df

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,MFE Score
0,228.0,55.340978,1657.0,23.0,0.650468,5148.0
1,225.0,58.567416,712.0,32.0,0.785507,2258.0
2,108.0,46.728972,535.0,11.0,0.645765,1570.0
3,255.0,53.327717,1187.0,20.0,0.838515,3640.0
4,222.0,53.559322,590.0,9.0,0.953618,1812.0
...,...,...,...,...,...,...
3823,264.0,45.504950,2525.0,48.0,1.443093,7348.0
3824,264.0,45.326504,2343.0,46.0,1.416213,6810.0
3825,264.0,46.884666,2263.0,46.0,1.367065,6648.0
3826,264.0,45.733386,2543.0,49.0,1.463643,7412.0


In [29]:
# Pairwise correlation between the feature columns of df.
# NOTE(review): the stored table includes "MFE Score", which is not a column
# of df as built in the previous cell -- re-run to refresh the output.
df.corr()

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,MFE Score
ORF Length,1.0,0.228575,0.430422,0.415983,0.000703,0.452565
GC Content,0.228575,1.0,-0.06918,0.478063,0.120408,-0.008346
Transcript Length,0.430422,-0.06918,1.0,0.616171,-0.149289,0.996834
CpG Island,0.415983,0.478063,0.616171,1.0,-0.041362,0.663042
Ficket score,0.000703,0.120408,-0.149289,-0.041362,1.0,-0.141214
MFE Score,0.452565,-0.008346,0.996834,0.663042,-0.141214,1.0
