In [115]:
filename="pcrna.fa"
# filename="lncRNA.fa"

# 1. Fickett_Score

In [116]:
ficket_arr=[]
def calculate_fickett_score(seq):
    """
    Calculates the Fickett score for a given DNA sequence.
    """
    num_a = seq.count('A')
    num_c = seq.count('C')
    num_g = seq.count('G')
    num_t = seq.count('T')
    
    if num_a + num_c + num_g + num_t == 0:
        return 0
    
    freq_a = num_a / (num_a + num_c + num_g + num_t)
    freq_c = num_c / (num_a + num_c + num_g + num_t)
    freq_g = num_g / (num_a + num_c + num_g + num_t)
    freq_t = num_t / (num_a + num_c + num_g + num_t)
    
    r_y_ratio = (freq_a + freq_g) / (freq_c + freq_t)
    a_t_ratio = freq_a / freq_t
    gc_content = freq_g + freq_c
    fickett_score = (r_y_ratio * a_t_ratio) + gc_content - 0.5
    
    return fickett_score

def read_fasta_file(filename):
    """
    Reads a FASTA file and returns a list of (header, sequence) tuples.
    """
    sequences = []
    with open(filename, 'r') as f:
        header = ''
        sequence = ''
        for line in f:
            if line.startswith('>'):
                if header != '':
                    sequences.append((header, sequence))
                header = line.strip()[1:]
                sequence = ''
            else:
                sequence += line.strip().upper()
        if header != '':
            sequences.append((header, sequence))
    return sequences

sequences = read_fasta_file(filename)

for header, sequence in sequences:
    fickett_score = calculate_fickett_score(sequence)
#     print(f'{header}\tFickett score: {fickett_score:.4f}')
    print(f'{fickett_score:.4f}')
    ficket_arr.append(fickett_score)


0.8532
0.4056
0.4056
1.3766
1.3794
1.5629
1.4147
1.4418
1.3507
1.3286
1.4558
1.3747
1.3571
1.2466
1.3373
1.0550
1.1389
1.3875
1.2255
1.2258
0.9779
1.2586
1.2530
1.1979
1.4667
1.2504
1.1785
2.0380
1.7655
1.8933
1.0338
1.0381
1.0243
1.0199
1.2620
1.0502
1.1439
1.1888
1.1778
1.0540
0.8844
0.9062
1.1836
1.1704
1.1662
1.0367
1.0238
0.9691
1.4686
1.4478
1.3455
1.4178
1.0890
1.1528
1.0455
1.2561
1.0082
1.3122
1.3603
1.3832
1.4516
1.2233
1.2538
1.2711
1.2677
1.5475
1.4069
1.3709
1.4368
1.4129
1.3979
1.8620
0.9244
1.0207
0.9772
0.9517
0.9279
1.3271
1.3650
1.3041
1.2406
1.1764
1.1623
1.3381
1.0673
0.9720
1.0381
1.0502
1.3313
1.1288
1.1718
1.0902
1.0867
1.0902
1.3093
1.1425
1.0201
1.1467
1.4567
1.8570
1.1044
1.1763
1.1135
1.3332
1.3668
1.3964
1.3779
1.3175
1.1159
1.4227
1.4085
1.4186
1.3930
1.5804
1.8913
1.3232
1.4591
1.5355
1.1530
1.1862
1.2015
1.1220
1.2108
1.1377
0.9378
1.0651
0.5989
1.2404
1.2134
1.0339
1.3124
1.3326
1.4223
1.3306
0.9472
0.9182
0.9322
0.9150
1.5297
1.5766
1.6160
1.5423
1.1651

In [117]:
ficket_arr

[0.8531940607002142,
 0.40559335077900616,
 0.40559335077900616,
 1.3766312397724225,
 1.3793723118589545,
 1.5628703121726377,
 1.4147350591713244,
 1.4417953154872503,
 1.3507343396564233,
 1.3285655379322148,
 1.4557681106793148,
 1.3747426609925388,
 1.3571035729485734,
 1.246578249010614,
 1.3372791881143997,
 1.054982909033313,
 1.138866954500108,
 1.3874992017852592,
 1.225534625747068,
 1.2257769058839874,
 0.9778927455941706,
 1.2585980000005375,
 1.253034115893482,
 1.197948177465395,
 1.4666656752438898,
 1.2504119770463595,
 1.1784607696151925,
 2.037958171977043,
 1.7654829986257043,
 1.8933089626173922,
 1.0338355041854301,
 1.0380563345488534,
 1.0242686130512975,
 1.019898568349242,
 1.261955624355005,
 1.0501510871591209,
 1.1438921417347145,
 1.188783815674772,
 1.1778000580662817,
 1.0540137753701422,
 0.8843839957214017,
 0.906169394236771,
 1.1836433236274884,
 1.1704496829657294,
 1.1662231269212686,
 1.0366544372030195,
 1.0238025055268976,
 0.9690816997766174,
 

# 2. CpG Islands

In [118]:
cpg_arr=[]

from Bio import SeqIO

def calculate_cpg_islands(seq):
    cpg_islands = []
    cpg_count = 0
    in_island = False
    
    for i in range(len(seq)-1):
        if seq[i:i+2].upper() == 'CG':
            cpg_count += 1
            if not in_island:
                in_island = True
        else:
            if cpg_count > 0:
                cpg_islands.append(cpg_count)
                cpg_count = 0
                in_island = False
    
    if cpg_count > 0:
        cpg_islands.append(cpg_count)
    
    return cpg_islands

# Open the FASTA file and loop over each record
for record in SeqIO.parse(filename, "fasta"):
    seq_id = record.id
    seq = record.seq
    
    # Calculate the number of CpG islands for this sequence
    cpg_islands = calculate_cpg_islands(seq)
    num_islands = len(cpg_islands)
    
    # Print the results
#     print(f'{seq_id}\t{num_islands}')
    print(f'\t{num_islands}')
    cpg_arr.append(num_islands)


	18
	8
	8
	302
	302
	29
	162
	123
	131
	141
	120
	137
	136
	110
	124
	199
	70
	145
	117
	131
	30
	152
	128
	124
	135
	114
	83
	50
	41
	37
	535
	540
	545
	540
	94
	139
	117
	127
	128
	181
	123
	73
	61
	69
	63
	24
	20
	30
	133
	135
	34
	99
	15
	62
	76
	35
	58
	75
	60
	63
	67
	69
	65
	54
	55
	58
	51
	61
	148
	166
	167
	44
	256
	85
	151
	147
	156
	70
	79
	79
	68
	73
	91
	38
	56
	54
	54
	66
	42
	53
	154
	114
	165
	148
	30
	145
	51
	258
	158
	34
	89
	79
	118
	109
	91
	102
	90
	92
	115
	90
	82
	103
	107
	86
	43
	58
	38
	33
	39
	37
	40
	51
	29
	27
	29
	19
	21
	135
	29
	181
	207
	271
	63
	59
	122
	157
	148
	181
	92
	59
	55
	70
	146
	95
	139
	103
	57
	53
	40
	32
	18
	35
	131
	48
	57
	117
	31
	139
	30
	216
	283
	31
	164
	23
	224
	143
	174
	178
	140
	143
	54
	123
	65
	52
	92
	142
	197
	266
	290
	292
	281
	291
	45
	301
	87
	273
	89
	302
	52
	260
	251
	228
	37
	118
	57
	118
	138
	126
	81
	68
	66
	32
	160
	148
	142
	125
	112
	135
	139
	138
	114
	246
	228
	146
	127
	121
	114
	126
	125
	128
	118
	25
	3

In [119]:
cpg_arr

[18,
 8,
 8,
 302,
 302,
 29,
 162,
 123,
 131,
 141,
 120,
 137,
 136,
 110,
 124,
 199,
 70,
 145,
 117,
 131,
 30,
 152,
 128,
 124,
 135,
 114,
 83,
 50,
 41,
 37,
 535,
 540,
 545,
 540,
 94,
 139,
 117,
 127,
 128,
 181,
 123,
 73,
 61,
 69,
 63,
 24,
 20,
 30,
 133,
 135,
 34,
 99,
 15,
 62,
 76,
 35,
 58,
 75,
 60,
 63,
 67,
 69,
 65,
 54,
 55,
 58,
 51,
 61,
 148,
 166,
 167,
 44,
 256,
 85,
 151,
 147,
 156,
 70,
 79,
 79,
 68,
 73,
 91,
 38,
 56,
 54,
 54,
 66,
 42,
 53,
 154,
 114,
 165,
 148,
 30,
 145,
 51,
 258,
 158,
 34,
 89,
 79,
 118,
 109,
 91,
 102,
 90,
 92,
 115,
 90,
 82,
 103,
 107,
 86,
 43,
 58,
 38,
 33,
 39,
 37,
 40,
 51,
 29,
 27,
 29,
 19,
 21,
 135,
 29,
 181,
 207,
 271,
 63,
 59,
 122,
 157,
 148,
 181,
 92,
 59,
 55,
 70,
 146,
 95,
 139,
 103,
 57,
 53,
 40,
 32,
 18,
 35,
 131,
 48,
 57,
 117,
 31,
 139,
 30,
 216,
 283,
 31,
 164,
 23,
 224,
 143,
 174,
 178,
 140,
 143,
 54,
 123,
 65,
 52,
 92,
 142,
 197,
 266,
 290,
 292,
 281,
 291,
 45,
 301

# 3. GC_Content

In [120]:
gc_arr=[]

def gc_content(seq):
    gc_count = seq.count('G') + seq.count('C')
    total_count = len(seq)
    return (gc_count / total_count) * 100

def fasta_to_gc_content(fasta_file):
    with open(fasta_file) as f:
        seq = ''
        for line in f:
            if line.startswith('>'):
                if seq:
                    gc = gc_content(seq)
                    print(f'{gc:.2f}%\n')
                    gc_arr.append(gc)
                    seq = ''
                header = line.strip()
            else:
                seq += line.strip()
        if seq:
            gc = gc_content(seq)
#             print(f'Sequence: {header}\nGC content: {gc:.2f}%\n')
            print(f'{gc:.2f}%\n')
            gc_arr.append(gc)

fasta_to_gc_content(filename)


35.94%

46.01%

46.01%

69.93%

69.93%

64.34%

66.76%

68.47%

69.03%

69.06%

68.76%

69.08%

66.77%

69.79%

59.81%

64.98%

66.70%

64.58%

65.82%

66.15%

64.04%

65.57%

65.30%

66.29%

73.94%

73.79%

74.51%

63.41%

63.93%

64.21%

67.58%

67.53%

67.87%

67.95%

65.58%

65.66%

66.38%

66.42%

66.56%

66.37%

65.88%

63.63%

67.10%

66.03%

68.24%

66.43%

66.96%

63.56%

65.03%

64.98%

67.25%

62.72%

57.51%

68.63%

68.05%

65.61%

67.66%

69.70%

69.86%

69.30%

69.59%

68.98%

68.95%

69.17%

69.02%

68.60%

69.66%

69.46%

63.32%

63.73%

63.82%

60.16%

64.21%

69.21%

59.31%

59.51%

60.20%

59.51%

58.60%

60.55%

58.18%

60.31%

62.50%

59.75%

61.40%

61.92%

59.58%

66.41%

60.94%

63.07%

64.95%

64.59%

65.41%

66.50%

66.20%

66.18%

67.06%

65.59%

64.56%

61.86%

65.23%

68.07%

60.90%

60.89%

61.33%

60.69%

61.52%

61.72%

61.07%

61.18%

61.93%

61.34%

61.48%

60.30%

58.70%

61.08%

58.88%

60.28%

60.19%

60.00%

60.88%

58.51%

60.82%

62.84%

63.54%



In [121]:
gc_arr

[35.94346829640948,
 46.00638977635783,
 46.00638977635783,
 69.92784992784993,
 69.92502883506344,
 64.34108527131784,
 66.75791943684006,
 68.46689895470384,
 69.03225806451613,
 69.05807711078576,
 68.76122082585279,
 69.08358509566969,
 66.77316293929712,
 69.78625072212594,
 59.8113891911498,
 64.97857421114142,
 66.70317634173057,
 64.57808564231739,
 65.8158614402917,
 66.15071283095723,
 64.04011461318052,
 65.56818181818181,
 65.30477759472817,
 66.29258517034067,
 73.9423076923077,
 73.78531073446327,
 74.5127436281859,
 63.40533672172808,
 63.926940639269404,
 64.2072213500785,
 67.58121758121757,
 67.52772518258047,
 67.874816102715,
 67.95356371490281,
 65.57894736842104,
 65.66488266776452,
 66.37554585152839,
 66.4179104477612,
 66.55550262029539,
 66.3658991910392,
 65.87779690189329,
 63.62807657247037,
 67.10213776722091,
 66.02870813397129,
 68.24324324324324,
 66.4259927797834,
 66.95652173913044,
 63.55785837651122,
 65.03496503496503,
 64.97615951452103,
 67.25352

# 4. Hexamer Score

In [122]:
hexa_arr=[]

from collections import Counter

def hexamer_score(seq):
    hexamers = [seq[i:i+6] for i in range(len(seq)-5)]
    counts = Counter(hexamers)
    score = sum(counts.values())
    return score

def fasta_to_hexamer_score(fasta_file):
    with open(fasta_file) as f:
        seq = ''
        for line in f:
            if line.startswith('>'):
                if seq:
                    score = hexamer_score(seq)
                    print(f'{score}\n')
                    hexa_arr.append(score)
                    seq = ''
                header = line.strip()
            else:
                seq += line.strip()
        if seq:
            score = hexamer_score(seq)
#             print(f'Sequence: {header}\nHexamer score: {score}\n')
            print(f'{score}\n')
            hexa_arr.append(score)

fasta_to_hexamer_score(filename)


2613

934

934

3460

3463

382

2552

1717

1855

2044

1666

1981

2186

1726

2752

2562

908

3171

2189

2450

693

3515

3030

2490

1035

880

662

782

652

632

7321

7389

7472

7403

1895

2424

1827

2005

2094

3209

2319

1092

837

831

735

549

570

574

2283

2302

563

2109

421

846

1078

722

700

1147

1007

1070

1179

1062

1148

806

873

941

954

997

1928

2074

2090

723

2800

1031

2450

2255

2382

1052

1251

1042

1016

1023

1051

467

881

809

762

638

525

745

3045

2263

3470

2726

640

2674

918

3788

2275

577

1252

872

2624

2258

1857

2109

1827

1863

2566

1793

1613

1834

1999

1519

811

1051

654

574

844

745

795

1099

480

544

549

579

574

2149

778

3470

2921

3300

780

773

2226

2288

1996

2723

1067

793

748

870

3107

2054

2810

1593

1181

656

543

695

583

1044

2019

1627

1971

1501

871

3212

814

2142

4487

453

3859

520

4093

2109

2476

2607

2261

2325

825

1376

712

791

1279

4171

2119

3029

In [123]:
hexa_arr

[2613,
 934,
 934,
 3460,
 3463,
 382,
 2552,
 1717,
 1855,
 2044,
 1666,
 1981,
 2186,
 1726,
 2752,
 2562,
 908,
 3171,
 2189,
 2450,
 693,
 3515,
 3030,
 2490,
 1035,
 880,
 662,
 782,
 652,
 632,
 7321,
 7389,
 7472,
 7403,
 1895,
 2424,
 1827,
 2005,
 2094,
 3209,
 2319,
 1092,
 837,
 831,
 735,
 549,
 570,
 574,
 2283,
 2302,
 563,
 2109,
 421,
 846,
 1078,
 722,
 700,
 1147,
 1007,
 1070,
 1179,
 1062,
 1148,
 806,
 873,
 941,
 954,
 997,
 1928,
 2074,
 2090,
 723,
 2800,
 1031,
 2450,
 2255,
 2382,
 1052,
 1251,
 1042,
 1016,
 1023,
 1051,
 467,
 881,
 809,
 762,
 638,
 525,
 745,
 3045,
 2263,
 3470,
 2726,
 640,
 2674,
 918,
 3788,
 2275,
 577,
 1252,
 872,
 2624,
 2258,
 1857,
 2109,
 1827,
 1863,
 2566,
 1793,
 1613,
 1834,
 1999,
 1519,
 811,
 1051,
 654,
 574,
 844,
 745,
 795,
 1099,
 480,
 544,
 549,
 579,
 574,
 2149,
 778,
 3470,
 2921,
 3300,
 780,
 773,
 2226,
 2288,
 1996,
 2723,
 1067,
 793,
 748,
 870,
 3107,
 2054,
 2810,
 1593,
 1181,
 656,
 543,
 695,
 583,
 1

# 5. ORF_Length

In [124]:
orf_arr=[]

def read_fasta_file(filename):
    """
    Reads a FASTA file and returns a list of (header, sequence) tuples.
    """
    sequences = []
    with open(filename, 'r') as f:
        header = ''
        sequence = ''
        for line in f:
            if line.startswith('>'):
                if header != '':
                    sequences.append((header, sequence))
                header = line.strip()[1:]
                sequence = ''
            else:
                sequence += line.strip().upper()
        if header != '':
            sequences.append((header, sequence))
    return sequences

def find_longest_orf(sequence):
    """
    Finds the longest ORF in a given DNA sequence.
    """
    longest_orf_length = 0
    for frame in range(3):
        for pos in range(frame, len(sequence)-2, 3):
            codon = sequence[pos:pos+3]
            if codon == 'ATG':
                orf_length = 0
                for pos2 in range(pos, len(sequence)-2, 3):
                    codon2 = sequence[pos2:pos2+3]
                    if codon2 in ('TAA', 'TAG', 'TGA'):
                        break
                    else:
                        orf_length += 3
                if orf_length > longest_orf_length:
                    longest_orf_length = orf_length
    return longest_orf_length

sequences = read_fasta_file(filename)

for header, sequence in sequences:
    orf_length = find_longest_orf(sequence)
#     print(f'{header}\tORF length: {orf_length}')
    print(f'{orf_length}')
    orf_arr.append(orf_length)

978
936
936
2532
2535
327
2043
1719
1857
2046
1668
1983
1752
1587
2247
1926
384
1833
1728
1989
387
2370
2028
2370
741
663
567
471
429
495
6135
6204
5889
5820
747
1140
594
567
390
762
1032
594
669
309
354
222
210
147
2019
2019
435
1212
243
765
723
507
702
894
1005
831
1065
609
909
567
759
702
933
975
1086
1044
762
720
987
906
672
777
621
621
621
777
825
558
588
180
513
237
300
303
213
129
2406
1680
1914
1914
417
2112
840
2502
2277
402
909
375
1818
1713
1506
1800
1506
1026
1818
1497
1281
1734
1713
834
519
1032
327
453
714
627
747
867
420
174
501
174
330
663
351
2556
2010
2085
480
750
1023
1326
1350
1299
597
597
597
597
1560
894
894
894
678
540
261
447
117
465
1197
261
522
1197
522
489
501
699
1335
444
1233
393
1944
1626
1758
1902
1227
1521
726
519
519
336
582
459
678
2562
2997
3039
2844
3027
342
2448
252
2259
525
2907
444
2757
1185
2457
267
384
498
1170
714
1158
513
510
459
228
2346
2385
2244
1695
1578
2316
2349
2355
738
1215
1215
2340
2238
2238
1260
2337
2310
2349
1221
453
111
1338
1338

In [125]:
orf_arr

[978,
 936,
 936,
 2532,
 2535,
 327,
 2043,
 1719,
 1857,
 2046,
 1668,
 1983,
 1752,
 1587,
 2247,
 1926,
 384,
 1833,
 1728,
 1989,
 387,
 2370,
 2028,
 2370,
 741,
 663,
 567,
 471,
 429,
 495,
 6135,
 6204,
 5889,
 5820,
 747,
 1140,
 594,
 567,
 390,
 762,
 1032,
 594,
 669,
 309,
 354,
 222,
 210,
 147,
 2019,
 2019,
 435,
 1212,
 243,
 765,
 723,
 507,
 702,
 894,
 1005,
 831,
 1065,
 609,
 909,
 567,
 759,
 702,
 933,
 975,
 1086,
 1044,
 762,
 720,
 987,
 906,
 672,
 777,
 621,
 621,
 621,
 777,
 825,
 558,
 588,
 180,
 513,
 237,
 300,
 303,
 213,
 129,
 2406,
 1680,
 1914,
 1914,
 417,
 2112,
 840,
 2502,
 2277,
 402,
 909,
 375,
 1818,
 1713,
 1506,
 1800,
 1506,
 1026,
 1818,
 1497,
 1281,
 1734,
 1713,
 834,
 519,
 1032,
 327,
 453,
 714,
 627,
 747,
 867,
 420,
 174,
 501,
 174,
 330,
 663,
 351,
 2556,
 2010,
 2085,
 480,
 750,
 1023,
 1326,
 1350,
 1299,
 597,
 597,
 597,
 597,
 1560,
 894,
 894,
 894,
 678,
 540,
 261,
 447,
 117,
 465,
 1197,
 261,
 522,
 1197,
 522

# 6. Transcript_length

In [126]:
trans_arr=[]

def transcript_length(seq):
    return len(seq)

def fasta_to_transcript_length(fasta_file):
    global length
    with open(fasta_file) as f:
        seq = ''
        for line in f:
            if line.startswith('>'):
                if seq:
                    length = transcript_length(seq)
                    print(f'{length}\n')
                    trans_arr.append(length)
                    seq = ''
                header = line.strip()
            else:
                seq += line.strip()
        if seq:
            length = transcript_length(seq)
#             print(f'Sequence: {header}\nTranscript length: {length}\n')
            print(f'{length}\n')
            trans_arr.append(length)
        return length
fasta_to_transcript_length(filename)


2618

939

939

3465

3468

387

2557

1722

1860

2049

1671

1986

2191

1731

2757

2567

913

3176

2194

2455

698

3520

3035

2495

1040

885

667

787

657

637

7326

7394

7477

7408

1900

2429

1832

2010

2099

3214

2324

1097

842

836

740

554

575

579

2288

2307

568

2114

426

851

1083

727

705

1152

1012

1075

1184

1067

1153

811

878

946

959

1002

1933

2079

2095

728

2805

1036

2455

2260

2387

1057

1256

1047

1021

1028

1056

472

886

814

767

643

530

750

3050

2268

3475

2731

645

2679

923

3793

2280

582

1257

877

2629

2263

1862

2114

1832

1868

2571

1798

1618

1839

2004

1524

816

1056

659

579

849

750

800

1104

485

549

554

584

579

2154

783

3475

2926

3305

785

778

2231

2293

2001

2728

1072

798

753

875

3112

2059

2815

1598

1186

661

548

700

588

1049

2024

1632

1976

1506

876

3217

819

2147

4492

458

3864

525

4098

2114

2481

2612

2266

2330

830

1381

717

796

1284

4176

2124

303

1107

In [127]:
trans_arr

[2618,
 939,
 939,
 3465,
 3468,
 387,
 2557,
 1722,
 1860,
 2049,
 1671,
 1986,
 2191,
 1731,
 2757,
 2567,
 913,
 3176,
 2194,
 2455,
 698,
 3520,
 3035,
 2495,
 1040,
 885,
 667,
 787,
 657,
 637,
 7326,
 7394,
 7477,
 7408,
 1900,
 2429,
 1832,
 2010,
 2099,
 3214,
 2324,
 1097,
 842,
 836,
 740,
 554,
 575,
 579,
 2288,
 2307,
 568,
 2114,
 426,
 851,
 1083,
 727,
 705,
 1152,
 1012,
 1075,
 1184,
 1067,
 1153,
 811,
 878,
 946,
 959,
 1002,
 1933,
 2079,
 2095,
 728,
 2805,
 1036,
 2455,
 2260,
 2387,
 1057,
 1256,
 1047,
 1021,
 1028,
 1056,
 472,
 886,
 814,
 767,
 643,
 530,
 750,
 3050,
 2268,
 3475,
 2731,
 645,
 2679,
 923,
 3793,
 2280,
 582,
 1257,
 877,
 2629,
 2263,
 1862,
 2114,
 1832,
 1868,
 2571,
 1798,
 1618,
 1839,
 2004,
 1524,
 816,
 1056,
 659,
 579,
 849,
 750,
 800,
 1104,
 485,
 549,
 554,
 584,
 579,
 2154,
 783,
 3475,
 2926,
 3305,
 785,
 778,
 2231,
 2293,
 2001,
 2728,
 1072,
 798,
 753,
 875,
 3112,
 2059,
 2815,
 1598,
 1186,
 661,
 548,
 700,
 588,
 

# 7. MFE score

In [128]:
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp
from Bio import SeqIO

In [129]:
mfe_arr=[] 
for seq_record in SeqIO.parse(filename, "fasta"):
    rna_seq = Seq(str(seq_record.seq))

    mfe = MeltingTemp.Tm_Wallace(rna_seq.transcribe().reverse_complement_rna())
    mfe_arr.append(mfe)
    print(mfe)

7118.0
2742.0
2742.0
11776.0
11786.0
1272.0
8528.0
5802.0
6288.0
6928.0
5640.0
6716.0
7308.0
5878.0
8812.0
8470.0
3044.0
10454.0
7276.0
8158.0
2290.0
11656.0
10034.0
8298.0
3618.0
3076.0
2328.0
2572.0
2154.0
2092.0
24554.0
24774.0
25104.0
24884.0
6292.0
8048.0
6096.0
6690.0
6992.0
10694.0
7710.0
3590.0
2814.0
2776.0
2490.0
1844.0
1920.0
1894.0
7552.0
7612.0
1900.0
6880.0
1342.0
2870.0
3640.0
2408.0
2364.0
3910.0
3438.0
3640.0
4016.0
3606.0
3896.0
2744.0
2968.0
3190.0
3254.0
3396.0
6314.0
6808.0
6864.0
2332.0
9212.0
3506.0
7822.0
7210.0
7648.0
3372.0
3984.0
3362.0
3230.0
3296.0
3432.0
1508.0
2860.0
2636.0
2448.0
2140.0
1706.0
2446.0
10062.0
7466.0
11496.0
9094.0
2144.0
8904.0
3084.0
12562.0
7504.0
1884.0
4154.0
2948.0
8460.0
7282.0
6008.0
6794.0
5918.0
6042.0
8282.0
5796.0
5240.0
5934.0
6472.0
4886.0
2590.0
3402.0
2094.0
1856.0
2720.0
2400.0
2574.0
3500.0
1560.0
1788.0
1812.0
1884.0
1860.0
7200.0
2516.0
11532.0
9684.0
11070.0
2614.0
2560.0
7364.0
7616.0
6652.0
9050.0
3600.0
2656.0
2488.

In [130]:
mfe_arr

[7118.0,
 2742.0,
 2742.0,
 11776.0,
 11786.0,
 1272.0,
 8528.0,
 5802.0,
 6288.0,
 6928.0,
 5640.0,
 6716.0,
 7308.0,
 5878.0,
 8812.0,
 8470.0,
 3044.0,
 10454.0,
 7276.0,
 8158.0,
 2290.0,
 11656.0,
 10034.0,
 8298.0,
 3618.0,
 3076.0,
 2328.0,
 2572.0,
 2154.0,
 2092.0,
 24554.0,
 24774.0,
 25104.0,
 24884.0,
 6292.0,
 8048.0,
 6096.0,
 6690.0,
 6992.0,
 10694.0,
 7710.0,
 3590.0,
 2814.0,
 2776.0,
 2490.0,
 1844.0,
 1920.0,
 1894.0,
 7552.0,
 7612.0,
 1900.0,
 6880.0,
 1342.0,
 2870.0,
 3640.0,
 2408.0,
 2364.0,
 3910.0,
 3438.0,
 3640.0,
 4016.0,
 3606.0,
 3896.0,
 2744.0,
 2968.0,
 3190.0,
 3254.0,
 3396.0,
 6314.0,
 6808.0,
 6864.0,
 2332.0,
 9212.0,
 3506.0,
 7822.0,
 7210.0,
 7648.0,
 3372.0,
 3984.0,
 3362.0,
 3230.0,
 3296.0,
 3432.0,
 1508.0,
 2860.0,
 2636.0,
 2448.0,
 2140.0,
 1706.0,
 2446.0,
 10062.0,
 7466.0,
 11496.0,
 9094.0,
 2144.0,
 8904.0,
 3084.0,
 12562.0,
 7504.0,
 1884.0,
 4154.0,
 2948.0,
 8460.0,
 7282.0,
 6008.0,
 6794.0,
 5918.0,
 6042.0,
 8282.0,
 5796.

# 8. Protine Coding Potential

# 9. Conservation Score

# Data set and Correlation

In [131]:
len(gc_arr)

309

In [267]:
import numpy as np
import pandas as pd

# coding_is_one=[1]*len(gc_arr)
# dataset = [orf_arr, gc_arr, trans_arr, cpg_arr, ficket_arr,coding_is_one]
# Excluded =[mfe_arr,hexa_arr]


# numpy_coding= np.array(dataset).T

# df_coding = pd.DataFrame(data=numpy_coding, columns=["ORF Length", "GC Content", "Transcript Length",
#                                             "CpG Island", "Ficket score","coding=1"])

# df_coding


# ----------------------------------------------------------------------------------------------------------

noncoding_is_zero=[0]*len(gc_arr)
dataset = [orf_arr, gc_arr, trans_arr, cpg_arr, ficket_arr,noncoding_is_zero]
Excluded =[mfe_arr,hexa_arr]

numpy_ncoding= np.array(dataset).T

df_ncoding = pd.DataFrame(data=numpy_ncoding, columns=["ORF Length", "GC Content", "Transcript Length",
                                            "CpG Island", "Ficket score","coding=1"])

df_ncoding

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
0,978.0,35.943468,2618.0,18.0,0.853194,0.0
1,936.0,46.006390,939.0,8.0,0.405593,0.0
2,936.0,46.006390,939.0,8.0,0.405593,0.0
3,2532.0,69.927850,3465.0,302.0,1.376631,0.0
4,2535.0,69.925029,3468.0,302.0,1.379372,0.0
...,...,...,...,...,...,...
304,1050.0,61.266667,1500.0,94.0,1.591854,0.0
305,531.0,59.844668,4764.0,169.0,1.151051,0.0
306,600.0,62.924757,1648.0,96.0,1.276900,0.0
307,207.0,52.955665,406.0,20.0,1.248779,0.0


In [268]:
df_ncoding.corr()

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
ORF Length,1.0,0.121184,0.790649,0.838709,0.176038,
GC Content,0.121184,1.0,-0.125599,0.244376,-0.054142,
Transcript Length,0.790649,-0.125599,1.0,0.864689,-0.045105,
CpG Island,0.838709,0.244376,0.864689,1.0,0.011126,
Ficket score,0.176038,-0.054142,-0.045105,0.011126,1.0,
coding=1,,,,,,


In [269]:
df_coding.corr()

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
ORF Length,1.0,0.228575,0.430422,0.415983,0.000703,
GC Content,0.228575,1.0,-0.06918,0.478063,0.120408,
Transcript Length,0.430422,-0.06918,1.0,0.616171,-0.149289,
CpG Island,0.415983,0.478063,0.616171,1.0,-0.041362,
Ficket score,0.000703,0.120408,-0.149289,-0.041362,1.0,
coding=1,,,,,,


In [270]:
df_coding.head()

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
0,228.0,55.340978,1657.0,23.0,0.650468,1.0
1,225.0,58.567416,712.0,32.0,0.785507,1.0
2,108.0,46.728972,535.0,11.0,0.645765,1.0
3,255.0,53.327717,1187.0,20.0,0.838515,1.0
4,222.0,53.559322,590.0,9.0,0.953618,1.0


In [271]:
df_ncoding.head()

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
0,978.0,35.943468,2618.0,18.0,0.853194,0.0
1,936.0,46.00639,939.0,8.0,0.405593,0.0
2,936.0,46.00639,939.0,8.0,0.405593,0.0
3,2532.0,69.92785,3465.0,302.0,1.376631,0.0
4,2535.0,69.925029,3468.0,302.0,1.379372,0.0


In [272]:
numpy_ncoding

array([[9.78000000e+02, 3.59434683e+01, 2.61800000e+03, 1.80000000e+01,
        8.53194061e-01, 0.00000000e+00],
       [9.36000000e+02, 4.60063898e+01, 9.39000000e+02, 8.00000000e+00,
        4.05593351e-01, 0.00000000e+00],
       [9.36000000e+02, 4.60063898e+01, 9.39000000e+02, 8.00000000e+00,
        4.05593351e-01, 0.00000000e+00],
       ...,
       [6.00000000e+02, 6.29247573e+01, 1.64800000e+03, 9.60000000e+01,
        1.27690004e+00, 0.00000000e+00],
       [2.07000000e+02, 5.29556650e+01, 4.06000000e+02, 2.00000000e+01,
        1.24877862e+00, 0.00000000e+00],
       [3.60000000e+02, 5.63685637e+01, 1.10700000e+03, 6.90000000e+01,
        1.09216924e+00, 0.00000000e+00]])

In [273]:
import random


dataset_full=numpy_ncoding+numpy_coding[5:314,:]
random.shuffle(dataset_full)
dataset_full

array([[1.84800000e+03, 7.41330015e+01, 4.03200000e+03, 3.20000000e+01,
        1.38170324e+00, 1.00000000e+00],
       [1.17300000e+03, 9.20034786e+01, 3.68700000e+03, 3.80000000e+01,
        1.87417012e+00, 1.00000000e+00],
       [1.17300000e+03, 9.20034786e+01, 3.68700000e+03, 3.80000000e+01,
        1.87417012e+00, 1.00000000e+00],
       ...,
       [1.01100000e+03, 1.05840102e+02, 1.56700000e+03, 5.60000000e+01,
        3.65237039e+00, 1.00000000e+00],
       [1.98300000e+03, 1.27495841e+02, 4.42700000e+03, 3.45000000e+02,
        2.20684695e+00, 1.00000000e+00],
       [6.93000000e+02, 1.19067745e+02, 1.94600000e+03, 8.00000000e+01,
        2.53156661e+00, 1.00000000e+00]])

In [279]:
dataset_full.shape

(309, 6)

In [274]:
d= dataset_full

d_pd = pd.DataFrame(data=d, columns=["ORF Length", "GC Content", "Transcript Length",
                                            "CpG Island", "Ficket score","coding=1"])

d_pd

Unnamed: 0,ORF Length,GC Content,Transcript Length,CpG Island,Ficket score,coding=1
0,1848.0,74.133002,4032.0,32.0,1.381703,1.0
1,1173.0,92.003479,3687.0,38.0,1.874170,1.0
2,1173.0,92.003479,3687.0,38.0,1.874170,1.0
3,1191.0,82.869934,1430.0,11.0,2.666940,1.0
4,2748.0,119.440935,4191.0,314.0,2.842808,1.0
...,...,...,...,...,...,...
304,1338.0,107.673984,4016.0,129.0,3.445098,1.0
305,534.0,102.883467,1557.0,47.0,2.234123,1.0
306,1011.0,105.840102,1567.0,56.0,3.652370,1.0
307,1983.0,127.495841,4427.0,345.0,2.206847,1.0


In [277]:
d_pd['coding=1'].unique

<bound method Series.unique of 0       74.133002
1       92.003479
2       92.003479
3       82.869934
4      119.440935
          ...    
304    107.673984
305    102.883467
306    105.840102
307    127.495841
308    119.067745
Name: GC Content, Length: 309, dtype: float64>

In [239]:
import random 

a=[[1,3,4],[2,6,7],[3,8,9]]
b=[4,5,6]
c=a+b #[1, 2, 3, 4, 5, 6]
random.shuffle(c)

print(c)

[[3, 8, 9], 4, 6, [1, 3, 4], 5, [2, 6, 7]]


In [167]:
numpy_ncoding.shape

(309, 6)

In [170]:
numpy_coding[5:314,:].shape

(309, 6)

In [None]:
df_ncoding.corr()

In [280]:
arr1 = np.array([[1,2,3],[4,5,6]])
arr2 = np.array([[7,8,9],[10,11,12]])

res = np.concatenate((arr1,arr2))
print(res)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [284]:
# Create an array
x = np.arange(10)
print(x)
print()
# Shuffle the array
np.random.shuffle(x)

# Print the shuffled array
print(x)

[0 1 2 3 4 5 6 7 8 9]

[2 9 6 3 8 0 1 4 5 7]


# Logistic Regression model

In [32]:
# Simple code to learn multivariable regression in ml

import numpy as np
from sklearn.linear_model import LinearRegression

# Create some sample data
X = np.array([[1,2], [2,4], [3,6], [4,8]])
y = np.array([1,2,3,4])

# Create and fit the model
model = LinearRegression().fit(X, y)

# Make predictions
predictions = model.predict(X)

# Print the predictions
print(predictions)

[1. 2. 3. 4.]
