##### 📄 License
- This notebook is licensed under the [MIT License](https://opensource.org/licenses/MIT).  
- © 2025 Surbhi-CodeLab. For more, see the [LICENSE](https://github.com/Surbhi-CodeLab/DNA-G-C-Content-Analyzer/blob/main/LICENSE) file.
- Please cite or credit if using or modifying.

<h1><center><b> DNA G-C Content Analysis </b></center></h1>

- GC Content refers to the percentage of nitrogenous bases in a DNA or RNA sequence that are either Guanine (G) or Cytosine (C).

- The GC content is calculated using the formula:

      [ (G+C)/(A+T+G+C) * 100 ]% #for DNA

- It is an important parameter that reflects the stability and structural properties of the nucleic acid, as G-C pairs form three hydrogen bonds (compared to two in A-T pairs), making the DNA more thermally stable.

- It also plays a role in gene expression, genome structure, and species evolution.



---



## Project Overview:
In this project, we analyze the GC content of DNA sequences.
- The program accepts input in two formats:

  1. Single Raw DNA sequence entered as text.
  2. FASTA file containing one or more DNA sequences.

- It breaks down key statistics like GC content and shows the results in easy-to-understand tables and graphs, making the data much easier to interpret and visualize.

- It also filters out any unknown nucleotides represented by 'N' from the DNA sequence before analysis.




---






### Important Note:
- For optimal results, it's recommended to use the "Run All" option to execute the entire notebook. This ensures that all interdependent cells are executed in the correct order, and the final output is displayed seamlessly at the end.

---


1. Starting Cell -  Install and import required packages


In [1]:
# Always Run this cell first to install required packages!
!pip install biopython

from Bio import SeqIO
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

#for UI
!pip install ipywidgets
import ipywidgets as widgets
from IPython.display import display, clear_output

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


2. Functions for Calculation of G-C content.

In [2]:
#For calculating GC content
def GC_content(sequence):
  """Return the GC content of a DNA sequence as a percentage.

  The sequence is upper-cased and every unknown base ('N') is removed
  before counting, so 'N' contributes neither to the G+C count nor to
  the denominator.

  Args:
    sequence: DNA sequence as a string (any letter case).

  Returns:
    GC percentage rounded to 2 decimal places. Returns 0.0 when nothing
    is left after removing 'N' (empty or all-'N' input) instead of
    raising ZeroDivisionError.
  """
  sequence = sequence.upper()
  filter_sequence = sequence.replace("N","")

  # Guard against division by zero for empty / all-'N' sequences.
  if not filter_sequence:
    return 0.0

  gc_count = filter_sequence.count('G') + filter_sequence.count('C')
  return round((gc_count / len(filter_sequence)) * 100, 2)


3. Function for Estimation of Melting Temperature.

- Note: The melting temperature is calculated using the Wallace rule, Tm = 2·(A+T) + 4·(G+C) °C, which is only reliable for short sequences of about 20 nucleotides or fewer.

- If a sequence contains more than 20 A/T/G/C bases, the program will display 'Not Valid' as the melting temperature.

In [3]:
#Wallace-rule estimate of the melting temperature
def Melting_temp(sequence):
  """Estimate the melting temperature (°C) with the Wallace rule.

  Tm = 2 * (A + T) + 4 * (G + C), where only A, T, G and C are counted
  (case-insensitive).

  Args:
    sequence: DNA sequence as a string.

  Returns:
    The integer Tm when the sequence holds at most 20 A/T/G/C bases,
    otherwise the string "Not Valid".
  """
  bases = sequence.upper()

  weak_pairs = bases.count('A') + bases.count('T')    # A-T pairs
  strong_pairs = bases.count('G') + bases.count('C')  # G-C pairs

  # The rule is only applied to short sequences.
  if weak_pairs + strong_pairs > 20:
    return "Not Valid"

  return 2 * weak_pairs + 4 * strong_pairs


4. Function for Analyzing DNA Sequence Input - In form of Text

In [4]:
#For Raw DNA Analyses - text input
def Raw_DNA_cmd(sequence):
    """Analyse a single raw DNA sequence entered as text.

    All whitespace (spaces, tabs and line breaks, e.g. from a sequence
    pasted over several lines) is removed and the sequence is upper-cased
    before any statistic is computed, so whitespace is never counted in
    the length or in the GC-content denominator.

    Args:
        sequence: The raw DNA sequence as typed or pasted by the user.

    Returns:
        A one-element list holding a dict with the sequence ID
        ('User_Input'), total length, count of unknown bases ('N'),
        GC content, melting temperature and the A/T/G/C counts.
    """
    # str.split() with no argument splits on any whitespace run, so
    # joining the pieces strips spaces, tabs and newlines alike.
    Correct_Error = "".join(sequence.split()).upper()
    return [{
        'ID': 'User_Input',
        'Total Length': len(Correct_Error),
        'Unknown Nucleotide (N)':Correct_Error.count("N") ,
        'GC Content (%)': GC_content(Correct_Error),
        'Melting Temp(°C)': Melting_temp(Correct_Error),
        'A': Correct_Error.count("A"),
        'T': Correct_Error.count("T"),
        'G': Correct_Error.count("G"),
        'C': Correct_Error.count("C")
    }]


5. Function for Analyzing DNA Sequence Input - In form of FASTA File

In [5]:
# For FASTA file input - the FASTA file may contain a single sequence or multiple sequences.
def FASTA_File_cmd(file_path):
    """Analyse every record of a FASTA file.

    Args:
        file_path: Path of the FASTA file handed to SeqIO.parse.

    Returns:
        A list with one dict per record: record ID, total length, count
        of unknown bases ('N'), GC content, melting temperature and the
        A/T/G/C counts. The list is empty when no record is parsed.
    """
    def summarize(record):
        # Normalise the record's sequence once, then derive every statistic from it.
        bases = str(record.seq).strip().upper()
        return {
            'ID': record.id,
            'Total Length': len(bases),
            'Unknown Nucleotide(N)': bases.count("N"),  # number of unknown bases
            'GC Content (%)': GC_content(bases),
            'Melting Temp (°C)': Melting_temp(bases),
            'A': bases.count("A"),
            'T': bases.count("T"),
            'G': bases.count("G"),
            'C': bases.count("C")
        }

    return [summarize(entry) for entry in SeqIO.parse(file_path, "fasta")]

6. Function for Graphical Representation of Results.

In [6]:
#graphical representation
def plot_distribution(results):
    """Plot nucleotide counts (and GC content) for analysed sequences.

    For a single sequence, one bar chart of its A/T/G/C counts is drawn.
    For several sequences, a grouped bar chart compares the A/T/G/C
    counts across sequences, followed by a bar chart of the GC content
    of each sequence.

    Args:
        results: List of per-sequence dicts containing at least the keys
            'ID', 'A', 'T', 'G', 'C' and 'GC Content (%)'.

    Returns:
        None. Figures are shown with plt.show(); when `results` is empty
        a message is printed and nothing is plotted.
    """
    df = pd.DataFrame(results)

    # Nothing to draw: an empty frame has none of the expected columns.
    if df.empty:
        print("No sequences to plot.")
        return

    if len(df) == 1:
        #For single sequence
        counts = df[['A', 'T', 'G', 'C']].iloc[0]
        print("  \nGraphical representation of individual nucleotide frequencies in the analyzed DNA sequence: \n  ")
        plt.figure(figsize=(6, 5))
        bars = plt.bar(counts.index, counts.values, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
        plt.title(f"Nucleotide Count for {df['ID'].iloc[0]}")
        plt.xlabel("Nucleotides")
        plt.ylabel("Count")
        plt.grid(axis='y', linestyle='--', alpha=0.5)
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + 0.1, yval + 1, int(yval))
        plt.tight_layout()
        plt.show()

    else:
        #For Multiple sequences , Incase of FASTA File
        #A T G C  Composition Across Sequences
        bases = ['A', 'T', 'G', 'C']
        x = range(len(bases))
        n_sequences = len(df)
        # Keep the original 0.15 bar width for small inputs, but shrink it
        # so one group never spans more than 0.8 of the tick spacing and
        # therefore never overlaps the neighbouring nucleotide's group.
        width = min(0.15, 0.8 / n_sequences)

        print("  \nInter-sequence comparison of nucleotide frequencies presented graphically: \n  ")
        plt.figure(figsize=(10, 6))
        # Use the row position (not the index label or a global) so the
        # function depends only on its own argument.
        for position, (_, row) in enumerate(df.iterrows()):
            # Centre the group of bars on each nucleotide tick.
            offset = (position - (n_sequences - 1) / 2) * width
            plt.bar([i + offset for i in x],
                    row[bases],
                    width=width,
                    label=row['ID'])

        plt.xticks(x, bases)
        plt.xlabel("Nucleotides")
        plt.ylabel("Count")
        plt.title("Bar Plot Representing Nucleotide Counts for Each Sequence")
        plt.legend(title="Sequence ID")
        plt.grid(axis='y', linestyle='--', alpha=0.5)
        plt.tight_layout()
        plt.show()

        print(" \nBar graph representing the GC content of each DNA sequence included in the analysis:  \n  ")
        #GC Content Plot
        plt.figure(figsize=(10, 5))
        bars = plt.bar(df['ID'], df['GC Content (%)'], color='mediumseagreen')
        for bar in bars:
            height = bar.get_height()
            # Label each bar with its GC percentage just above the bar top.
            plt.text(bar.get_x() + bar.get_width()/2, height + 1, f"{height:.2f}%",
                     ha='center', va='bottom', fontsize=9)
        plt.title("GC Content (%) of Sequences")
        plt.xlabel("Sequence ID")
        plt.ylabel("GC Content (%)")
        plt.ylim(0, 100)
        plt.grid(axis='y', linestyle='--', alpha=0.4)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()


7. Function for Exporting the Results in form of CSV File.

In [7]:
# Export to CSV
def export_results(results):
    """Write the analysis results to a CSV file and start its download.

    Args:
        results: List of per-sequence result dicts; each dict becomes
            one row of the CSV.
    """
    csv_name = "G-C Content Analysis.csv"
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(results).to_csv(csv_name, index=False)
    files.download(csv_name)

8. Function For GUI Using ipywidgets.

In [8]:
# Global state shared by the widget callbacks defined below.
selected_input_type = None  # 'fasta' or 'raw' once an input button has been clicked
uploaded_file = None        # name of the uploaded FASTA file, if any
raw_sequence = None         # last raw sequence text read from the Textarea
analysis_result = []        # per-sequence result dicts from the latest analysis

# Buttons: two to choose the input type, one to run the analysis, one to export.
fasta_btn = widgets.Button(description="FASTA Sequence", button_style='info')
raw_btn = widgets.Button(description="Raw DNA Sequence", button_style='info')
analyze_button = widgets.Button(description="Analyze", button_style='success')
download_button = widgets.Button(description="Download CSV", button_style='warning')

# Input widget for the raw-sequence mode.
dna_input_widget = widgets.Textarea(
    placeholder='Enter raw DNA sequence here...only one DNA sequence',
    description='DNA Seq:',
    layout=widgets.Layout(width='100%', height='100px')
)
# Output areas: one hosts the upload prompt / Textarea, the other the results.
input_area = widgets.Output()
analysis_output = widgets.Output()

#Define Functions

def show_input_widget(input_type):
    """Show the input control that matches the chosen input type.

    Records the choice in the global `selected_input_type`, clears the
    input area and resets any previously uploaded file. For 'fasta' an
    upload prompt is opened and the first uploaded file name is stored
    in the global `uploaded_file`; for 'raw' the DNA Textarea is shown.

    Args:
        input_type: Either 'fasta' or 'raw'.
    """
    global selected_input_type, uploaded_file
    selected_input_type = input_type
    input_area.clear_output()
    uploaded_file = None  # Reset

    with input_area:
        if input_type == 'fasta':
            upload = files.upload()
            # The upload mapping is empty when the dialog is cancelled;
            # indexing it blindly would raise IndexError.
            if not upload:
                print("No file uploaded.")
                return
            uploaded_file = next(iter(upload))
            print(f"File uploaded: {uploaded_file}")
        elif input_type == 'raw':
            display(dna_input_widget)

def analyze_sequences(b):
    """Run the analysis for the selected input and show table and plots.

    Click handler of the "Analyze" button. Stores the results in the
    global `analysis_result` so they can be exported afterwards. Prints
    a message and stops when no input type was chosen, no input was
    provided, or the input contained no sequences.

    Args:
        b: The clicked button (unused; required by the on_click API).
    """
    global analysis_result, uploaded_file, raw_sequence

    analysis_output.clear_output()

    with analysis_output:
        if selected_input_type == 'fasta':
            if uploaded_file:
                analysis_result = FASTA_File_cmd(uploaded_file)
            else:
                print("No file uploaded.")
                return
        elif selected_input_type == 'raw':
            raw_sequence = dna_input_widget.value.strip()
            if raw_sequence:
                analysis_result = Raw_DNA_cmd(raw_sequence)
            else:
                print("No DNA sequence entered.")
                return
        else:
            print("Please select an input type.")
            return

        # A file without any FASTA record yields an empty list; there is
        # nothing to tabulate or plot in that case.
        if not analysis_result:
            print("No sequences found in the input.")
            return

        df = pd.DataFrame(analysis_result)
        display(df)
        plot_distribution(analysis_result)

def download_csv(b):
    """Export the latest analysis results as CSV, if there are any.

    Click handler of the "Download CSV" button; does nothing while
    `analysis_result` is empty.

    Args:
        b: The clicked button (unused; required by the on_click API).
    """
    if not analysis_result:
        return
    export_results(analysis_result)

# Wire each button to its callback. The lambdas drop the button argument
# and pass the chosen input type to show_input_widget instead.
fasta_btn.on_click(lambda b: show_input_widget('fasta'))
raw_btn.on_click(lambda b: show_input_widget('raw'))
analyze_button.on_click(analyze_sequences)
download_button.on_click(download_csv)

# Render the interface top to bottom: title, input-type buttons, input area,
# action buttons, then the results area.
print("\033[1;34mDNA G-C Content Analysis\033[0m")  # ANSI escape: bold blue title
display(widgets.HBox([fasta_btn, raw_btn]))
display(input_area)
display(analyze_button)
display(download_button)
display(analysis_output)

[1;34mDNA G-C Content Analysis[0m


HBox(children=(Button(button_style='info', description='FASTA Sequence', style=ButtonStyle()), Button(button_s…

Output()

Button(button_style='success', description='Analyze', style=ButtonStyle())



Output()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>