In [1]:
import pandas as pd
import numpy as np

In [4]:
# Step 2: Load the dataset.
# The file encoding is passed explicitly (ISO-8859-1) rather than left to the
# pandas default.
data = pd.read_csv(
    'biology_summarization_example.csv',
    encoding='ISO-8859-1',
)

In [6]:
# Step 3: Inspect the dataset
# Plain-text preview of the leading rows (5 is head()'s default row count).
print(data.head(n=5))


                                           Long Text  \
0  \nIssues pertaining to Biology. Understanding ...   
1  \nIn accordance with different criteria we can...   
2  Physical and chemical properties of water impo...   
3  Carbohydrates\nMost abundant group of organic ...   
4  Lipids\nDiverse group of hydrophobic molecule...   

                                             Summary  \
0  Biological diversity refers to the variety of ...   
1  Living organisms show vast diversity in size, ...   
2  Water is an essential molecule for life due to...   
3  Carbohydrates are the most abundant organic co...   
4  Lipids are a diverse group of hydrophobic mole...   

                                            Keywords Unnamed: 3  
0  Biological diversity,  Species, Genes, Ecosyst...        NaN  
1  Diversity, Size, Shape, Form, Habitat, Metabol...        NaN  
2  Water molecule, Polarity, Hydrogen bonds, Cohe...        NaN  
3  Carbohydrates, Monosaccharides, Disaccharides,...        Na

In [None]:
# Check data info: index range, per-column non-null counts, dtypes and memory use.
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Long Text   499 non-null    object
 1   Summary     499 non-null    object
 2   Keywords    498 non-null    object
 3   Unnamed: 3  1 non-null      object
dtypes: object(4)
memory usage: 15.7+ KB
None


In [None]:
# Descriptive statistics per column (count / unique / top / freq for these
# object-typed columns), printed as plain text.
summary_table = data.describe()
print(summary_table)

                                                Long Text  \
count                                                 499   
unique                                                497   
top     The Interior Uses of Plants\nBACKGROUND AND ST...   
freq                                                    2   

                                                  Summary  \
count                                                 499   
unique                                                497   
top     Soil nutrients are critical for plant growth, ...   
freq                                                    2   

                                                 Keywords  \
count                                                 498   
unique                                                497   
top     soil nutrients, macronutrients, micronutrients...   
freq                                                    2   

                                               Unnamed: 3  
count                 

In [None]:
# Print the column labels of the loaded frame
column_labels = data.columns
print(column_labels)

Index(['Long Text', 'Summary', 'Keywords', 'Unnamed: 3'], dtype='object')


In [None]:
# Drop the unnamed, unnecessary column: 'Unnamed: 3' holds a value in only 1 of
# the 499 rows (see the info() report earlier in the notebook).
# errors='ignore' keeps this cell safe to re-run: the result is rebound to `data`,
# so on a second run the column is already gone and a plain drop() would raise
# KeyError.
data = data.drop(columns=['Unnamed: 3'], errors='ignore')

In [None]:
# Print the column labels again, now that 'Unnamed: 3' has been dropped
column_labels = data.columns
print(column_labels)

Index(['Long Text', 'Summary', 'Keywords'], dtype='object')


In [None]:
# Count the missing (null/NaN) entries in every column
data.isna().sum()

Long Text    0
Summary      0
Keywords     1
dtype: int64

In [None]:
#Removing unwanted characters, space, special characters
import re

# Function to standardize text: convert to lowercase, remove irrelevant symbols
def standardize_text(text):
    """Normalize one text cell: lowercase, ASCII letters/digits only, single spaces.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not an ASCII letter, an ASCII digit or
         whitespace;
      3. collapse each run of whitespace (including newlines) into one space and
         strip leading/trailing whitespace.

    Note that punctuation is deleted, not replaced by a space, so tokens joined
    only by punctuation are merged ("h2o-based" becomes "h2obased"). Non-ASCII
    letters are removed as well.

    Parameters
    ----------
    text : str
        The raw text. Non-string values (for example None, or the NaN pandas
        uses for an empty CSV cell) are accepted and returned unchanged.

    Returns
    -------
    str
        The normalized text, or the original object if `text` is not a string.
    """
    # Missing cells have no .lower(); pass them through untouched so that
    # Series.apply() does not crash and a later isnull() check still sees them.
    if not isinstance(text, str):
        return text

    text = text.lower()
    # One filter covers every special character, including the '¦' bullet that
    # was previously stripped in a separate pass. Uppercase A-Z cannot occur
    # after lower(), so only a-z needs to be whitelisted.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace runs and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Standardize both text columns in place, one column at a time
for text_column in ('Long Text', 'Summary'):
    data[text_column] = data[text_column].apply(standardize_text)

# Show the first rows of the two columns to verify the result
data[['Long Text', 'Summary']].head()

Unnamed: 0,Long Text,Summary
0,issues pertaining to biology understanding bio...,biological diversity refers to the variety of ...
1,in accordance with different criteria we can s...,living organisms show vast diversity in size s...
2,physical and chemical properties of water impo...,water is an essential molecule for life due to...
3,carbohydrates most abundant group of organic c...,carbohydrates are the most abundant organic co...
4,lipids diverse group of hydrophobic molecules ...,lipids are a diverse group of hydrophobic mole...


In [18]:
# De-duplicate on the two main content columns only ('Long Text' and 'Summary');
# the other columns play no part in deciding whether two rows are the same.
data_cleaned = data.drop_duplicates(subset=['Long Text', 'Summary'])

# Row counts before and after the de-duplication, shown as the cell result
original_count, cleaned_count = len(data), len(data_cleaned)

(original_count, cleaned_count)


(499, 499)

In [19]:
# Count missing entries in each essential column ('Long Text', then 'Summary')
missing_long_text, missing_summary = (
    data[column].isna().sum() for column in ('Long Text', 'Summary')
)

# Cell result: (missing in 'Long Text', missing in 'Summary')
(missing_long_text, missing_summary)


(0, 0)

In [None]:
# Save the de-duplicated dataset (data_cleaned) to a new CSV file named 'bio_summary.csv'.
# NOTE(review): the destination is an absolute, machine-specific path; the write
# presumably fails on a machine without this directory — confirm before sharing.
output_path = 'D:/Downloads/RP/Summarization/Summary_Description/bio_summary.csv'
# index=False leaves the DataFrame index out of the file.
data_cleaned.to_csv(output_path, index=False)

# Last expression of the cell: displays the path that was written.
output_path


'D:/Downloads/RP/Summarization/Summary_Description/bio_summary.csv'

In [None]:
# Keep just the two text columns and give them lowercase headers.
# The mapping's keys double as the column selection, in the same order.
header_map = {'Long Text': 'long text', 'Summary': 'summary'}
data_final = data_cleaned[list(header_map)].rename(columns=header_map)

# Write the final two-column dataset to its own CSV file, without the index
output_path_final = 'D:/Downloads/RP/Summarization/Summary_Description/bio_summary_final.csv'
data_final.to_csv(output_path_final, index=False)

output_path_final


'D:/Downloads/RP/Summarization/Summary_Description/bio_summary_final.csv'