In [6]:
pip install pyspellchecker

Collecting pyspellchecker
  Downloading pyspellchecker-0.7.2-py3-none-any.whl (3.4 MB)
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2
Note: you may need to restart the kernel to use updated packages.


###### Counting the total number of errors, including typos, undesirable QnAs and truncated QnAs

In [18]:
import pandas as pd
import os
import csv
import re
import language_tool_python
import time
import cProfile

def count_multiple_commas(text):
    """Return how many runs of two or more consecutive commas occur in ``text``."""
    return sum(1 for _cluster in re.finditer(r',{2,}', text))

def replace_multiple_commas(text):
    """Collapse every run of two or more commas in ``text`` into ``', '``.

    When a replacement actually changes the text, both the original and the
    corrected versions are printed. The corrected text is returned either way.
    """
    corrected_text = re.sub(r',{2,}', ', ', text)

    # Nothing to report when the text came through untouched.
    if corrected_text == text:
        return corrected_text

    print(f"\nOriginal text: {text}")
    print(f"Corrected text: {corrected_text}")
    return corrected_text

def comma_cluster_removal_df(df):
    """Collapse comma clusters in every column and report how many were found.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are cleaned with ``replace_multiple_commas``.

    Returns
    -------
    tuple
        ``(cleaned_df, total_comma_count)`` where ``cleaned_df`` is a cleaned
        copy of ``df`` and ``total_comma_count`` is the number of comma
        clusters (runs of two or more commas) present before cleaning.
    """
    # Work on a copy: the caller may pass a slice of another frame, and writing
    # into a slice triggers pandas' SettingWithCopyWarning.
    df = df.copy()
    total_comma_count = 0

    for column in df.columns:
        # Count BEFORE replacing; once the clusters are collapsed there is
        # nothing left to count and the total would always be 0.
        total_comma_count += df[column].apply(
            lambda cell: count_multiple_commas(cell) if isinstance(cell, str) else 0
        ).sum()
        # Non-string cells (e.g. NaN) are passed through untouched, because the
        # regex helpers only accept text.
        df[column] = df[column].apply(
            lambda cell: replace_multiple_commas(cell) if isinstance(cell, str) else cell
        )

    print(f"Total comma count: {total_comma_count}")
    return df, total_comma_count


def is_undesirable_question_to_count(question):
    """Tell whether ``question`` contains one of the listed generic phrases.

    The match is a case-insensitive substring test. Anything that is not a
    ``str`` yields ``False``.
    """
    if not isinstance(question, str):
        return False

    count_phrases = (
        "what is the title",
        "what is the research topic",
        "what data is used in the study",
        "what data sets are collected in this study",
        "how did the study",
        "what does the arrow in figure 6 represent",
        "what was the publication date of the study",
        "what is the title of the paper",
        "what is the purpose of this study",
        "what was the goal of the study",
        "what data was utilized in the study",
        "what is the source of funding for this study",
        "what was the aim of this study",
        "what are the objectives of the study",
        "what was the main focus of the study",
        "what were the main conclusions of the study",
        "what methods were used in the study",
        "what ratio was used for the analysis",
        "what models are shown in fig 4",
        "what is the conclusion of this study",
        "what do the innovations of this study enable",
        "what is the article about",
        "what is the doi number for the article",
        "where can the tool be accessed",
        "what data has been used",
        "what were the results of the study",
        "what are the key findings of this study",
        "what data sources were used in this study",
        "what are the limitations of this study",
        "where was the research conducted",
        "what are the key words for this article",
        "what is the main objective of this study",
        "what evidence supports the research",
        # Add more phrases here
    )

    # Lower-case once, then look for the first phrase that occurs in the text.
    lowered_question = question.lower()
    for phrase in count_phrases:
        if phrase in lowered_question:
            return True
    return False

def is_truncated(sentence):
    """Return True when ``sentence`` does not finish with sentence-ending punctuation.

    Parameters
    ----------
    sentence : Any
        Value to inspect; it is converted with ``str()`` first, so non-string
        cells never raise.

    Returns
    -------
    bool
        ``False`` if the text (ignoring trailing whitespace) ends with ``.``,
        ``!`` or ``?``, optionally followed by a straight or curly closing
        double quote; ``True`` otherwise, including for empty text.
    """
    # Ensure the value is text and ignore trailing whitespace, so "Done. " is
    # still recognised as a complete sentence.
    sentence = str(sentence).rstrip()

    # Sentence-ending punctuation, including forms closed by a quotation mark.
    sentence_endings = ('.', '!', '?', '."', '!"', '?"', '.”', '!”', '?”')

    # str.endswith compares whole suffixes, so the two-character endings are
    # honoured (checking only the last character would see just the quote).
    # An empty string ends with none of them and is reported as truncated
    # instead of raising IndexError.
    return not sentence.endswith(sentence_endings)

# Assuming you have defined your functions is_undesirable_question_to_count and is_truncated properly

def count_truncated_questions_and_answers_in_df(df, filtered_data_file):
    """Count and remove undesirable and truncated question/answer rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Question' and an 'Answer' column.
    filtered_data_file : str
        Path of the CSV file that receives the undesirable (but not truncated)
        question/answer pairs.

    Returns
    -------
    tuple
        ``(questions_count, truncated_questions_count, truncated_answers_count,
        remaining_data)``. The three counts are taken over every row of ``df``
        (an undesirable question that is also truncated is still counted).
        ``remaining_data`` is an independent copy of the rows that are neither
        truncated nor undesirable.
    """
    df.info()
    print(df.columns.tolist())

    # Evaluate each predicate once per row; the masks are reused for counting,
    # reporting and filtering. astype(bool) keeps the masks boolean even for an
    # empty frame, where apply() would otherwise return an object Series.
    undesirable_mask = df['Question'].apply(is_undesirable_question_to_count).astype(bool)
    truncated_question_mask = df['Question'].apply(is_truncated).astype(bool)
    truncated_answer_mask = df['Answer'].apply(is_truncated).astype(bool)

    questions_count = undesirable_mask.sum()
    truncated_questions_count = truncated_question_mask.sum()
    truncated_answers_count = truncated_answer_mask.sum()

    # Report every truncated question/answer. Iterating positionally keeps this
    # working when df does not carry a default 0..n-1 index.
    rows = zip(df['Question'], df['Answer'], truncated_question_mask, truncated_answer_mask)
    for i, (question, answer, question_is_cut, answer_is_cut) in enumerate(rows):
        if question_is_cut:
            print(f"Truncated Question {i}: {question}")
        if answer_is_cut:
            print(f"Corresponding Question {i}: {question}")
            print(f"Truncated Answer  {i}: {answer} \n")

    # Plain NumPy masks select rows by position, independent of index labels.
    complete = ~(truncated_question_mask | truncated_answer_mask).to_numpy()
    undesirable = undesirable_mask.to_numpy()

    # Undesirable questions (and their answers) among the complete rows.
    filtered_data = df[['Question', 'Answer']][complete & undesirable]
    print("Number of rows in filtered data:", len(filtered_data))

    # Everything that is complete and not undesirable. Copy so that later
    # in-place cleaning steps do not write into a slice of df.
    remaining_data = df[complete & ~undesirable].copy()
    print("Number of rows in remaining data:", len(remaining_data))

    # Save the filtered-out question/answer pairs.
    filtered_data.to_csv(filtered_data_file, index=False, encoding='utf-8')

    return questions_count, truncated_questions_count, truncated_answers_count, remaining_data







def save_filtered_data(file_path, filtered_data):
    """Write question/answer records to ``file_path`` as a UTF-8 CSV with a header row.

    ``filtered_data`` is passed to ``csv.DictWriter.writerows`` with the field
    names 'Question' and 'Answer'.
    """
    columns = ['Question', 'Answer']
    with open(file_path, mode='w', encoding='utf-8', newline='') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(filtered_data)

        


#def count_errors(text):
 #   tool = language_tool_python.LanguageTool('en-GB')
  #  matches = tool.check(text)
   # return len(matches)

def LanguageTool_df(df):
    """Run every cell of ``df`` through LanguageTool (en-GB) auto-correction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are passed to ``tool.correct``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` holding the corrected text.
        It is built from a plain list of rows, so it carries a fresh default
        index rather than the index of ``df``.
    """
    tool = language_tool_python.LanguageTool('en-GB')
    try:
        corrected_content = [
            # After correction, ' Answer' is collapsed back to 'Answer'.
            # NOTE(review): presumably this undoes a space LanguageTool inserts
            # before the word — confirm against real data.
            [tool.correct(cell).replace(' Answer', 'Answer') for cell in row]
            for row in df.itertuples(index=False, name=None)
        ]
    finally:
        # Always release the LanguageTool instance, even when a correction
        # fails; otherwise its background server process is left running.
        tool.close()

    return pd.DataFrame(corrected_content, columns=df.columns)


def add_space_before_opening_bracket(df):
    """Insert a space between an ASCII letter and a directly following ``(``.

    Every column of ``df`` is rewritten in place and the same frame object is
    returned.
    """
    letter_then_bracket = re.compile(r'([A-Za-z])\(')

    def _space_out(cell):
        # Keep the captured letter, then put a space in front of the bracket.
        return letter_then_bracket.sub(r'\1 (', cell)

    for column_name in df.columns:
        df[column_name] = df[column_name].apply(_space_out)

    return df





def main(csv_files=None):
    """Clean each QnA CSV file and print per-file error statistics.

    For every file the pipeline drops truncated and undesirable rows, collapses
    comma clusters, inserts a space before opening brackets, runs LanguageTool
    correction and writes the result to ``~/Downloads/cleaned_QnAs``. The
    undesirable question/answer pairs are written next to it in ``~/Downloads``
    as ``*_filtered_questions.csv``.

    Parameters
    ----------
    csv_files : list of str, optional
        Paths of the CSV files to process. When omitted, the default list
        below is used.
    """
    start_time = time.time()

    if csv_files is None:
        # List of CSV files to process
        csv_files = [
            # "C:/Users/HP/Documents/Sam/Data Science Voluteer/CSV/Geothermal_energy_wellcome_20230717.csv",
            # "C:/Users/Joshua Giwa/Downloads/QnAs_generated_27_07_2023/Solar power_wellcome_20230717.csv",
            "C:/Users/Joshua Giwa/Downloads/QnAs_generated_27_07_2023/Solar power_gatesopen_20230717.csv",
            # "C:/Users/Joshua Giwa/Downloads/QnAs_generated_27_07_2023/Geothermal+energy_f1000_20230717.csv",
            # "C:/Users/Joshua Giwa/Downloads/QnAs_generated_27_07_2023/Geothermal_energy_wellcome_20230717.csv",
            # "C:/Users/Joshua Giwa/Downloads/QnAs_generated_27_07_2023/Carbon+footprint_wellcome_20230717.csv",
            # "C:/Users/Joshua Giwa/Downloads/test_dataset.csv"
            ## Add more file paths here as needed
        ]

    downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        base_name = os.path.basename(csv_file)
        filtered_data_file = os.path.join(downloads_dir, base_name.replace('.csv', '_filtered_questions.csv'))

        questions_count, truncated_questions, truncated_answers, remaining_data = (
            count_truncated_questions_and_answers_in_df(df, filtered_data_file)
        )
        comma_cluster_removed_df, total_comma_count = comma_cluster_removal_df(remaining_data)
        space_before_bracket_ammended_df = add_space_before_opening_bracket(comma_cluster_removed_df)
        LanguageTool_corrected_df = LanguageTool_df(space_before_bracket_ammended_df)

        # Save the processed DataFrame with "_cleaned" added to the name.
        cleaned_df_dir = os.path.join(downloads_dir, "cleaned_QnAs")
        os.makedirs(cleaned_df_dir, exist_ok=True)
        cleaned_df_file_path = os.path.join(cleaned_df_dir, base_name.replace('.csv', '_cleaned.csv'))
        LanguageTool_corrected_df.to_csv(cleaned_df_file_path, index=False)

        print(f"File: {csv_file}\n")
        print(f"Total undesirable questions: {questions_count}")
        print(f"Total Truncated Questions: {truncated_questions}")
        print(f"Total Truncated Answers: {truncated_answers}")
        print(f"excessive comma occurence: {total_comma_count}")

        # LanguageTool_df returns only the corrected frame, so there are no
        # LanguageTool error counts to report; the summary covers the counters
        # that are actually computed (the former lines referenced undefined
        # names and raised NameError).
        print(f"(undesirable_questions + Truncated Questions + Truncated Answers + excessive comma occurence): {questions_count + truncated_questions + truncated_answers + total_comma_count}\n")

    elapsed_time_seconds = time.time() - start_time
    elapsed_time_minutes = elapsed_time_seconds / 60

    print(f"Script ran for {elapsed_time_minutes:.2f} minutes.")
    
    
    
# Run the pipeline under cProfile; the printed profile report is sorted by
# cumulative time.
if __name__ == "__main__":
    cProfile.run("main()", sort='cumulative')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  6 non-null      object
 1   Answer    6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes
['Question', 'Answer']
Corresponding Question 1: What are the limitations of this study?
Truncated Answer  1: The limitations of this study avaliable include the exclusion of Love Matters social media platforms and offline components, as well as the dependency on user cookie settings and limitations in recognizing source traffic 

Corresponding Question 3: What is the aim of new vaccine development?
Truncated Answer  3: The aim of new,,,, vaccine development is to enhance maternal immunization, e.g. for group B streptococcus and respiratory syncytial virus, and to reduce serious morbidity and mortality in 

Number of rows in filtered data: 2
Number of rows in remaining data: 2

Original text: R

###### Deleting the undesirable questions and their corresponding answers, and truncated QnAs

In [None]:
import pandas as pd

def is_question_to_count(question):
    """Report whether ``question`` contains any of the generic phrases listed below.

    Matching is a case-insensitive substring check; non-string input gives ``False``.
    """
    # Only text can be matched against the phrase list.
    if not isinstance(question, str):
        return False

    normalized = question.lower()
    count_phrases = [
        "what is the title",
        "what is the research topic",
        "what data is used in the study",
        "what data sets are collected in this study",
        "how did the study",
        "what does the arrow in figure 6 represent",
        "what was the publication date of the study",
        "what is the title of the paper",
        "what is the purpose of this study",
        "what was the goal of the study",
        "what data was utilized in the study",
        "what is the source of funding for this study",
        "what was the aim of this study",
        "what are the objectives of the study",
        "what was the main focus of the study",
        "what were the main conclusions of the study",
        "what methods were used in the study",
        "what ratio was used for the analysis",
        "what models are shown in fig 4",
        "what is the conclusion of this study",
        "what do the innovations of this study enable",
        "what is the article about",
        "what is the doi number for the article",
        "where can the tool be accessed",
        "what data has been used",
        "what were the results of the study",
        "what are the key findings of this study",
        "what data sources were used in this study",
        "what are the limitations of this study",
        "where was the research conducted",
        "what are the key words for this article",
        "what is the main objective of this study",
        "what evidence supports the research",
        # Add more phrases here
    ]
    return any(phrase in normalized for phrase in count_phrases)


def is_truncated(text):
    """Return True when ``text`` is a string that ends with ``...`` once
    surrounding whitespace is stripped; False for anything else."""
    return isinstance(text, str) and text.strip().endswith("...")

def is_typo(text):
    """Return True when ``text`` contains any of the listed typo markers.

    The check is a case-insensitive substring match. Non-string input yields
    ``False``.
    """
    if not isinstance(text, str):
        return False

    # Here, you can define a list of common words that might indicate typos
    # Customize this list according to your needs
    common_typos = [
        "odds", "noncurrent", "(mt)", "thermocyling", "o2geosocial", "Outbreaker2", "outbreaker()", "avaliable", "cleanup",
        "funannotate", "gallic", "nonessential", "Total", "runoff", "„",
        # Add more common typos here
    ]

    # Lower-case BOTH sides: the text is lower-cased, so entries written with
    # capitals ("Outbreaker2", "Total") could otherwise never match.
    lowered_text = text.lower()
    return any(typo.lower() in lowered_text for typo in common_typos)

def filter_undesirable_rows(df):
    """Return the rows of ``df`` that are neither undesirable nor truncated.

    A row is dropped when its 'Question' is flagged by ``is_question_to_count``
    or when its 'Question' or its 'Answer' is flagged by ``is_truncated``.
    """
    # astype(bool) keeps the masks boolean even when df is empty, where
    # apply() would otherwise return an object Series.
    is_undesirable_question = df['Question'].apply(is_question_to_count).astype(bool)
    is_truncated_question = df['Question'].apply(is_truncated).astype(bool)
    is_truncated_answer = df['Answer'].apply(is_truncated).astype(bool)

    # Combine element-wise with | and ~. Python's `or` asks for the truth value
    # of a whole Series and raises ValueError.
    rows_to_drop = is_undesirable_question | is_truncated_question | is_truncated_answer
    return df[~rows_to_drop]

def count_typos_in_csv(file_path, column_name):
    """Count the cells of ``column_name`` flagged by ``is_typo``.

    The CSV at ``file_path`` is loaded and passed through
    ``filter_undesirable_rows`` before the typo check is applied.
    """
    kept_rows = filter_undesirable_rows(pd.read_csv(file_path))
    typo_flags = kept_rows[column_name].apply(is_typo)
    return typo_flags.sum()

def count_questions_in_csv(file_path):
    """Count the 'Question' cells of the CSV at ``file_path`` that are flagged
    by ``is_question_to_count``."""
    question_column = pd.read_csv(file_path)['Question']
    return question_column.apply(is_question_to_count).sum()

def count_truncated_questions_and_answers_in_csv(file_path):
    """Load the CSV at ``file_path``, print its ``info()`` summary and count flagged cells.

    Returns a tuple ``(questions_count, truncated_questions_count,
    truncated_answers_count)``: questions flagged by ``is_question_to_count``,
    then questions and answers flagged by ``is_truncated``.
    """
    frame = pd.read_csv(file_path)
    frame.info()

    question_column = frame['Question']
    flagged_questions = question_column.apply(is_question_to_count).sum()
    cut_off_questions = question_column.apply(is_truncated).sum()
    cut_off_answers = frame['Answer'].apply(is_truncated).sum()

    return flagged_questions, cut_off_questions, cut_off_answers


def main():
    """Remove undesirable/truncated rows from each CSV file and report typo counts.

    For every file the rows are filtered with ``filter_undesirable_rows``, the
    typo markers in the remaining 'Question' and 'Answer' cells are counted,
    and the filtered rows are written back to the same file.
    """
    # List of CSV files to process
    csv_files = [
        "C:/Users/Joshua Giwa/Downloads/QnAs_generated_31_07_2023/QnA_Organic farming_f1000_20230730.csv",
        # Add more file paths here as needed
    ]

    total_typos_count = 0

    for csv_file in csv_files:
        # Load and filter once; the same filtered frame feeds both the typo
        # counts and the file that is written back (previously the file was
        # read and filtered three times).
        df = pd.read_csv(csv_file)
        df_filtered = filter_undesirable_rows(df)

        typo_count = (
            df_filtered['Question'].apply(is_typo).sum()
            + df_filtered['Answer'].apply(is_typo).sum()
        )
        total_typos_count += typo_count

        print(f"File: {csv_file}")
        print(f"Total Typos: {typo_count}\n")

        # NOTE: this overwrites the source CSV with the filtered rows; the
        # removed rows cannot be recovered from this file afterwards.
        df_filtered.to_csv(csv_file, index=False)

    print(f"Total Typos Count: {total_typos_count}")

# Entry point: run the filtering/typo-count pipeline defined by main() above.
if __name__ == "__main__":
    main()
