In [None]:

pip install spacy
pip install textblob
pip install autocorrect
pip install pyspellchecker
pip install spellchecker



# Spell-check both Questions and Answers with five libraries

In [27]:
import pandas as pd
import spacy
from autocorrect import Speller
from textblob import TextBlob
from autocorrect import Speller
from spellchecker import SpellChecker
from spellchecker.spellchecker import SpellChecker as SpellChecker2



def spellcheck_with_spacy(df, columns=('Question', 'Answer')):
    """Lemmatize and spell-correct text columns using spaCy and autocorrect.

    Each cell is tokenized with the ``en_core_web_sm`` spaCy pipeline, every
    token is replaced by the autocorrect-corrected form of its lemma, and the
    tokens are stitched back together with the whitespace that followed them
    in the input (so punctuation stays attached to the preceding word).

    Args:
        df: DataFrame holding the columns named in ``columns``. It is modified
            in place.
        columns: Names of the columns to process. Defaults to the Question and
            Answer columns.

    Returns:
        The same DataFrame object with the selected columns rewritten. Every
        processed cell is a string: missing values become ``''`` and other
        non-string values are converted with ``str``.
    """
    nlp = spacy.load("en_core_web_sm")
    spell = Speller(lang='en')

    def clean_and_spellcheck(text):
        if not isinstance(text, str):
            # A missing cell must not become the literal word "nan", which the
            # spell checker could then "correct" into an invented answer.
            if pd.isna(text):
                return ''
            text = str(text)
        doc = nlp(text)
        # token.whitespace_ is the trailing whitespace of the token in the
        # original text; using it avoids inserting a space before punctuation.
        return ''.join(spell(token.lemma_) + token.whitespace_ for token in doc)

    for column in columns:
        df[column] = df[column].apply(clean_and_spellcheck)
    return df

def spellcheck_with_textblob(df, columns=('Question', 'Answer')):
    """Spell-correct text columns with TextBlob's ``correct()``.

    Args:
        df: DataFrame holding the columns named in ``columns``. It is modified
            in place.
        columns: Names of the columns to process. Defaults to the Question and
            Answer columns.

    Returns:
        The same DataFrame object with the selected columns rewritten. Every
        processed cell is a string: missing values become ``''`` and other
        non-string values are converted with ``str`` before correction.
    """
    def clean_and_spellcheck(text):
        if not isinstance(text, str):
            # Only strings are handed to TextBlob; missing cells are mapped to
            # an empty string rather than to the word "nan".
            if pd.isna(text):
                return ''
            text = str(text)
        return str(TextBlob(text).correct())

    for column in columns:
        df[column] = df[column].apply(clean_and_spellcheck)
    return df

def spellcheck_with_autocorrect(df, columns=('Question', 'Answer')):
    """Spell-correct text columns with autocorrect's English ``Speller``.

    Args:
        df: DataFrame holding the columns named in ``columns``. It is modified
            in place.
        columns: Names of the columns to process. Defaults to the Question and
            Answer columns.

    Returns:
        The same DataFrame object with the selected columns rewritten. Every
        processed cell is a string: missing values become ``''`` and other
        non-string values are converted with ``str`` before correction.
    """
    spell = Speller(lang='en')

    def clean_and_spellcheck(text):
        if not isinstance(text, str):
            # A blanket astype(str) would turn NaN into the word "nan" and feed
            # it to the corrector; map missing cells to '' instead.
            if pd.isna(text):
                return ''
            text = str(text)
        return spell(text)

    for column in columns:
        df[column] = df[column].apply(clean_and_spellcheck)
    return df

def spellcheck_with_pyspellchecker(df, columns=('Question', 'Answer')):
    """Spell-correct text columns word by word with ``SpellChecker``.

    Each cell is split on whitespace, every word is replaced by
    ``SpellChecker.correction(word)`` (the word is kept when no correction is
    offered), and the words are re-joined with single spaces.

    Args:
        df: DataFrame holding the columns named in ``columns``. It is modified
            in place.
        columns: Names of the columns to process. Defaults to the Question and
            Answer columns.

    Returns:
        The same DataFrame object with the selected columns rewritten. Every
        processed cell is a string: missing values become ``''`` and other
        non-string values are converted with ``str`` before correction.
    """
    spell = SpellChecker()

    def correct_word(word):
        # Look the word up once; correction() returns None when it has no
        # suggestion, in which case the original word is kept.
        suggestion = spell.correction(word)
        return word if suggestion is None else suggestion

    def clean_and_spellcheck(text):
        if not isinstance(text, str):
            # Avoid turning a missing cell into the word "nan".
            if pd.isna(text):
                return ''
            text = str(text)
        return ' '.join(correct_word(word) for word in text.split())

    for column in columns:
        df[column] = df[column].apply(clean_and_spellcheck)
    return df

def spellcheck_with_spellchecker(df, columns=('Question', 'Answer')):
    """Run a word-by-word ``SpellChecker`` pass over text columns.

    Each cell is split on whitespace, every word is replaced by
    ``SpellChecker.correction(word)`` (the word is kept when no correction is
    offered), and the words are re-joined with single spaces.

    NOTE(review): this uses the same ``SpellChecker`` class and the same
    algorithm as ``spellcheck_with_pyspellchecker``, so it is effectively a
    second identical pass — confirm whether a different checker was intended.

    Args:
        df: DataFrame holding the columns named in ``columns``. It is modified
            in place.
        columns: Names of the columns to process. Defaults to the Question and
            Answer columns.

    Returns:
        The same DataFrame object with the selected columns rewritten. Every
        processed cell is a string: missing values become ``''`` and other
        non-string values are converted with ``str`` before correction.
    """
    spell = SpellChecker()

    def correct_word(word):
        # One lookup per word; None means "no suggestion", so keep the word.
        suggestion = spell.correction(word)
        return word if suggestion is None else suggestion

    def clean_and_spellcheck(text):
        if not isinstance(text, str):
            # Avoid turning a missing cell into the word "nan".
            if pd.isna(text):
                return ''
            text = str(text)
        return ' '.join(correct_word(word) for word in text.split())

    for column in columns:
        df[column] = df[column].apply(clean_and_spellcheck)
    return df

def main(input_path='db8606_20.csv', output_path='final_cleaned_dataset.csv'):
    """Run every spell-checking pass over a Q&A CSV file and save the result.

    Args:
        input_path: CSV file to load. The passes rewrite its ``Question`` and
            ``Answer`` columns, so both must be present.
        output_path: Destination CSV for the cleaned data; it is written
            without the index column.
    """
    df = pd.read_csv(input_path)

    # The passes run in this order and each one receives the previous output.
    passes = (
        spellcheck_with_spacy,           # spaCy lemmatization + autocorrect
        spellcheck_with_textblob,        # TextBlob
        spellcheck_with_autocorrect,     # autocorrect
        spellcheck_with_pyspellchecker,  # pyspellchecker
        spellcheck_with_spellchecker,    # spellchecker
    )
    for spellcheck in passes:
        df = spellcheck(df)

    # Save the final cleaned dataset to a CSV file
    df.to_csv(output_path, index=False)

# Run the spell-checking pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


# add_full_stops

In [28]:
import pandas as pd

def add_full_stops(df):
    """Terminate every answer with a full stop and print a before/after preview.

    A '.' is appended (after stripping surrounding whitespace) to each
    non-blank string in the ``Answer`` column that does not already end in
    '.', '!' or '?'. Blank strings, missing values and non-string cells are
    left untouched.

    The first two rows that need a full stop are printed before the change,
    and the same two rows are printed again afterwards.

    Args:
        df: DataFrame with an ``Answer`` column. It is modified in place.

    Returns:
        The same DataFrame object with the ``Answer`` column updated.
    """
    # Sentence-ending punctuation that must not be followed by an extra '.'.
    terminators = ('.', '!', '?')

    def needs_full_stop(text):
        # Always returns a real bool so the result can be used as a mask
        # (a blank string must yield False, not '').
        if not isinstance(text, str):
            return False
        stripped = text.strip()
        return bool(stripped) and not stripped.endswith(terminators)

    def check_full_stop(text):
        if needs_full_stop(text):
            return text.strip() + '.'
        return text

    # Positions (not labels) of the first two rows that will be changed, so the
    # same rows can be shown before and after regardless of the index.
    mask = df['Answer'].apply(needs_full_stop).astype(bool)
    preview_positions = mask.to_numpy().nonzero()[0][:2]

    # Filter and print the first two rows without full stops
    rows_without_full_stops = df.iloc[preview_positions]
    print("First two rows without full stops at the end of Answer column:")
    print(rows_without_full_stops)

    # Apply the function to add full stops
    df['Answer'] = df['Answer'].apply(check_full_stop)

    # Print the same two rows after adding full stops
    rows_with_full_stops = df.iloc[preview_positions]
    print("\nFirst two rows with full stops added at the end of Answer column:")
    print(rows_with_full_stops)

    return df

# Load the spell-checked dataset (the file written by main() in the previous cell)
df = pd.read_csv('final_cleaned_dataset.csv')

# Add missing full stops to the Answer column; the function also prints
# sample rows before and after the change
df = add_full_stops(df)

# Save the result under a new name so the input file is left untouched
df.to_csv('dataset_with_full_stops.csv', index=False)


First two rows without full stops at the end of Answer column:
    id                                           Question  \
0  180  how do China and India lead in greeting of the...   
1  181  what be the main highlight of deutsche et al i...   

                                              Answer           topic_name  \
0            Hi et al 2019 find that China and India      Climate Impacts   
1  deutsche et al is 2008 study explore the impac...  Researching Ecology   

                                         topic_words      Riya  \n  \
0  climatological, climatic, climatology, climato...  analysis NaN   
1  researching, research, researched, researches,...  analysis NaN   

                         Lola     Suman   Narunja  Simon           Sapna  
0  incomplete answer provided  analysis  analysis    NaN  Incomplete Q&A  
1                    analysis  analysis  analysis    NaN        strategy  

First two rows with full stops added at the end of Answer column:
    id            

# replace multiple commas

In [29]:
import pandas as pd
import re

def replace_multiple_commas(df):
    """Collapse runs of commas in the ``Answer`` column into a single ``', '``.

    A run is two or more commas optionally separated by whitespace
    (``',,'``, ``', ,'``, ``',,,'`` ...). The whole run, together with any
    spaces or tabs that follow it, is replaced by one comma and one space.
    Non-string cells (for example NaN) are returned unchanged.

    Args:
        df: DataFrame with an ``Answer`` column. It is modified in place.

    Returns:
        The same DataFrame object with the ``Answer`` column updated.
    """
    # One comma followed by at least one more "optional whitespace + comma"
    # group, so runs of any length are consumed in a single match. Trailing
    # spaces/tabs are swallowed so the ', ' replacement cannot double a space.
    comma_run = re.compile(r',(?:\s*,)+[ \t]*')

    def replace_commas(text):
        if isinstance(text, str):  # Check if the value is a string
            return comma_run.sub(', ', text)
        return text  # Return non-string values as they are

    df['Answer'] = df['Answer'].apply(replace_commas)
    return df

# Load the dataset produced by the full-stop step
df = pd.read_csv('dataset_with_full_stops.csv')

# Collapse repeated commas in the 'Answer' column
df = replace_multiple_commas(df)

# Save the result under a new name so the input file is left untouched
df.to_csv('dataset_with_single_commas.csv', index=False)



# Capitalise the first letter of each answer

In [30]:
import pandas as pd

def correct_capitalization(df):
    """Upper-case the first character of every string in the ``Answer`` column.

    Only the very first character of a cell is considered, and it is changed
    only when it is lowercase. Empty strings and non-string cells are returned
    unchanged. The DataFrame is modified in place and also returned.
    """
    def capitalize_first_letter(value):
        # Anything that is not a non-empty string passes through untouched.
        if not isinstance(value, str) or not value:
            return value
        head, tail = value[0], value[1:]
        if head.islower():
            return head.upper() + tail
        return value

    answers = df['Answer']
    df['Answer'] = answers.apply(capitalize_first_letter)
    return df

# Load the dataset produced by the comma clean-up step
df = pd.read_csv('dataset_with_single_commas.csv')

# Upper-case the first character of each entry in the 'Answer' column
df = correct_capitalization(df)

# Save the result under a new name so the input file is left untouched
df.to_csv('dataset_corrected_capitalization.csv', index=False)
