In [None]:
#@title ### **(LOGIN) Set up Hugging Face Token and Connect to Google Drive**

# Authenticate the Colab runtime with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Mount Google Drive so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

#@markdown Enter your Hugging Face Access Token (Write)
token_name = 'Token Name' #@param {type:"string"}
token_value = 'Token Value' #@param {type:"string"}

import os
# Export the token under the user-chosen variable name. An empty name is
# skipped because the environment cannot hold a variable without a name.
if token_name:
    os.environ[token_name] = token_value

from huggingface_hub import HfApi
# Hand the token to the client explicitly: the environment variable above uses
# an arbitrary user-supplied name, so the client cannot be relied on to find
# the token through the environment.
api = HfApi(token=token_value)

# Best-effort check: report the outcome instead of aborting the cell.
try:
    user_info = api.whoami()
    print(f"Successfully authenticated as: {user_info['name']}")
except Exception as e:
    print(f"Authentication failed: {e}")

In [None]:
#@title Text Preprocess:
!pip install datasets pyspellchecker
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
import re
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
import nltk
import os
from huggingface_hub import HfApi, HfFolder

# Download necessary NLTK data.
# 'punkt' serves older NLTK releases; recent releases look up 'punkt_tab'
# when word_tokenize is called, so fetch both to work on either.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Enable tqdm for pandas (adds Series.progress_apply)
tqdm.pandas()

# Load the dataset from Hugging Face
dataset = load_dataset("iZELX1/Comsci-Concepts-5k")

# Convert to pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Work on a reproducible random subset. The sample size is capped at the
# number of available rows so a smaller split does not raise ValueError.
SAMPLE_SIZE = 500
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# Advanced Preprocessing Function
def preprocess_text(text):
    """Normalise a piece of free text.

    Steps, in order: expand English contractions, strip URLs, strip HTML
    tags, collapse whitespace, and remove the space before ``, . ! ?``.

    Contraction suffixes are only expanded when they are attached to the end
    of a word (a word character before the apostrophe and a word boundary
    after the suffix). This keeps quoted literals such as ``'data'`` or
    ``'text'`` intact instead of rewriting them as `` wouldata'``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.

    Note:
        ``'s`` is always expanded to " is", so possessives ("John's") are
        expanded as well; the two cases cannot be told apart by a regex.
    """
    # Accept both the straight and the typographic apostrophe.
    apos = "['\u2019]"

    # Irregular negations first; the generic n't rule would otherwise yield
    # "wo not" / "ca not".
    text = re.sub(r"\b([Ww])on" + apos + r"t\b", r"\1ill not", text)
    text = re.sub(r"\b([Cc])an" + apos + r"t\b", r"\1an not", text)

    # Convert contractions (same order as the rules are meant to apply).
    contraction_suffixes = (
        ("n" + apos + "t", " not"),
        (apos + "re", " are"),
        (apos + "s", " is"),
        (apos + "d", " would"),
        (apos + "ll", " will"),
        (apos + "t", " not"),
        (apos + "ve", " have"),
        (apos + "m", " am"),
    )
    for suffix, expansion in contraction_suffixes:
        text = re.sub(r"(?<=\w)" + suffix + r"\b", expansion, text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove redundant whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Fix spacing around punctuation
    text = re.sub(r'\s([,.!?])', r'\1', text)

    return text

# Build the spell checker instance used by the spelling-correction step
spell = SpellChecker()

# Run preprocess_text over both text columns, replacing each column in place
for text_column in ('input', 'output'):
    df[text_column] = df[text_column].apply(preprocess_text)

# Function to correct spelling with progress bar
def correct_spelling(text, checker=None):
    """Spell-correct the plain words of ``text`` while keeping its layout.

    Only purely alphabetic words are looked up. Punctuation, whitespace,
    numbers and identifiers containing digits or underscores are left exactly
    as they are, so the spacing produced by earlier cleaning is not disturbed.
    Words that are neither all-lowercase nor Capitalised (e.g. acronyms or
    camelCase names) are skipped as well.

    No per-word progress bar is created here; callers that want progress
    reporting can wrap the per-row call (e.g. with ``progress_apply``).

    Args:
        text: The string to correct.
        checker: Object exposing ``correction(word)`` that returns the
            suggested spelling or ``None``. Defaults to the module-level
            ``spell`` instance.

    Returns:
        The text with misspelled words replaced by the checker's suggestion.
    """
    active_checker = spell if checker is None else checker

    def _replace(match):
        word = match.group(0)
        # Mixed-case / all-caps tokens are unlikely to be ordinary prose words.
        if not (word.islower() or word.istitle()):
            return word
        suggestion = active_checker.correction(word)
        # The checker returns None when it has no candidate; keep the word.
        if suggestion is None:
            return word
        # Restore the leading capital in case the suggestion is lowercase.
        return suggestion.capitalize() if word.istitle() else suggestion

    # [^\W\d_]+ matches letters only; the \b anchors reject fragments of
    # identifiers such as "np_array" or "utf8".
    return re.sub(r"\b[^\W\d_]+\b", _replace, text)

# Spell-check each text column in turn, announcing the column first;
# progress_apply shows a per-row progress bar.
for column_name, banner in (
    ('input', "Applying spell checking to input column..."),
    ('output', "Applying spell checking to output column..."),
):
    print(banner)
    df[column_name] = df[column_name].progress_apply(correct_spelling)

# Show a short preview of the processed frame
preview_rows = df.head()
print(preview_rows)

# Write the processed frame to CSV (without the index column)
output_file = "preprocessed_dataset.csv"
df.to_csv(output_file, index=False)
print(f"Preprocessed dataset saved to {output_file}")

# Function to upload dataset to Hugging Face
def upload_to_huggingface(file_path, repo_name, token):
    """Upload a CSV file to the Hugging Face Hub as a private dataset.

    The token is passed directly to the push call and is not written to the
    local credential store, so the secret does not outlive this call.

    Args:
        file_path: Path of the CSV file to upload.
        repo_name: Target dataset repository name on the Hub.
        token: Hugging Face access token. Surrounding whitespace is removed;
            an empty value is passed as ``None`` so the library can fall back
            to whatever credentials are already configured.
    """
    # Normalise the token: pasted values often carry stray whitespace.
    auth_token = (token or "").strip() or None

    # Create a new dataset from the CSV file
    csv_dataset = Dataset.from_pandas(pd.read_csv(file_path))

    # Push the dataset to the Hugging Face Hub
    csv_dataset.push_to_hub(repo_name, private=True, token=auth_token)

    print(f"Dataset uploaded to Hugging Face: https://huggingface.co/datasets/{repo_name}")

# Ask user if they want to upload to Hugging Face
from getpass import getpass

# Strip the answer so " yes " counts, and accept the short form "y".
upload_choice = input("Do you want to upload the preprocessed dataset to Hugging Face? (yes/no): ").strip().lower()

if upload_choice in ('yes', 'y'):
    # Read the token without echo so the secret is not stored in the
    # notebook's saved output.
    hf_token = getpass("Enter your Hugging Face API token: ").strip()
    repo_name = input("Enter the desired repository name for your dataset: ").strip()

    upload_to_huggingface(output_file, repo_name, hf_token)
else:
    print("Dataset not uploaded to Hugging Face.")

print("Process completed.")