In [None]:
import pandas as pd
import re

In [None]:
# File paths for the CSV files
file_paths = ["/content/AMT10_train.csv", "/content/AMT10_test.csv", "/content/AMT10_validation.csv"]

# Highest contest id considered when looking for contests that are absent from the CSV files
MAX_CONTEST_ID = 1956

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(file_path) for file_path in file_paths]

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Drop all but the first column
combined_df = combined_df.iloc[:, :1]

combined_df.head(10)

# The contest id is the part of the first column before the first '/'.
# A set comprehension removes duplicates; sorting gives a stable, readable listing.
contest_ids = sorted({int(value.split('/')[0]) for value in combined_df.iloc[:, 0]})

print(len(contest_ids))
print(contest_ids)

# Membership tests against a set are O(1); testing against the sorted list is O(n) per id
known_contest_ids = set(contest_ids)
missing_contest_ids = [num for num in range(1, MAX_CONTEST_ID + 1) if num not in known_contest_ids]

print(len(missing_contest_ids))
print(missing_contest_ids)

In [None]:
# Function to extract the contest id from the problem link
def extract_number(link):
    """Return the contest id embedded in a problem link.

    Args:
        link: Text expected to contain ``/problem/<digits>/``.

    Returns:
        The digits as an ``int``, or ``None`` (after printing a notice) when
        ``link`` is not a string or does not contain such a segment.
    """
    # Missing CSV cells reach .apply() as float NaN; re.search would raise TypeError on them.
    match = re.search(r'/problem/(\d+)/', link) if isinstance(link, str) else None
    if match is None:
        print("Contest ID not found in the link.")
        return None
    return int(match.group(1))

In [None]:
# Load the problems CSV
df = pd.read_csv("/content/codeforces_problems.csv")

# Derive a contest_id column from each row's problem_link
df['contest_id'] = df['problem_link'].map(extract_number)

# Keep only the rows whose contest id appears in missing_contest_ids
is_missing_contest = df['contest_id'].isin(missing_contest_ids)
filtered_df = df[is_missing_contest]

print(len(filtered_df))
filtered_df.head(10)

In [None]:
columns_to_drop = ['id', 'time_limit', 'memory_limit', 'sample_input', 'sample_output', 'contest_id']
link_prefix = 'https://codeforces.com/problemset/problem/'

filtered_df = (
    filtered_df
    .drop(columns=columns_to_drop)
    .rename(columns={'problem_text': 'description', 'difficulty': 'rating'})
    .astype({'rating': float})
    # regex=False: the prefix is a literal URL, so '.' must not act as a regex wildcard
    .assign(problem_link=lambda frame: frame['problem_link'].str.replace(link_prefix, '', regex=False))
    # The shortened link column is given an empty header
    .rename(columns={'problem_link': ''})
)

filtered_df.head(10)

In [None]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources by name: 'punkt', 'stopwords' and 'wordnet'.
# The 'stopwords' corpus is read in remove_stopwords(); 'punkt' and 'wordnet' are
# presumably needed by the tokenizers and WordNetLemmatizer imported above.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# This is an interactive task.Rudolph is a scientist who studies alien life forms.
# -> This is an interactive task. Rudolph is a scientist who studies alien life forms.
def processing_dot_capitalize(sentences):
    """Insert a space wherever a period is directly followed by an uppercase letter.

    Args:
        sentences: Text to repair; may be empty.

    Returns:
        The text with ``".X"`` turned into ``". X"`` for every uppercase ``X``;
        all other characters are copied unchanged.
    """
    pieces = []
    prev = ''  # no previous character yet, so the first one is never prefixed
    for char in sentences:
        if prev == '.' and char.isupper():
            pieces.append(' ')
        pieces.append(char)
        prev = char
    # Joining once avoids repeated string concatenation on long inputs
    return ''.join(pieces)

# ( $$$ 1 \\le t_{i, j} \\le 10^6 $$$ )
# -> ( $$$ 1 \\le t_{i, j} \\le 1000000 $$$ )
def replace_exponent_notation(text):
    """Expand powers of ten written as ``10^<digits>`` into plain integers.

    Args:
        text: Text that may contain expressions such as ``10^6``.

    Returns:
        The text with each ``10^k`` replaced by the decimal value of 10**k.
        A ``10`` that is the tail of a longer number (e.g. ``110^5``) is left
        alone, because that expression is not a power of ten.
    """
    def replace_exponent(match):
        # Convert the captured exponent into the expanded number
        return str(10 ** int(match.group(1)))

    # (?<!\d) rejects matches where "10" is preceded by another digit
    return re.sub(r'(?<!\d)10\^(\d+)', replace_exponent, text)

# there is a room in front of rudolph with $$$n$$$ different objects scattered around.
# -> There is a room in front of Rudolph with $$$ n $$$ different objects scattered around.
def add_spacing_between_dollar_signs(text):
    """Put a space between every ``$$$`` delimiter and adjacent non-space text.

    The pattern matches two zero-width positions: right after ``$$$`` when a
    non-whitespace character follows, and right before ``$$$`` when a
    non-whitespace character precedes it. A space is inserted at each.
    """
    boundary = re.compile(r'(?<=\$\$\$)(?=\S)|(?<=\S)(?=\$\$\$)')
    return boundary.sub(' ', text)

# This is an interactive task.
# -> this is an interactive task.
def convert_to_lowercase(text):
    """Return ``text`` with every cased character lowercased."""
    lowered = text.lower()
    return lowered

def is_number(string):
    """Return True when ``string`` is a non-empty run of ASCII digits 0-9.

    ``str.isdigit`` alone also accepts characters such as superscript digits,
    which cannot be converted to an integer, so the ASCII check is required.
    """
    return string.isascii() and string.isdigit()

# 2 \\cdot 100000
# -> 200000
def calculate_multiplication(text):
    """Replace ``<number> \\cdot <number>`` word triples with their product.

    Args:
        text: Text whose whitespace-separated words are scanned left to right.

    Returns:
        The words re-joined with single spaces, where every triple
        ``a \\cdot b`` (both operands accepted by ``is_number``) is replaced by
        ``a * b``. A product is not reused as an operand, so ``2 \\cdot 3 \\cdot 4``
        becomes ``6 \\cdot 4``.
    """
    words = text.split()
    merged = []
    i = 0
    while i < len(words):
        is_product = (
            i + 2 < len(words)
            and words[i + 1] == '\\cdot'
            and is_number(words[i])
            and is_number(words[i + 2])
        )
        if is_product:
            # int() instead of eval(): eval rejects operands with leading zeros ("05")
            # and should not be run on text taken from a data file.
            merged.append(str(int(words[i]) * int(words[i + 2])))
            i += 3  # consume both operands and the operator
        else:
            merged.append(words[i])
            i += 1
    return ' '.join(merged)

In [None]:
def preprocessing(text):
    """Apply the text normalisation helpers to ``text`` in a fixed order.

    Returns the result of feeding ``text`` through each step in turn.
    """
    pipeline = (
        processing_dot_capitalize,         # run before lowercasing: it looks for uppercase letters
        convert_to_lowercase,
        add_spacing_between_dollar_signs,
        replace_exponent_notation,
        calculate_multiplication,          # optional
    )
    for step in pipeline:
        text = step(text)
    return text

In [None]:
def split_sentences(sentences):
    """Split ``sentences`` into individual sentences with NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(sentences)
    return sentence_list

def split_words(sentence):
    """Split ``sentence`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(sentence)
    return tokens

In [None]:
def lemmatization(tokens):
    """Lemmatize every token as a verb using a WordNet lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        # 'v' asks the lemmatizer to treat the token as a verb
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas

def remove_stopwords(tokens):
    """Drop English stopwords from ``tokens``.

    Single-letter stopwords are kept in the text: only stopwords longer than
    one character are used for filtering.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of the tokens that are not in the stopword list, in their
        original order.
    """
    # A set makes each membership test O(1) instead of scanning the whole list per token
    stopword_set = {
        word for word in nltk.corpus.stopwords.words('english') if len(word) > 1
    }
    return [word for word in tokens if word not in stopword_set]


In [None]:
def get_preprocessing_sentence(tokens):
    """Remove stopwords, lemmatize, and join the tokens back into one string.

    Tokens are joined with single spaces; any ``$ $ $`` produced by the join
    is collapsed back into ``$$$``.
    """
    kept_tokens = lemmatization(remove_stopwords(tokens))
    joined = ' '.join(kept_tokens)
    return joined.replace('$ $ $', '$$$')

In [None]:
def get_preprocessed_sentence(sentences):
    """Preprocess a block of text and return it as a list of cleaned sentences.

    The text is normalised with ``preprocessing``, split into sentences, and
    each sentence is tokenised and passed through ``get_preprocessing_sentence``.
    A trailing standalone ``.`` token is removed and `` , `` is replaced by a
    single space.

    Args:
        sentences: Raw text.

    Returns:
        A list with one cleaned string per detected sentence. A string may be
        empty when nothing is left of the sentence after cleaning.
    """
    new_sentences = []
    for sentence in split_sentences(preprocessing(sentences)):
        cleaned = get_preprocessing_sentence(split_words(sentence))
        # Strip only a standalone final "." token. Testing cleaned[-1] raised IndexError
        # on an empty result and cut into tokens that merely end with a period.
        if cleaned == '.':
            cleaned = ''
        elif cleaned.endswith(' .'):
            cleaned = cleaned[:-2]
        new_sentences.append(cleaned.replace(' , ', ' '))
    return new_sentences

In [None]:
# Keep only the rows that have a description
has_description = filtered_df['description'].notna()
filtered_df = filtered_df.loc[has_description]

In [None]:
from tqdm import tqdm

# Preprocess every description, showing a progress bar while iterating
new_description = [
    get_preprocessed_sentence(raw_text)
    for raw_text in tqdm(filtered_df['description'].values)
]

In [None]:
# Replace the description column with the preprocessed values (matched by position)
filtered_df = filtered_df.assign(description=new_description)

In [None]:
# Function to convert a list to a string
def list_to_string(lst):
    """Join the strings in ``lst`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)

In [None]:
# Collapse each row's list into a single space-separated string
joined_descriptions = filtered_df['description'].map(list_to_string)
filtered_df = filtered_df.assign(description=joined_descriptions)
filtered_df.head()

In [None]:
def dollar_processing(arr):
    """Merge every run of three consecutive ``'$'`` items into one ``'$$$'`` item.

    The sequence is scanned left to right; when the current item and the two
    after it are all ``'$'`` they are emitted as a single ``'$$$'`` and skipped,
    otherwise the current item is copied unchanged.

    Returns:
        A new list; ``arr`` itself is not modified.
    """
    merged = []
    pos = 0
    total = len(arr)
    while pos < total:
        has_triple = pos + 2 < total and all(arr[pos + k] == '$' for k in range(3))
        if has_triple:
            merged.append('$$$')
            pos += 3
            continue
        merged.append(arr[pos])
        pos += 1
    return merged

In [None]:
from collections import Counter, defaultdict
from torchtext.data.utils import get_tokenizer

def tokenizing_sentences(sentences):
    """Tokenize each text with the torchtext "spacy" tokenizer.

    Every token list is passed through ``dollar_processing`` so that three
    consecutive ``'$'`` tokens are merged back into ``'$$$'``.

    Returns:
        A list with one token list per input text, in input order.
    """
    spacy_tokenize = get_tokenizer("spacy")
    return [dollar_processing(spacy_tokenize(text)) for text in sentences]

In [None]:
# Replace each description with its list of tokens
tokenized_descriptions = tokenizing_sentences(filtered_df['description'])
filtered_df = filtered_df.assign(description=tokenized_descriptions)

In [None]:
# Join each row's token list back into a single space-separated string
rejoined_descriptions = filtered_df['description'].map(list_to_string)
filtered_df = filtered_df.assign(description=rejoined_descriptions)
filtered_df.head()

In [None]:
# Write the processed problems to disk. to_csv also writes the row index as a leading
# column unless index=False is passed.
# NOTE(review): 'problem_link' was renamed to '' earlier in the notebook, so the header
# will contain two unnamed cells (index + link). Confirm this is the intended layout.
filtered_df.to_csv('finetuning_data.csv')