# Data Preprocessing

Import data from the GLUE cola dataset, shuffle, and remove the undesirable sentences (those which are labelled to be ungrammatical).

In [17]:
import pandas as pd
from sklearn.utils import shuffle

# Read the first 100 lines of cola.tsv into a DataFrame
df = pd.read_csv('cola.tsv', sep='\t')

# Shuffle the DataFrame
df = df.head(1000)

df.columns = ['model', 'label', 'blank', 'text']

new_df = df[['text', 'label']].copy()

print("Number of entries:", len(new_df))

filtered_df = new_df[new_df['label'] == 1].copy()

filtered_df.head()
print("Number of grammatical entries:", len(filtered_df))


Number of entries: 1000
Number of grammatical entries: 645


# Embeddings Generation – OS Model

In [18]:
import os
import openai
with open("openaiapikey.txt", "r") as file:
    file_contents = file.read()
openai.api_key = file_contents

In [19]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

filtered_df['ada_embedding'] = filtered_df.text.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
filtered_df.to_csv('OAembeddings.csv', index=False)
