<h1>Text Normalization and Entity Verification for GPT Fine-Tuning</h1>

In [None]:
import spacy
from nltk.corpus import stopwords
import re

# Load spaCy's small English pipeline once at module level; preprocess_text
# below calls this object to extract named entities.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Normalize ``text`` and extract the named entities it mentions.

    Entity recognition runs on the *original* string. Lowercasing and
    stripping punctuation first would remove the capitalization and symbols
    (e.g. "Apple", "U.K.", "$1 billion") that the spaCy pipeline was given in
    training-style text, so entities would be missed or mislabelled.

    Args:
        text: Raw input string.

    Returns:
        A ``(clean_text, entities)`` tuple where ``clean_text`` is the
        lowercased, punctuation-free text with English stopwords removed
        (tokens joined by single spaces) and ``entities`` is a list of
        ``(entity_text, entity_label)`` tuples taken from the raw text.
    """
    # NER first, on the untouched input.
    doc = nlp(text)
    entities = [(entity.text, entity.label_) for entity in doc.ents]

    # Normalization for the cleaned-text output: lowercase, then drop every
    # character that is neither a word character nor whitespace.
    normalized = re.sub(r'[^\w\s]', '', text.lower())

    # Fetch the stopword list once per call and keep it in a set; calling
    # stopwords.words() inside the comprehension would re-fetch the list for
    # every token and scan it linearly.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in normalized.split() if token not in stop_words]
    clean_text = ' '.join(filtered_tokens)

    return clean_text, entities

# Demo: run preprocess_text on one sample sentence and print both values it
# returns (the cleaned text and the list of detected entity tuples).
text = "Apple is looking at buying U.K. startup for $1 billion"
processed_text, detected_entities = preprocess_text(text)
print("Processed Text:", processed_text)
print("Detected Entities:", detected_entities)

<h1>Validation of Training Data Relevance</h1>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reference corpus: five short support-style queries that the new document is
# compared against.
documents = [
    "How to reset my router?",
    "Can you help me upgrade my plan?",
    "Troubleshooting network issues",
    "What are the latest offers?",
    "Billing question regarding overcharges"
]

# Fit the TF-IDF vocabulary and weights on the reference corpus only, and
# vectorize that corpus in the same step.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Vectorize the candidate with the already-fitted vectorizer (transform, not
# fit_transform) so it is expressed in the same feature space as the corpus.
new_doc = ["Network speed is slow, how can I improve it?"]
new_doc_vector = vectorizer.transform(new_doc)
# Similarity of the single candidate against every reference document; the
# result has one row (the candidate) and one column per entry in `documents`.
cosine_similarities = cosine_similarity(new_doc_vector, tfidf_matrix)
print("Cosine Similarities:\n", cosine_similarities)

<h1>Data Cleansing and Anomaly Detection</h1>

In [None]:
import pandas as pd
import numpy as np

def drop_zscore_outliers(frame, column, threshold=3.0):
    """Return the rows of ``frame`` whose ``column`` z-score is below ``threshold``.

    The z-score is ``|value - mean| / std`` using pandas' default sample
    standard deviation.

    Args:
        frame: DataFrame to filter.
        column: Name of the numeric column to score.
        threshold: Rows with an absolute z-score >= ``threshold`` are dropped.

    Returns:
        A new DataFrame; ``frame`` itself is not modified.

    Notes:
        * When the standard deviation is 0 (constant column) or NaN (fewer
          than two values) no z-score is defined. Dividing anyway yields NaN
          for every row and ``NaN < threshold`` is False, which would silently
          drop the whole frame, so in that case all rows are kept.
        * With the sample standard deviation the largest possible z-score in
          ``n`` rows is ``(n - 1) / sqrt(n)``, so a threshold of 3 cannot flag
          anything when ``n <= 10``.
    """
    values = frame[column]
    spread = values.std()
    if not np.isfinite(spread) or spread == 0:
        return frame.copy()
    z_scores = np.abs((values - values.mean()) / spread)
    return frame[z_scores < threshold]


# Sample interaction log. The fourth row has an unparseable timestamp
# ("outlier") and an extreme Interaction_length (9999).
data = {'Time': ["2021-06-01 12:01:01", "2021-06-01 12:05:30", "2021-06-01 12:30:05", "outlier", "2021-06-01 12:45:10"],
        'Interaction_length': [300, 180, 450, 9999, 230]}
df = pd.DataFrame(data)

# Unparseable timestamps become NaT and those rows are removed. In this sample
# that is the step that removes the 9999 row.
df['Time'] = pd.to_datetime(df['Time'], errors='coerce')
df = df.dropna(subset=['Time'])

# Drop rows whose Interaction_length lies 3 or more standard deviations from the mean.
df = drop_zscore_outliers(df, 'Interaction_length', threshold=3)

print("Cleaned Data:\n", df)

<h1>Structuring for GPT-3.5: Role-Based Conversation Data</h1>

In [None]:
# Imported here because this cell runs before the later cell that imports
# json; without it a fresh-kernel "Run All" stops with a NameError.
import json

# One example conversation expressed as role/content messages
# (system instruction, user turn, assistant reply).
conversation = [
    {"role": "system", "content": "You are an investment advice assistant."},
    {"role": "user", "content": "Is it a good time to invest in stocks?"},
    {"role": "assistant", "content": "It depends on the market conditions and your personal financial goals. It's often wise to consult with a financial advisor."}
]

# Save to a JSON file with proper formatting (4-space indentation).
# NOTE(review): this is a single pretty-printed JSON document; fine-tuning
# uploads normally expect JSONL (one object per line) - confirm how this
# file is consumed downstream.
with open('formatted_data.json', 'w', encoding='utf-8') as file:
    json.dump({"messages": conversation}, file, indent=4)

<h1>Splitting Data: Training and Validation Sets</h1>

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Small labelled example so this cell runs on a fresh kernel: the original
# call referenced `data` and `labels`, but no cell defines `labels`, and
# `data` at this point is the dict built in the data-cleansing cell.
# Replace `samples` / `labels` with your own dataset.
samples = [
    "How to reset my router?",
    "Troubleshooting network issues",
    "My connection keeps dropping",
    "Wi-Fi is slow in the evening",
    "The modem light is blinking red",
    "Can you help me upgrade my plan?",
    "What are the latest offers?",
    "Billing question regarding overcharges",
    "I was charged twice this month",
    "How do I change my payment method?",
]
labels = ["technical"] * 5 + ["account"] * 5

# 80/20 split. stratify=labels keeps the class proportions the same in both
# parts; random_state fixes the shuffle so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    samples, labels, test_size=0.2, stratify=labels, random_state=42
)

<h1>Serialization to JSONL for Training</h1>

In [None]:
import json

def serialize_to_jsonl(data_entries, output_path='output.jsonl'):
    """Write question/answer entries to a JSONL file, one record per line.

    Each entry is written as ``{"prompt": <question>, "completion": <answer>}``
    followed by a newline. An existing file at ``output_path`` is overwritten;
    an empty ``data_entries`` produces an empty file.

    Args:
        data_entries: Iterable of mappings that each provide the keys
            ``'question'`` and ``'answer'``.
        output_path: Destination file. Defaults to ``'output.jsonl'`` in the
            current working directory, which is what the function always
            wrote to before this parameter existed.

    Raises:
        KeyError: If an entry lacks ``'question'`` or ``'answer'``. Records
            written before the failing entry remain in the file.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as file:
        for entry in data_entries:
            record = {
                'prompt': entry['question'],
                'completion': entry['answer']
            }
            file.write(json.dumps(record) + '\n')

# Example entries: each dict carries the 'question' and 'answer' keys that
# serialize_to_jsonl reads.
data_entries = [
    {'question': 'What is the capital of France?', 'answer': 'Paris'},
    {'question': 'What is the largest planet in our solar system?', 'answer': 'Jupiter'}
]
# Writes both records to output.jsonl in the current working directory.
serialize_to_jsonl(data_entries)

<h1>Uploading Data for Fine-Tuning with OpenAI</h1>

In [None]:
from openai import OpenAI

# Assumes the OpenAI library is installed and the OPENAI_API_KEY environment
# variable is set. With no api_key argument the client reads the key from that
# variable, so no secret is stored in the notebook.
client = OpenAI()

# NOTE(review): no cell in this notebook writes training_data.jsonl (the
# serialization cell writes output.jsonl) - confirm which file should be
# uploaded.
# The context manager closes the file handle once the upload call returns.
with open("training_data.jsonl", "rb") as training_file:
    response = client.files.create(
        file=training_file,
        purpose='fine-tune'
    )

# The v1 client returns a model object rather than a dict, so the ID is read
# as an attribute; response['id'] raises TypeError.
print("Uploaded file ID:", response.id)