### Working with Files

#### Reading and Writing Text Files

In [None]:
# Read the entire file into one string, then display it.
# The `with` block closes the file as soon as the read finishes.
with open("example.txt", "r") as fh:
    content = fh.read()
print(content)

In [None]:
# Write a single string to a file.
# Mode "w" creates the file if needed and truncates any existing content.
with open("output.txt", "w") as out_file:
    out_file.write("This is a sample output.")

In [None]:
# Iterate over the file one line at a time instead of loading it all at once.
with open("example.txt", "r") as fh:
    for raw_line in fh:
        # strip() drops leading/trailing whitespace, including the trailing newline
        stripped = raw_line.strip()
        print(stripped)

#### Working with CSV Files 

In [None]:
# Using the csv library to process structured text.
import csv

# newline="" hands line-ending handling to the csv module, so quoted fields
# that contain embedded newlines are parsed as a single field.
with open("Lagos Precipitation Truncated.csv", "r", newline="") as file:
    reader = csv.reader(file)
    for row in reader:
        print(row)  # each row is a list of strings, one per column

#### Working with JSON Files

In [None]:
# Parse a JSON file into the corresponding Python object and display it.
import json

with open("example.json", "r") as json_file:
    data = json.load(json_file)
print(data)

### Text Preprocessing for Prompt Engineering


#### 1. Tokenization

In [None]:
# Tokenization Example
# str.split() with no arguments splits on runs of whitespace; any punctuation
# stays attached to the neighbouring word.
text = "AI is transforming the world"
words = text.split()
print(words) # Output: ['AI', 'is', 'transforming', 'the', 'world']

#### 2. Text Normalization


Normalization puts text into a consistent form. Two common steps are converting text to lowercase and removing punctuation.

In [None]:
# Text Normalization
# Lowercasing makes "Prompt" and "prompt" compare equal.
text = "This is Prompt Engineering."
lowered = text.lower()
print(lowered)  # Output: this is prompt engineering.

In [None]:
# Removing Punctuation
import string

text = "Hello, World!"

# Build a translation table whose third argument lists characters to delete;
# string.punctuation holds the ASCII punctuation characters.
punctuation_remover = str.maketrans("", "", string.punctuation)
clean_text = text.translate(punctuation_remover)

print(clean_text)   # Output: Hello World

#### 3. Stemming and Lemmatization

In [None]:
import nltk

In [None]:
# Download the WordNet corpus; the WordNetLemmatizer used in the
# lemmatization cell below looks words up in it.
nltk.download('wordnet')

In [None]:
# Using nltk.stem for stemming and lemmatization
from nltk.stem import PorterStemmer

# Stemming strips suffixes by rule, so a stem need not be a dictionary word.
stemmer = PorterStemmer()
words = ["running", "runner", "ran", "documented", "decorating"]
stemmed_words = [stemmer.stem(word) for word in words]
print(stemmed_words)    # Output: ['run', 'runner', 'ran', 'document', 'decor']

from nltk.stem import WordNetLemmatizer

# Lemmatization maps a word to its dictionary form using the WordNet corpus
# (requires nltk.download('wordnet')). lemmatize() treats every word as a noun
# unless a `pos` argument is passed, so verb forms such as "running" are
# largely left unchanged here; pass pos="v" to lemmatize them as verbs.
lemmatizer = WordNetLemmatizer()
words = ["running", "runner", "ran", "documented", "accelerated", "decorating", "going", "does"]
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print(lemmatized_words)

#### 4. Stopwords Removal

In [None]:
# Removing Stopwords:
# Download NLTK's stopwords corpus, then import the corpus reader that the
# next cell queries with stopwords.words("english").
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# A set gives constant-time membership tests for the filter below.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

words = ["this", "is", "an", "example"]

# Keep only the tokens that are not in the stopword set.
filtered_words = [token for token in words if token not in stop_words]
print(filtered_words)  # Output: ['example']

#### 5. Removing Noise

In [None]:
# Noise Removal:
import re   # regular expressions

text = "AI123 is transforming the world! @OpenAI"

# Match every character that is neither an ASCII letter nor whitespace,
# and delete it (digits, punctuation and symbols all go).
non_letter = re.compile(r"[^a-zA-Z\s]")
clean_text = non_letter.sub("", text)
print(clean_text)  # Output: AI is transforming the world OpenAI

In [None]:
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus; remove_stopwords_and_punctuation below
# reads it via stopwords.words("english").
nltk.download('stopwords')

# Step 1: Read, clean, and tokenize a text file
def read_and_tokenize(file_path):
    """Read a text file and return its lowercase, punctuation-free tokens.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of whitespace-separated tokens on success. If the file does
        not exist, the error message string
        "File not found. Please provide a valid file path." is returned
        instead, so callers must check for a ``str`` result before
        iterating over it as tokens.
    """
    # Only the open/read can raise FileNotFoundError, so keep the try minimal.
    try:
        # Explicit encoding avoids platform-dependent default decoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        return "File not found. Please provide a valid file path."

    # Clean the text (remove ASCII punctuation and convert to lowercase)
    clean_text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    return clean_text.split()

# Step 2: Remove stopwords and punctuation
def remove_stopwords_and_punctuation(words):
    """Drop English stopwords from ``words`` and join the rest with spaces.

    The comparison is case-insensitive (each token is lowercased before the
    lookup), but the surviving tokens are returned with their original case.
    This function itself only filters stopwords; it does not strip
    punctuation from the tokens it is given.

    Args:
        words: Iterable of string tokens.

    Returns:
        A single space-separated string of the tokens that are not stopwords.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = (token for token in words if token.lower() not in english_stopwords)
    return " ".join(kept)

# Step 3: Generate a simple prompt
def generate_prompt(cleaned_text):
    """Return an instruction prompt with ``cleaned_text`` appended to it.

    Args:
        cleaned_text: The preprocessed text to embed in the prompt.

    Returns:
        The prompt string.
    """
    return f"Based on the following input, provide insights: {cleaned_text}"

# Example workflow
file_path = "example.txt"  # Replace with your file path
tokens = read_and_tokenize(file_path)

# read_and_tokenize returns an error message (a str) instead of a token list
# when the file is missing. Iterating over that string would treat each
# character as a token and produce a meaningless prompt, so report the error
# and stop instead.
if isinstance(tokens, str):
    print(tokens)
else:
    cleaned_text = remove_stopwords_and_punctuation(tokens)
    prompt = generate_prompt(cleaned_text)
    print("Generated Prompt:", prompt)
