# **Assessment for Gen AI/Prompt Engineering Role**


# **Problem Statement 1: Natural Language Processing (NLP)**
Problem: Implement a function to preprocess and tokenize text data.

In [1]:
text = "Hello, world! This is a sample text for Preprocessing & Tokenization."

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_and_tokenize(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    return filtered_tokens

# Example usage
text = "Hello, world! This is a sample text for Preprocessing & Tokenization."
tokens = preprocess_and_tokenize(text)
print(tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['hello', 'world', 'sample', 'text', 'preprocessing', 'tokenization']


# **Problem Statement 2: Text Generation**
Problem: Create a basic text generation model using a pre-trained transformer (e.g., GPT-3).

In [3]:
prompt = "In a distant future, humanity has reached the stars, and"


In [5]:
from transformers import pipeline

def generate_text(prompt, model_name="gpt-2"):
    # Load the pre-trained model and tokenizer from Hugging Face
    generator = pipeline("text-generation", model=model_name)

    # Generate text based on the provided prompt
    result = generator(prompt, max_length=50, num_return_sequences=1)

    # Return the generated text
    return result[0]["generated_text"]

# Example usage
prompt = "In a distant future, humanity has reached the stars, and"
generated_text = generate_text(prompt, model_name="gpt-2")
print(generated_text)


OSError: gpt-2 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# **Problem Statement 3: Prompt Engineering**
Problem: Design and evaluate prompts to improve the performance of a given AI model on a specific task (e.g., summarization, question answering).

In [6]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score

# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased")

def evaluate_prompt(prompt_template, question, context):
    # Format the prompt
    prompt = prompt_template.replace("[question]", question)

    # Get the answer from the model
    result = qa_pipeline(question=prompt, context=context)

    return result["answer"]

# Example usage with different prompts
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

prompts = [
    "Answer the following question: [question]",
    "Based on the information provided, what is the answer to the question: [question]",
    "What is [question]?"
]

# Evaluate each prompt
for prompt in prompts:
    answer = evaluate_prompt(prompt, question, context)
    print(f"Prompt: {prompt}\nAnswer: {answer}\n")

# Evaluation: Use metrics like accuracy or F1 score based on a test dataset to compare the effectiveness.


config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



Prompt: Answer the following question: [question]
Answer: . The capital

Prompt: Based on the information provided, what is the answer to the question: [question]
Answer: . The capital

Prompt: What is [question]?
Answer: . The capital of France is Paris



# **Problem Statement 4: Data Analysis**
Problem: Analyze a dataset and generate insights using a combination of descriptive statistics and visualizations.

In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv("titanic.csv")

# Display the first few rows of the dataset
data.head()

# Descriptive Statistics
desc_stats = data.describe()
print(desc_stats)

# Handling Missing Values
data = data.dropna(subset=["Age", "Embarked"])  # Example: Dropping rows with missing 'Age' and 'Embarked'

# Visualization: Age Distribution
plt.figure(figsize=(10, 6))
sns.histplot(data['Age'], bins=30, kde=True)
plt.title("Age Distribution of Passengers")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# Visualization: Survival Rate by Gender
plt.figure(figsize=(8, 6))
sns.barplot(x="Sex", y="Survived", data=data)
plt.title("Survival Rate by Gender")
plt.xlabel("Gender")
plt.ylabel("Survival Rate")
plt.show()

# Additional Analysis: Correlation Heatmap
plt.figure(figsize=(12, 8))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# Insights: Interpretation of the findings
# Example: Analysis shows that younger passengers had a higher survival rate, and females had a higher survival rate than males.


FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'

# **Problem Statement 5: Live Coding Session - API Integration**
Problem: Develop a Python script to integrate with an external API and fetch data based on user input

In [8]:
import requests

def get_weather(city_name):
    api_key = "your_api_key_here"  # Replace with your actual API key
    base_url = "http://api.openweathermap.org/data/2.5/weather"

    # Construct the full API URL with parameters
    params = {
        "q": city_name,
        "appid": api_key,
        "units": "metric"  # Fetch temperature in Celsius
    }

    try:
        # Make the API call
        response = requests.get(base_url, params=params)

        # Check if the request was successful
        response.raise_for_status()

        # Parse the JSON response
        weather_data = response.json()

        # Extract and display relevant information
        city = weather_data["name"]
        temperature = weather_data["main"]["temp"]
        description = weather_data["weather"][0]["description"]

        print(f"Weather in {city}:")
        print(f"Temperature: {temperature}°C")
        print(f"Description: {description.capitalize()}")

    except requests.exceptions.HTTPError as http_err:
        if response.status_code == 404:
            print("City not found. Please check the city name and try again.")
        else:
            print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError:
        print("Network error. Please check your internet connection and try again.")
    except requests.exceptions.Timeout:
        print("The request timed out. Please try again later.")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred: {err}")

if __name__ == "__main__":
    city_name = input("Enter city name: ")
    get_weather(city_name)


Enter city name: chennai
HTTP error occurred: 401 Client Error: Unauthorized for url: http://api.openweathermap.org/data/2.5/weather?q=chennai&appid=your_api_key_here&units=metric
