## High level steps

In this session we will perform the following steps to prepare the data, index it, and query it:

1. Download the IMDB dataset
2. Perform text processing
3. Generate the embeddings
4. Create a search index and store the data in it
5. Call the GPT model

In [None]:
#Pandas library to manage dataframes
import pandas as pd
#import numpy for numerical array operations
import numpy as np
#Import the stop words library
from nltk.corpus import stopwords
#Import the nltk library
import nltk
#library for regular expression
import re
#library for parsing html
from bs4 import BeautifulSoup
#library for progress bar
from tqdm import tqdm

### Step 1: Download the IMDB dataset

In [None]:
# URL of the raw IMDB training CSV hosted on GitHub
url="https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv"
# Read the CSV straight from the URL into a DataFrame, decoding it as latin-1
imdb_data=pd.read_csv(url,encoding='latin-1') 

In [None]:
# Preview the first 10 rows to confirm the CSV loaded as expected
imdb_data.head(10)

### Step 2: Perform text processing
- As part of text preprocessing, we expand contractions such as "won't" to "will not"
- Convert the text to lower case
- Strip all http addresses
- Extract the text from HTML tags
- Strip digits and other non-alphabetic characters from each sentence (this step and stop-word removal are currently commented out in `pre_process`)

In [None]:
# Download the NLTK stop-word corpus
nltk.download('stopwords')
# Build the set of English stop words.
# NOTE(review): in the visible code, stop_words is only referenced by a
# commented-out line inside pre_process.
stop_words = set(stopwords.words('english'))

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The substitutions are applied one after another in a fixed order: the
    irregular forms ("won't", "can't") first, then the generic suffixes.
    The patterns are case-sensitive, and ``'s`` is always expanded to
    " is", including when it marks a possessive.
    """
    # (pattern, replacement) pairs; order matters because the irregular
    # forms must be rewritten before the generic "n't" rule can see them.
    expansions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
def pre_process(data):
    """Clean every review in ``data`` and return the cleaned strings as a list.

    Each item is lower-cased, has tokens starting with "http" removed, is
    reduced to its text content by BeautifulSoup (lxml parser), has its
    contractions expanded by ``decontracted`` and is finally stripped of
    leading/trailing whitespace. Progress is reported through tqdm.

    Parameters:
        data: iterable of review strings.

    Returns:
        list of cleaned strings, in the same order as ``data``.
    """
    cleaned = []
    # tqdm wraps the iterable to display a progress bar
    for review in tqdm(data):
        # lower-case, then drop http addresses
        text = re.sub(r"http\S+", "", review.lower())
        # keep only the text content of any HTML markup
        text = BeautifulSoup(text, 'lxml').get_text()
        # expand contractions ("won't" -> "will not", ...)
        text = decontracted(text)
        # The steps below are intentionally left disabled:
        #remove all numbers
        #sentance = re.sub("\S*\d\S*", "", sentance).strip()
        #replacing all characters other than a-z with white space
        #sentance = re.sub('[^A-Za-z]+', ' ', sentance)
        #removing stop words
        #sentance = ' '.join(e.lower() for e in sentance.split() if e.lower() not in stop_words)
        cleaned.append(text.strip())
    return cleaned

In [None]:
# Run the preprocessing pipeline over every review in the 'text' column
preprocessed_reviews = pre_process(imdb_data['text'].values)

In [None]:
# Combine the cleaned review text with the original polarity labels in a new DataFrame
preprocessed_imdb_data = pd.DataFrame({'preprocessed_text':preprocessed_reviews, 'polarity': imdb_data['polarity'] })

In [None]:
# Keep only the first 500 reviews for the rest of the notebook.
# Take an explicit copy: later cells add columns to this frame, and assigning
# to a slice of the original can trigger pandas' SettingWithCopyWarning.
preprocessed_imdb_data = preprocessed_imdb_data[:500].copy()
# Let's see the first few records
preprocessed_imdb_data.head()

### Step 3: Generate Embeddings

In [None]:
# Import required libraries  
import time
import os  
import json  
import openai  
from openai.embeddings_utils import get_embedding, cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

# models
# Name passed as `engine` to get_embedding.
# NOTE(review): with api_type "azure" this is presumably the deployment name — confirm it matches the deployed resource.
EMBEDDING_MODEL = "text-embeddings-ada-002"

# Search service endpoint and OpenAI client settings (hard-coded here, not read from environment variables)
service_endpoint = "https://s360-azcognitivesearch-dev.search.windows.net" 
openai.api_type = "azure"  
openai.api_base = "https://pwopenai.openai.azure.com" 
openai.api_version = "2023-05-15"  

# NOTE(review): 'abc' and 'xyz' look like placeholder keys — supply the real keys
# from a secret store or the environment instead of committing them to the notebook.
cognitiveSearchKey = 'abc'
credential = AzureKeyCredential(cognitiveSearchKey)
openai.api_key = 'xyz'

In [None]:
def get_embeddings_with_retry(df, text_column, engine, max_retries=None):
    """Embed every value of ``df[text_column]``, retrying failed calls.

    Each text is passed to ``get_embedding`` with the given ``engine``. When
    a call raises, the error is printed, the function sleeps for one second
    and the same text is tried again.

    Parameters:
        df: mapping-like object (e.g. a DataFrame) indexed by column name.
        text_column: key of the column holding the texts to embed.
        engine: value forwarded to ``get_embedding`` as ``engine``.
        max_retries: maximum number of retries per text. ``None`` (the
            default) retries indefinitely, as before.

    Returns:
        list with one embedding per text, in iteration order.

    Raises:
        Exception: the last error from ``get_embedding`` once ``max_retries``
            retries have been used up for a text.
    """
    embeddings = []
    for text in df[text_column]:
        failures = 0
        while True:
            try:
                embedding = get_embedding(text, engine=engine)
                embeddings.append(embedding)
                break
            except Exception as e:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                # Format the exception itself: most exceptions have no
                # `.message` attribute, and reading it here would raise
                # AttributeError and abort the retry loop.
                print(f"Error: {e}. Retrying after 1 second...")
                time.sleep(1)
    return embeddings

# Embed every preprocessed review and store the vectors in a new 'textVector' column
preprocessed_imdb_data['textVector'] = get_embeddings_with_retry(preprocessed_imdb_data, 'preprocessed_text', EMBEDDING_MODEL)
# Use the DataFrame index, converted to string, as the 'id' column (the search index declares 'id' as its String key)
preprocessed_imdb_data['id'] =preprocessed_imdb_data.index.astype(str)

In [None]:
# Length of the first row's embedding; the index field below is declared with vector_search_dimensions=1536
len(preprocessed_imdb_data[:1].textVector[0])

In [None]:
# Persist the DataFrame to CSV without the index column.
# NOTE(review): the list-valued textVector cells are presumably written as text, so they would need parsing when read back — confirm before reloading.
preprocessed_imdb_data.to_csv("preprocessed_imdb_data.csv", index=False)

In [None]:
# preprocessed_imdb_data = pd.read_csv("preprocessed_imdb_data.csv")
# preprocessed_imdb_data.head()

In [None]:
# Convert polarity to strings, mapping '1' -> 'True' and '0' -> 'False'
# (the search index declares 'polarity' as a String field)
preprocessed_imdb_data['polarity'] = preprocessed_imdb_data['polarity'].astype(str).replace({'1': 'True', '0': 'False'})
# Preview the result, including the embeddings column
preprocessed_imdb_data.head()

In [None]:
# Print a summary of the DataFrame's columns before uploading
preprocessed_imdb_data.info()

### Step 4: Create Search Index and store

In [None]:
# Create a search index
from azure.core.exceptions import ResourceNotFoundError

index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
# Schema: string key, two searchable/filterable text fields, and the embedding vector
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="preprocessed_text", type=SearchFieldDataType.String, filterable=True, retrievable=True, searchable=True),
    SearchableField(name="polarity", type=SearchFieldDataType.String, filterable=True, retrievable=True, searchable=True),
    SearchField(name="textVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="vector_config"),
]

# HNSW configuration referenced by the textVector field through its name
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="vector_config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Semantic configuration referenced by name at query time
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="preprocessed_text"),
        prioritized_keywords_fields=[SemanticField(field_name="preprocessed_text"), SemanticField(field_name="polarity")],
        prioritized_content_fields=[SemanticField(field_name="preprocessed_text"), SemanticField(field_name="polarity")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name="movie-index", fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
# Drop any previous version of the index so it is rebuilt from scratch.
# On a first run there is nothing to delete, so a "not found" error is ignored.
try:
    index_client.delete_index(index)
except ResourceNotFoundError:
    pass
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [None]:
# Upload the documents to the index: one dict per DataFrame row, keyed by column name
documents = preprocessed_imdb_data.to_dict(orient='records')  
search_client = SearchClient(endpoint=service_endpoint, index_name="movie-index", credential=credential)
# NOTE(review): `result` is never inspected, so the message below reports the number
# of documents submitted, not the number confirmed as indexed.
result = search_client.upload_documents(documents)  
print(f"Uploaded {len(documents)} documents") 

#### Test Hybrid Search

In [None]:
# search function
def strings_ranked_by_relatedness(
    query: str,
    top_n: int = 10
):
    """Search "movie-index" for ``query`` and return up to ``top_n`` hits.

    The query is sent both as search text and as an embedding vector against
    the "textVector" field (k=10), with ``query_type="semantic"`` and the
    "semantic-config" configuration.

    Parameters:
        query: free-text query; also embedded with ``EMBEDDING_MODEL``.
        top_n: maximum number of hits to return.

    Returns:
        list of dicts with keys "Score" (the hit's ``@search.score``),
        "Polarity" and "text", in the order the service returned them.
    """
    search_client = SearchClient(service_endpoint, index_name="movie-index", credential=credential)
    vector = Vector(value=get_embedding(query, engine=EMBEDDING_MODEL), k=10, fields="textVector")

    results = search_client.search(
        search_text=query,
        vectors=[vector],
        select=["preprocessed_text", "polarity"],
        query_type="semantic", query_language="en-us", semantic_configuration_name='semantic-config', query_caption="extractive", query_answer="extractive",
    )
    content = []
    for i, result in enumerate(results):
        # Stop as soon as enough hits are collected instead of draining the
        # rest of the result iterator.
        if i >= top_n:
            break
        content.append({
            "Score": result['@search.score'],
            "Polarity": result['polarity'],
            "text": result['preprocessed_text'],
        })
    return content


### Step 5. Call GPT model

In [None]:
def ask(query):
    """Answer ``query`` with the chat model, using search hits as context.

    Fetches the top 5 hits from ``strings_ranked_by_relatedness``, joins
    their string forms into the user message followed by the query, sends
    it to the "gpt-35-turbo" engine with temperature 0.0 and prints the
    first choice's message content.

    Parameters:
        query: the question to answer.

    Returns:
        None. The model's reply is printed.
    """
    content = strings_ranked_by_relatedness(query, top_n=5)
    # Each hit is a dict; its str() form is passed to the model as context.
    context = " ".join(str(hit) for hit in content)
    completion = openai.ChatCompletion.create(
        engine="gpt-35-turbo",
        temperature=0.0,
        max_tokens=2000,
        messages=[
            {"role": "system", "content": "You are a movie expert who uses the below data to answer questions. Do not reply anything outside the mentioned data."},
            {"role": "user", "content": context + "\n QUery: " + query},
        ],
    )

    print(completion.choices[0].message.content)

In [None]:
# Example: ask a question that is answered from the indexed reviews
ask("Tell me some worst reviews by people.")

## Test

In [None]:
# Restrict the raw data to its first 500 rows (this rebinds imdb_data)
imdb_data = imdb_data[:500]
# List the raw review texts among those rows that mention "Drew Barrymore"
imdb_data[imdb_data['text'].str.contains("Drew Barrymore")].text.to_list()