### **Import**

In [1]:
import warnings

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    import os
    import json
    import vector_db
    import pandas as pd
    import search_rerank
    import query_generation
    import answer_generation
    from openai import OpenAI
    import weakness_clustering
    import suggestion_creation
    import resource_preprocessing
    import weakness_identification
    import suggestion_postprocessing
    from qdrant_client import QdrantClient
    from googleapiclient.discovery import build
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.cross_encoder import CrossEncoder
    from grobid_client_python.grobid_client.grobid_client import GrobidClient

### **General Settings**

In [2]:
# Configuration cell: data paths, collection names, model identifiers,
# API credentials, and the shared client/model objects used by later cells.

# Data paths (blank placeholders; fill in before running the pipeline)
path_tweets_raw = ''
path_tweets_raw_shuffled = ''
path_tweets_clean_shuffled = ''

path_abstracts = ''

path_suggestions = ''

# Vector DB (Qdrant)
abstract_collection_name = 'abstract_collection'
tweet_collection_name = 'tweet_collection'
limit_results = 10         # handed to search_rerank.retrieve
limit_results_rerank = 10  # handed to search_rerank.rerank

# SentenceTransformer models
embedding_model = 'all-MiniLM-L6-v2'
cluster_embedding_model = 'all-mpnet-base-v2'
cross_encoder_model = 'ms-marco-MiniLM-L-6-v2'

# OpenAI models
generative_model = "gpt-4-0125-preview"
cluster_embedder_gpt = "text-embedding-3-large"

# Credentials are read from the environment so that secrets never have to be
# typed into (and committed with) the notebook. Each lookup falls back to an
# empty string, which matches the previous placeholder value.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

# Semantic Scholar
x_api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY", "")

# Google Custom Search
GOOGLE_CSE_ID = os.environ.get("GOOGLE_CSE_ID", "")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)

# Set up models and clients.
# OpenAI() is constructed without an explicit key, so the key is exported
# through the environment. Because openai_api_key now originates from the
# environment, this no longer overwrites a configured key with "".
os.environ["OPENAI_API_KEY"] = openai_api_key
openAI_client = OpenAI()
embedder = SentenceTransformer(embedding_model, device="cuda")
cluster_embedder = SentenceTransformer(cluster_embedding_model, device="cuda")
cross_encoder = CrossEncoder(f"cross-encoder/{cross_encoder_model}", device="cuda")

qdrantdb_client = QdrantClient(host="localhost", grpc_port=6334, prefer_grpc=True)

In [3]:
# GROBID setup: build the full-text processing endpoint URL and create the client.
# NOTE(review): the recorded cell output suggests the client constructor checks
# that the server is reachable; confirm against the grobid client library.

GROBID_URL = 'http://localhost:8070'
url_setting = f'{GROBID_URL}/api/processFulltextDocument'
client = GrobidClient(config_path="grobid_client_python/config.json")

GROBID server is up and running


### **Shuffle and clean tweet dataset**

In [None]:
# Year window handed to the dataset preparation step.
start_year = 2014  # only tweets after this year
end_year = 2022  # only tweets before this year

# Step 1: write a shuffled dataset restricted to start_year < year < end_year.
# The three string arguments are column names in the raw tweet file.
resource_preprocessing.prepare_tweet_dataset(
    path_tweets_raw,
    path_tweets_raw_shuffled,
    start_year,
    end_year,
    'id',
    'text',
    'created_at',
)

# Step 2: clean the shuffled tweets, reading the 'text' column.
resource_preprocessing.clean_tweets(
    path_tweets_raw_shuffled,
    path_tweets_clean_shuffled,
    'text',
)

### **Creating the initial KB**

**Create vector store with abstracts**

In [None]:
# Build the Qdrant collection for abstracts and upload them, embedded with `embedder`.
# 'corpusid' and 'abstract' are column names in the file at path_abstracts.
vector_db.create_db(
    path_abstracts,
    True,
    'corpusid',
    'abstract',
    qdrantdb_client,
    abstract_collection_name,
    embedder,
)

**Create vector store with tweets**

In [None]:
# Build the Qdrant collection for the cleaned tweets and upload them, embedded with `embedder`.
# 'id' and 'text_clean' are column names in the file at path_tweets_clean_shuffled.
vector_db.create_db(
    path_tweets_clean_shuffled,
    False,
    'id',
    'text_clean',
    qdrantdb_client,
    tweet_collection_name,
    embedder,
)

## **Suggestion Generation**

**Load Tweets (after shuffling and cleaning)**

In [4]:
# Load the tweets from a JSON Lines file (one JSON record per line).
# NOTE(review): this is an absolute, machine-specific path and not one of the
# path_* settings defined in the configuration cell; confirm it is the intended input.
tweets = pd.read_json(
    "/home/ubuntu/SuggestionGeneration/MTurk/Suggestion_Annotation/output_CX_suggestions_cleaned.json",
    lines=True,
)

**Preprocessing Tweets**

In [5]:
# Run the project preprocessing step on the loaded tweets and rebind `tweets`
# to the result.
# NOTE(review): `source_column` and `text_column` are not assigned in any
# visible cell, so this line raises NameError on a fresh kernel unless they
# are defined elsewhere; add them to the settings cell with the correct
# column names for the loaded file.
tweets = resource_preprocessing.preprocessing(tweets, source_column, text_column)

**Process Weakness Identification**

In [5]:
# Identify weaknesses in the tweets with the generative OpenAI model.
# The call returns three values that are unpacked here; later cells use the first two.
(
    tweet_weakness_batch,
    weakness_batch,
    number_excepts,
) = weakness_identification.identify_weaknesses(
    tweets,
    openAI_client,
    generative_model,
)

**Weakness Clustering**

*SentenceTransformers*

In [None]:
# Cluster the identified weaknesses using the SentenceTransformer cluster embedder.
weakness_cluster_batch = weakness_clustering.get_clusters(
    weakness_batch,
    cluster_embedder,
    cluster_min_size=1,
    cluster_threshold=0.65,
)

**Search Query Generation**

In [21]:
# Generate search queries for the weakness clusters with the generative model.
cluster_queries_batch = query_generation.get_search_queries(
    weakness_cluster_batch,
    openAI_client,
    generative_model,
    cluster_max_size=10,
)

**Individual Improvement Suggestion Generation**

In [None]:
# For every search query: retrieve candidate results, rerank them with the
# cross-encoder, then generate improvement suggestions from the reranked results.
search_queries = cluster_queries_batch['search_query'].to_list()

reranked_query_results = []
improvement_suggestions = []

for query in search_queries:
    # Retrieval receives both Qdrant collection names plus the GROBID URL,
    # the Semantic Scholar key and the Google search service.
    query_results = search_rerank.retrieve(
        query,
        qdrantdb_client,
        tweet_collection_name,
        abstract_collection_name,
        embedder,
        url_setting,
        x_api_key,
        service,
        GOOGLE_CSE_ID,
        limit_results,
    )

    top_results = search_rerank.rerank(cross_encoder, query, query_results, limit_results_rerank)
    reranked_query_results.append(top_results)

    suggestions = suggestion_creation.get_suggestions(query, top_results, openAI_client, generative_model)
    improvement_suggestions.append(suggestions)

# Attach the per-query outputs as new columns (one list entry per query row).
cluster_queries_batch['suggestions'] = improvement_suggestions
cluster_queries_batch['reranked'] = reranked_query_results

In [25]:
# Attach the generated suggestions to the tweets.
# All three inputs are rebound to the values returned by postprocessing.
# NOTE(review): because the inputs are overwritten, re-running this cell feeds
# already-postprocessed data back in; confirm the step tolerates that.
(
    weakness_cluster_batch,
    cluster_queries_batch,
    tweet_weakness_batch,
) = suggestion_postprocessing.postprocessing(
    weakness_cluster_batch,
    cluster_queries_batch,
    tweet_weakness_batch,
)

**Merging Suggestions for each Tweet**

In [26]:
# Generate one answer per tweet with the generative model and store the
# result in the "answer" column.
tweet_weakness_batch["answer"] = answer_generation.get_answer(
    tweet_weakness_batch,
    openAI_client,
    generative_model,
)

**Saving Generations**

In [27]:
# Persist the three result tables as pickle files.
# NOTE(review): all three calls receive the same empty path literal, presumably
# a blanked-out placeholder; supply a distinct destination for each table
# before running, otherwise the writes cannot all succeed or would overwrite
# one another.
tweet_weakness_batch.to_pickle(path="")
cluster_queries_batch.to_pickle(path="")
weakness_cluster_batch.to_pickle(path="")