In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
DATA_PATH = Path() / "../data"
#DATA_PATH = Path('/content/gdrive/My Drive/Colab Notebooks/VC_data_Cleaning_With_LLM/data')
DATA_PATH.mkdir(parents=True,exist_ok=True)

def load_data(filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    csv_path = data_path / filename
    return pd.read_csv(csv_path,encoding=encoding)

def save_data(data, filename, data_path=DATA_PATH,encoding='ISO-8859-1'):
    csv_path = data_path / filename
    data.to_csv(csv_path, index=False,encoding='ISO-8859-1')

PLOT_PATH = Path() / "../plot"
#PLOT_PATH = Path('/content/gdrive/My Drive/Colab Notebooks/VC_data_Cleaning_With_LLM/plot')
PLOT_PATH.mkdir(parents=True,exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300, transparent=True):
    path = PLOT_PATH / f"{fig_id}.{fig_extension}"
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution, transparent=transparent)

RESULT_PATH = Path() / "../result"
#PLOT_PATH = Path('/content/gdrive/My Drive/Colab Notebooks/VC_data_Cleaning_With_LLM/plot')
RESULT_PATH.mkdir(parents=True,exist_ok=True)
def save_result(data, filename, data_path=RESULT_PATH):
    csv_path = data_path / filename
    data.to_csv(csv_path, index=False, encoding='utf-8-sig')


In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from kor.extraction import create_extraction_chain
from kor.nodes import Object,Text, Number

In [3]:
import json
def json_dump(json_object):
    json_formatted_str = json.dumps(json_object, indent=2,ensure_ascii= False)
    print(json_formatted_str)

In [4]:
from dotenv import load_dotenv
import os

load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [5]:
# Import the openai module
import openai

In [6]:
from typing import Optional

from langchain_core.pydantic_v1 import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    role: Optional[str] = Field(
        ..., description="The role or title of this person. "
                                )
    company: Optional[str] = Field(
        ..., description="The company this person is working in."
    )
    ai_related: Optional[bool] = Field(
        ..., description="Whether this person is related to AI."
    )

In [7]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract"
            "return null for the attribute's value."

            "The text we input is a person's biography. "
            "You should treat every input text as a separate entity."

            "For each person's biography, you should consider:"
            "If this person has several roles, we should only annotate one."
            "If this person is a AI company founder or co-founder, annotate him as a co-founder."
            "If this person is a researcher in AI feild, annotate this person as a researcher."
            "If you can't find any role, try to conclude the role from the context, if you still can't find any role, please annotate the role as NA."

            "For all the biographies with roles as NA, please read the context and try to judge if the biography is from a enterprise account's biography. "
            "If it is, please annotate the role as 'Y'. If you can't judge, keep the role annotation as 'NA'."

            "when you extract company name, please extract the full name of the company and exclude any other punctuation mark such as @."
            "If there is no company name, please annotate the company name as NA."

            "If this person is related to AI, or anything about AI, please annotate the AI_related as true, otherwise annotate it as false."

        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

In [8]:
from typing import List, Optional
class Data(BaseModel):
    """Extracted data about people."""

    # Creates a model so that we can extract multiple entities.
    people: List[Person]

In [9]:
import uuid
from typing import Dict, List, TypedDict
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)

class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.
    """

    input: str  # This is the example text
    tool_calls: List[BaseModel]  # Instances of pydantic model that should be extracted


In [11]:
def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    This code is an adapter that converts our example to a list of messages
    that can be fed into a chat model.

    The list of messages per example corresponds to:

    1) HumanMessage: contains the content from which content should be extracted.
    2) AIMessage: contains the extracted information from the model
    3) ToolMessage: contains confirmation to the model that the model requested a tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.
    """
    messages: List[BaseMessage] = [HumanMessage(content=example["input"])]
    openai_tool_calls = []
    for tool_call in example["tool_calls"]:
        openai_tool_calls.append(
            {
                "id": str(uuid.uuid4()),
                "type": "function",
                "function": {
                    # The name of the function right now corresponds
                    # to the name of the pydantic model
                    # This is implicit in the API right now,
                    # and will be improved over time.
                    "name": tool_call.__class__.__name__,
                    "arguments": tool_call.json(),
                },
            }
        )
    messages.append(
        AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls})
    )
    tool_outputs = example.get("tool_outputs") or [
        "You have correctly called this tool."
    ] * len(openai_tool_calls)
    for output, tool_call in zip(tool_outputs, openai_tool_calls):
        messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))
    return messages

In [12]:
examples = [
    (
        "Research Scientist at Google DeepMind. I tweet about deep learning (research + software), music, generative models (personal account).",
        Person(role="Research Scientist", company="Google DeepMind", interest="deep learning, music, and generative models", ai_related=True),
    ),
    (
        "I make videos.\nSkill &gt; Destiny.\nvi / vim",
        Person(role=None, company=None, interest=None, ai_related=False),
    ),
    (
        "ML Researcher, co-leading Superalignment @OpenAI. Optimizing for a post-AGI future where humanity flourishes.",
        Person(role="ML Researcher", company="OpenAI", interest="AGI", ai_related=True),
    )
]


messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages({"input": text, "tool_calls": [tool_call]})
    )

### Create an extractor

In [13]:
llm = ChatOpenAI(
    model_name = 'gpt-4-turbo-preview',
    temperature = 0,
    openai_api_key = OPENAI_API_KEY
)

In [14]:
runnable = prompt | llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)

  warn_beta(


In [26]:
df = load_data('twitter_combined.csv')
# Drop NaN values from the 'bio' column
df = df.dropna(subset=['bio'])

# Combine all text in the 'bio' column into a list, considering only the first 100 texts
documents = df['bio'].head(10).tolist()
#documents = df['bio'].tolist()

In [15]:
documents

['Making medicines differently to bring better drugs faster to the patients who benefit most with ML and data at scale.',
 '#LearnWithoutLimits on Coursera. Access online courses and degrees from world-class universities and companies. Visit https://t.co/iZWWGG1ypn for support.',
 'The leading news source for higher education. Get our newsletters: https://t.co/zfAnZfGsHc',
 "Sharing things I'm learning through my foundation work and other interests.",
 'Policy research at @openai. I mostly tweet about AI, animals, and sci-fi. He/him. Views my own.',
 'Co-Founder of OpenAI',
 'Chief Scientist, Google DeepMind and Google Research. Co-designer/implementor of things like @TensorFlow, MapReduce, Bigtable, Spanner, Gemini .. (he/him)',
 'VP of Research & Deep Learning Lead, Google DeepMind. Gemini co-lead.\n\nPast: AlphaStar, AlphaFold, AlphaCode, WaveNet, seq2seq, distillation, TF.',
 '@AnthropicAI, ONEAI OECD, co-chair @indexingai, writer @ https://t.co/3vmtHYkaTu Past: @openai, @business 

In [16]:
text='''
  Policy research at @openai. 
  I mostly tweet about AI, animals, and sci-fi. He/him. Views my own.
  Chief Scientist, Google DeepMind and Google Research. Co-designer/implementor of things like @TensorFlow, MapReduce, Bigtable, Spanner, Gemini .. (he/him)
'''
texts = ['Making medicines differently to bring better drugs faster to the patients who benefit most with ML and data at scale.',
 '#LearnWithoutLimits on Coursera. Access online courses and degrees from world-class universities and companies. Visit https://t.co/iZWWGG1ypn for support.',
 'The leading news source for higher education. Get our newsletters: https://t.co/zfAnZfGsHc']

text = documents

output = runnable.invoke({"text": text, "examples": messages})

In [27]:
output

NameError: name 'output' is not defined

In [28]:
# Extract information to a list of dictionaries
people_data = [{"role": person.role, "company": person.company, "ai_related": person.ai_related} for person in output.people]

# Convert to DataFrame
bio_df = pd.DataFrame(people_data)

bio_df.head(40)

NameError: name 'output' is not defined

In [15]:
df_new = load_data('twitter_combined.csv')
# Drop NaN values from the 'bio' column
df_new = df_new.dropna(subset=['bio'])
# Sort the DataFrame by 'tweetsCount' in descending order
df_new = df_new.sort_values(by='followersCount', ascending=False)


In [16]:
# Apply the function to remove illegal XML characters to every string in the DataFrame
# and save directly to an Excel file without the intermediate cleaning step.
def clean_and_save_dataset(dataframe):
    # Define the function to remove illegal XML characters
    def remove_illegal_xml_characters(s):
        if isinstance(s, str):
            return ''.join(char for char in s if ord(char) >= 32 or char in '\t\n\r')
        else:
            return s
    
    # Apply the cleaning function across the DataFrame
    cleaned_df = dataframe.applymap(remove_illegal_xml_characters)
    return cleaned_df

In [17]:
df_new = clean_and_save_dataset(df_new)

In [76]:
df_new.shape

(7390, 18)

In [19]:
#df_new = df_new.iloc[:120]

In [34]:
df_new.head()

Unnamed: 0,profileUrl,screenName,userId,name,imgUrl,backgroundImg,bio,website,location,createdAt,followersCount,friendsCount,tweetsCount,certified,following,followedBy,query,timestamp
1300,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913202,543248,17013,True,,,https://twitter.com/demishassabis,2024-04-08T02:19:49.116Z
29,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913188,543248,17013,True,,,https://twitter.com/mustafasuleyman,2024-04-08T02:18:03.055Z
4962,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913181,543248,17013,True,,,https://twitter.com/kaifulee,2024-04-08T02:16:54.116Z
4282,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913149,543248,17013,True,,,https://twitter.com/AndrewYNg,2024-04-08T02:13:54.721Z
5787,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913132,543248,17013,True,,,https://twitter.com/drfeifei,2024-04-08T02:12:18.820Z


In [18]:
chunk_size = 5
chunks = [df_new.iloc[i:i + chunk_size  ] for i in range(0, df_new.shape[0], chunk_size)]

In [77]:
print(len(chunks))
print(df_new.shape[0])
print(df_intermediate.shape)
chunks[-2].shape

739
7390
(7245, 3)


(10, 18)

In [19]:
df_intermediate = pd.DataFrame()

In [None]:
# Process each chunk and store the results
processed_chunks = 0
# extracted_info_list = pd.DataFrame()
saved_dataframes = {}
for i, chunk in enumerate(chunks):
#for i, chunk in enumerate(chunks[:119], start=119):
    chunk_text = chunk['bio'].tolist()
    for text in chunk_text:
    
        extracted_info = runnable.invoke({"text": text, "examples": messages})
    
        bio_data = [{"role": person.role, "company": person.company, "ai_related": person.ai_related} for person in extracted_info.people]
        bio_df_new = pd.DataFrame(bio_data)

        # extracted_info_list = pd.concat([extracted_info_list, bio_df_new])
        df_intermediate = pd.concat([df_intermediate, bio_df_new])
    processed_chunks += 1
    print(f'Processed {processed_chunks} chunks.')

    print(bio_df_new.shape[0])
    
    # Save intermediate results every 900 chunks
    # if processed_chunks % 30 == 0:
    #     df_intermediate = pd.concat([df_intermediate, bio_df_new])
    #     saved_dataframes[f'dataframe222_{processed_chunks}'] = bio_df_new.copy()
    #     print(f'Saved dataframe222_{processed_chunks} in memory.')

    #     intermediate_filename = f'intermediate_results222_{processed_chunks}.csv'
    #     save_result(bio_df_new, intermediate_filename)
    #     print(f'Saved {intermediate_filename}.')

In [39]:
df_intermediate.head()

Unnamed: 0,role,company,ai_related
0,President,,False
1,President,,False
2,President,,False
3,President,,False
4,President,,False


In [22]:
extracted_info_list

Unnamed: 0,role,company,ai_related
0,President,,False
1,President,,False
2,President,,False
3,President,,False
4,President,,False
...,...,...,...
25,Founder,"FSecure, WithSecure",False
26,research scientist,DeepMind,True
27,research scientist,DeepMind,True
28,research scientist,DeepMind,True


In [40]:
save_result(df_intermediate, 'result.csv')

In [64]:
df_intermediate.shape 

(7245, 3)

In [65]:
df_new = df_new.reset_index(drop=True)
extracted_info_list = df_intermediate.reset_index(drop=True)  # Assuming it's a DataFrame
result = pd.concat([df_new, extracted_info_list], axis=1)
extracted_info_list.shape

(7245, 3)

In [66]:
result = pd.concat([df_new, extracted_info_list], axis=1)
print(extracted_info_list.shape)
print(df_new.shape)
print(result.shape)

(7245, 3)
(7390, 18)
(7390, 21)


In [67]:
result.shape

(7390, 21)

In [68]:
save_result(result, 'result.csv')

In [None]:
result = pd.read_csv('../result/twitter_extract_first_60_2.csv')
result

In [70]:
df_new.head(-60)

Unnamed: 0,profileUrl,screenName,userId,name,imgUrl,backgroundImg,bio,website,location,createdAt,followersCount,friendsCount,tweetsCount,certified,following,followedBy,query,timestamp
0,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913202,543248,17013,True,,,https://twitter.com/demishassabis,2024-04-08T02:19:49.116Z
1,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913188,543248,17013,True,,,https://twitter.com/mustafasuleyman,2024-04-08T02:18:03.055Z
2,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913181,543248,17013,True,,,https://twitter.com/kaifulee,2024-04-08T02:16:54.116Z
3,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913149,543248,17013,True,,,https://twitter.com/AndrewYNg,2024-04-08T02:13:54.721Z
4,https://twitter.com/BarackObama,BarackObama,813286,Barack Obama,https://pbs.twimg.com/profile_images/132964752...,https://pbs.twimg.com/profile_banners/813286/1...,"Dad, husband, President, citizen.",barackobamabooks.com,"Washington, DC",Mon Mar 05 22:08:25 +0000 2007,131913132,543248,17013,True,,,https://twitter.com/drfeifei,2024-04-08T02:12:18.820Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7325,https://twitter.com/derek_zu,derek_zu,27884821,Derek Zu,https://pbs.twimg.com/profile_images/914233219...,,"Father, off-road enthusiast, Strategy and prod...",,"San Jose, CA",Tue Mar 31 14:50:42 +0000 2009,182,409,381,False,,,https://twitter.com/AndrewYNg,2024-04-08T02:13:51.667Z
7326,https://twitter.com/hao_ssr,hao_ssr,824426964140249089,Hao Sheng,https://pbs.twimg.com/profile_images/133791576...,,@Stanford CS/ICME PhD student of @5harad and @...,,,Thu Jan 26 01:21:20 +0000 2017,181,121,20,False,,,https://twitter.com/AndrewYNg,2024-04-08T02:13:50.237Z
7327,https://twitter.com/Sohilnewa,Sohilnewa,207837134,Sohil Newa,https://pbs.twimg.com/profile_images/788611411...,https://pbs.twimg.com/profile_banners/20783713...,Introvert || PhD Student at @utarlington || Ma...,linkedin.com/in/sohilshrestâ¦,,Tue Oct 26 03:44:46 +0000 2010,179,82,920,False,,,https://twitter.com/goodfellow_ian,2024-04-08T02:15:22.117Z
7328,https://twitter.com/arnaud_autef,arnaud_autef,1169004884982730753,Arnaud Autef,https://pbs.twimg.com/profile_images/117697332...,https://pbs.twimg.com/profile_banners/11690048...,"Machine Learning Engineer @apple, ex @sisudata...",arnaudautef.com,"Menlo Park, CA",Tue Sep 03 21:51:09 +0000 2019,178,269,613,False,,,https://twitter.com/goodfellow_ian,2024-04-08T02:15:18.304Z


In [None]:
# Process each chunk and store the results
extracted_info_list = []
for chunk in chunks:
    chunk_text = chunk['bio'].tolist()
    extracted_info =  runnable.invoke({"text": chunk, "examples": messages})
    bio_data = [{"role": person.role, "company": person.company, "ai_related": person.ai_related} for person in extracted_info.people]
    bio_df_new = pd.DataFrame(bio_data)
    extracted_info_list.append(bio_df_new)


In [None]:
# Combining the extracted information from all chunks
extracted_info_combined = pd.concat(extracted_info_list).reset_index(drop=True)


In [None]:
# Assuming the order of rows hasn't changed, directly add the extracted info to the DataFrame
df_new.loc[:, 'role'] = extracted_info_combined['role']
df_new.loc[:, 'company'] = extracted_info_combined['company']
df_new.loc[:, 'ai_related'] = extracted_info_combined['ai_related']


In [None]:
df_new.head(20)