In [1]:
import pandas as pd
from langchain_community.document_loaders import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import CSVLoader
from langchain_core.documents import Document
import os

## 筛选跟每个service program 相关的posts

In [2]:
# Build one LangChain Document per CSV row found in the sentiment-results folder.
directory_path = 'senti_results_SZ'
all_docs = []

for filename in os.listdir(directory_path):
    # Anything that is not a CSV file is ignored.
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(directory_path, filename)
    df = pd.read_csv(file_path)

    for index, row in df.iterrows():
        # The page content is the row rendered as text; the metadata records
        # the originating file and the row's index in that file so the row can
        # be located again later.
        all_docs.append(
            Document(
                page_content=row.to_string(),
                metadata={'source': file_path, 'row': index},
            )
        )

print(f"Loaded {len(all_docs)} documents (rows) from CSVs")

Loaded 58738 documents (rows) from CSVs


In [3]:
# Split the row-level documents into chunks before embedding
# (chunk_size=1000, chunk_overlap=10, measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=10,
)
texts_all = text_splitter.split_documents(all_docs)

In [4]:
# Embedding model handed to the Chroma vector store as its embedding function.
hg_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

#### Create VectorDB

In [None]:
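# One-time build of the persisted Chroma collection from `texts_all`.
# Kept commented out so that re-running the notebook does not rebuild the
# collection; uncomment and run once if 'docs/chroma_rag/' does not exist yet.
# persist_directory = 'docs/chroma_rag/'
# posts_langchain_chroma = Chroma.from_documents(
#     documents=texts_all,
#     collection_name="SZ_posts_senti",
#     embedding=hg_embeddings,
#     persist_directory=persist_directory,
#     collection_metadata={"hnsw:space": "cosine"}
# )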

#### Load VectorDB

In [None]:
# Open the Chroma collection stored under `persist_directory`.
persist_directory = 'docs/chroma_rag/'
posts_langchain_chroma = Chroma(
    embedding_function=hg_embeddings,
    persist_directory=persist_directory,
    collection_name="SZ_posts_senti",
    # Use cosine distance for the HNSW index. The original author's note says
    # this keeps the similarity score within [0, 1] -- NOTE(review): confirm
    # against the Chroma / LangChain relevance-score documentation.
    collection_metadata={"hnsw:space": "cosine"},
)

#### Description Data for Service Program

In [None]:
# Preview the first rows of the service-program description table.
(
    pd.read_csv("service_program_data/SPD_SZ_zh.csv")
    .head()
)

In [None]:
import pandas as pd
from IPython.display import display, HTML

# For every service program, query the vector store with a text built from the
# program's first three columns and keep the retrieved documents in `results`.
# Afterwards print a short preview of the hits for each program.

# Load the service-program descriptions
service_program_df = pd.read_csv('service_program_data/SPD_SZ_zh.csv')

# One entry per service program: the query text and the documents it retrieved
results = []

for index, row in service_program_df.iterrows():
    # Query text = the first three columns joined with spaces
    service_dimension = ' '.join([str(row[col]) for col in service_program_df.columns[:3]])

    # Retrieve the 100 most similar chunks (adjust k as needed)
    docs = posts_langchain_chroma.similarity_search(service_dimension, k=100)

    results.append({
        'service_dimension': service_dimension,
        'similar_documents': docs
    })

    # Progress indicator every 10 service programs
    if index % 10 == 0:
        print(f"Processed {index} rows")

# Preview: for each service program show its first 10 retrieved documents
for result in results:
    print(f"Service dimension: {result['service_dimension']}")
    print("\nSimilar documents:")
    for j, doc in enumerate(result['similar_documents'][:10]):
        print(f"\nDocument {j+1}:")
        # Show only the first 200 characters; the previous slice [200:] printed
        # everything *after* the first 200 characters instead.
        print(f"Content: {doc.page_content[:200]}...")
        print(f"Metadata: {doc.metadata}")
        print(f"Metadata: {doc.metadata.get('source', 'N/A')}, Row: {doc.metadata.get('row', 'N/A')}")
    print("\n")

In [None]:
# Load one of the cleaned sentiment CSVs and preview its first five rows.
df = pd.read_csv(
    'senti_results_SZ/senti_cleaned_深圳 地铁 201901 1.0.csv'
)
df.head(n=5)

In [14]:
import pandas as pd
from IPython.display import display, HTML
import os
import glob


# For every service program, retrieve the most relevant post chunks from the
# vector store, map each chunk back to its row in the original CSV, and write
# those original rows to one CSV file per service program.

# Retrieval settings: maximum number of chunks fetched per query and the
# minimum relevance score a chunk must reach to be kept.
TOP_K = 300
SCORE_THRESHOLD = 0.2

# Create directory for output if it doesn't exist
output_dir = 'service_program_matches_SZ'
os.makedirs(output_dir, exist_ok=True)

# Load the service program CSV file
service_program_df = pd.read_csv('service_program_data/SPD_SZ_zh.csv')

# Cache of source dataframes so each original CSV is read at most once
source_df_cache = {}

for index, row in service_program_df.iterrows():
    # Query text = the first three columns joined with spaces
    service_dimension = ' '.join([str(row[col]) for col in service_program_df.columns[:3]])

    # Each result is a (document, relevance_score) tuple
    docs = posts_langchain_chroma.similarity_search_with_relevance_scores(
        service_dimension, k=TOP_K, score_threshold=SCORE_THRESHOLD
    )

    # Group the matched row numbers by the CSV file they came from.
    # The text splitter may produce several chunks for one CSV row, so the
    # same (source, row) pair can be retrieved more than once; keep only the
    # first occurrence so each original post is exported once.
    matches_by_source = {}
    seen_rows = set()
    for matched_doc, _score in docs:
        source = matched_doc.metadata.get('source', None)
        row_index = matched_doc.metadata.get('row', None)
        if not source or row_index is None:
            continue

        # The row number may come back as a numeric string
        try:
            row_index = int(row_index)
        except (ValueError, TypeError):
            continue

        if (source, row_index) in seen_rows:
            continue
        seen_rows.add((source, row_index))
        matches_by_source.setdefault(source, []).append(row_index)

    # Collect the matching rows from the original source files
    all_matched_rows = []
    for source, row_indices in matches_by_source.items():
        # Load the source file if not in cache
        if source not in source_df_cache:
            try:
                source_df_cache[source] = pd.read_csv(source)
            except Exception as e:
                print(f"Error loading {source}: {e}")
                continue

        source_df = source_df_cache[source]

        for row_idx in row_indices:
            if not 0 <= row_idx < len(source_df):
                # Stored row number does not exist in the file as it is now
                print(f"Error accessing row {row_idx} in {source}: row index out of range")
                continue
            row_data = source_df.iloc[row_idx].to_dict()
            row_data['original_source'] = source
            row_data['original_row'] = row_idx
            all_matched_rows.append(row_data)

    # Write one CSV per service program; nothing is written when no row matched
    if all_matched_rows:
        matches_df = pd.DataFrame(all_matched_rows)
        safe_filename = f"service_program_{index}_matches.csv"
        matches_df.to_csv(os.path.join(output_dir, safe_filename), index=False)

    # Print progress
    if index % 10 == 0:
        print(f"Processed {index} rows")

print(f"Completed! All match files saved to {output_dir} directory.")

# Summarise the first 10 CSV files (in sorted name order) found in the output directory
print("Created CSV files with matching posts:")
csv_outputs = sorted(name for name in os.listdir(output_dir) if name.endswith('.csv'))
for filename in csv_outputs[:10]:
    try:
        match_df = pd.read_csv(os.path.join(output_dir, filename))
        print(f"- {filename}: {len(match_df)} matched original posts")
    except Exception as e:
        # Report the reason instead of swallowing every error with a bare except
        print(f"- {filename}: Could not read file ({e})")

Processed 0 rows
Processed 10 rows
Processed 20 rows
Completed! All match files saved to service_program_matches_SZ directory.
Created CSV files with matching posts:
- service_program_0_matches.csv: 4 matched original posts
- service_program_10_matches.csv: 300 matched original posts
- service_program_11_matches.csv: 300 matched original posts
- service_program_12_matches.csv: 180 matched original posts
- service_program_13_matches.csv: 300 matched original posts
- service_program_14_matches.csv: 300 matched original posts
- service_program_15_matches.csv: 300 matched original posts
- service_program_16_matches.csv: 300 matched original posts
- service_program_17_matches.csv: 5 matched original posts
- service_program_18_matches.csv: 6 matched original posts


In [None]:
# Inspection only: evaluate the vector store's private `_cosine_relevance_score_fn`
# attribute so the notebook displays it as the cell output.
# NOTE(review): presumably the function that turns cosine distances into
# relevance scores -- confirm against the LangChain source before relying on it.
posts_langchain_chroma._cosine_relevance_score_fn