In [1]:
import pandas as pd
from langchain_community.document_loaders import DirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import CSVLoader
from langchain_core.documents import Document
import os

## 筛选跟每个service program 相关的posts

In [2]:
# Directory path
directory_path = 'senti_results_SZ'
all_docs = []

# Define the Shenzhen metro station names
sz_metro_stations = [
    "机场东", "机场", "宝安中心", "前海湾", "鲤鱼门", "大新", "桃园", "深大", "科苑", "白石洲", 
    "世界之窗", "华侨城", "侨城北", "香蜜湖", "车公庙", "竹子林", "招商银行大厦", "深康", "黄贝岭", 
    "黄贝", "新秀", "莲塘", "梧桐山南", "梧桐山", "盐田路", "沙头角", "海山", "罗湖", "国贸", 
    "老街", "大剧院", "科学馆", "华强路", "岗厦", "会展中心", "香蜜", "深南香蜜", "红树湾", "后海", 
    "南山", "科技园", "大学城", "桃源村", "龙珠", "龙华", "清湖", "碧海湾", "铁路公园", "西丽湖"
]

# Define keywords for filtering
keywords = ["深圳", "深铁"] + sz_metro_stations

# Iterate through CSV files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory_path, filename)
        
        # Use pandas to read the CSV and preserve row information
        df = pd.read_csv(file_path)
        
        # Check if "微博正文" column exists
        if "微博正文" not in df.columns:
            print(f"Warning: '微博正文' column not found in {file_path}. Skipping file.")
            continue
        
        # Filter rows where "微博正文" contains any of the keywords
        filtered_df = df[df["微博正文"].apply(lambda text: 
            any(keyword in str(text) for keyword in keywords) if pd.notna(text) else False
        )]
        
        # Convert each filtered row to a Document with appropriate metadata
        for index, row in filtered_df.iterrows():
            # Convert row to string representation
            content = row["微博正文"]
            # Create metadata with file source and row index
            metadata = {
                'source': file_path,
                'row': index,  # Keep original index from source file
                # You can add other metadata from your CSV if needed
            }
            doc = Document(page_content=content, metadata=metadata)
            all_docs.append(doc)

print(f"Loaded {len(all_docs)} documents (rows) from CSVs after filtering for Shenzhen metro related content")

Loaded 56847 documents (rows) from CSVs after filtering for Shenzhen metro related content


In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
texts_all = text_splitter.split_documents(all_docs)

In [4]:
hg_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)

#### Create VectorDB

In [None]:
# persist_directory = 'docs/chroma_rag/'
# posts_langchain_chroma = Chroma.from_documents(
#     documents=texts_all,
#     collection_name="SZ_posts_senti",
#     embedding=hg_embeddings,
#     persist_directory=persist_directory,
#     collection_metadata={"hnsw:space": "cosine"}
# )

#### Load VectorDB

In [None]:
persist_directory = 'docs/chroma_rag/'
posts_langchain_chroma = Chroma(
    collection_name="SZ_posts_senti",
    persist_directory=persist_directory,
    embedding_function=hg_embeddings,
    collection_metadata={"hnsw:space": "cosine"} # this make sure the similarity socre between [0,1]
)

#### Description Data for Service Program

In [6]:
pd.read_csv("service_program_data/SPD_SZ_zh.csv").head()

Unnamed: 0,Related service quality,Service improvement programs,Objective,City,Period,Investments,Reference
0,舒适度-温度,同一车厢不同温度模式,,深圳,2020-01-01至2021-12-31,不适用,https://www.szmc.net/SMARTC/upload/file/202106...
1,信息服务,智能动态地图显示系统,,深圳,2020-01-01至2021-12-31,不适用,https://www.szmc.net/SMARTC/upload/file/202106...
2,信息服务,车厢拥挤度智能显示系统,智能拥挤度系统将在5条线路上推出：6、7、9、10和11号线,深圳,2020-01-01至2021-12-31,不适用,https://www.szmc.net/SMARTC/upload/file/202106...
3,配套设施-票务服务,实现BOM第三方支付,,深圳,2020-01-01至2021-12-31,不适用,https://www.szmc.net/SMARTC/upload/file/202106...
4,配套设施-票务服务,"成功推出""深圳通""乘车码",,深圳,2020-01-01至2021-12-31,不适用,https://www.szmc.net/SMARTC/upload/file/202106...


In [7]:
df = pd.read_csv('senti_results_SZ/senti_cleaned_深圳 地铁 201901 1.0.csv')
df.head(5)

Unnamed: 0,id,bid,user_id,用户昵称,微博正文,头条文章url,发布位置,艾特用户,话题,转发数,...,点赞数,发布时间,发布工具,微博图片url,微博视频url,retweet_id,ip,Negative,Neutral,Positive
0,4320000000000000.0,H9Z8kpBtd,5847892183,xieqingheng,赶在末班车回去，挺幸运的#深圳地铁#,,深圳·民治,,深圳地铁,0,...,0,2019/1/1 23:46,iPhone客户端,,,,,0.14275,0.229226,0.917541
1,4320000000000000.0,H9Z6e7pT9,5390310654,蚊子嚎叫,2019年第一天晚，深圳地铁温馨提供凌晨地铁，让我终于今天加班不用再打的回去了,,,,,0,...,1,2019/1/1 23:41,小米8青春版 潮流旗舰,['https://wx3.sinaimg.cn/large/005SNc2qly1fyri...,,,,0.1505,0.353703,0.876308
2,4330000000000000.0,HcRYBm5j4,1900132663,Hey朱zai,在周末回港的最后一刻丢了深圳地铁卡和香港八达通(已经陪伴我快六年的学生卡），以及非常喜欢的卡...,,,,,0,...,0,2019/1/20 23:04,💌iPhone 7 Plus,['https://wx1.sinaimg.cn/large/7141b937gy1fzdg...,,,,0.946389,0.24732,0.194873
3,4330000000000000.0,HcRWx7arv,6365144629,LavenderNights,啊啊啊啊啊啊啊要是深圳地铁applepay可以跟上海一样用交通卡就好了…不想弄银联闪付啊并不...,,,,,0,...,0,2019/1/20 22:59,iPhone客户端,,,,,0.926535,0.276487,0.228433
4,4330000000000000.0,HcIvchoo0,1957317823,Kkkkks2,#顺德智障工程#深圳在维修地铁的时候，会有洒水车经常走过。但是顺德容桂连条铁都没有，却学人家...,,佛山·容桂街区,,顺德智障工程,1,...,2,2019/1/19 22:57,坚果手机 Pro 2,,http://f.us.sinaimg.cn/003SxzFclx07qTBwOnwc010...,,,0.931672,0.274792,0.209922


In [17]:
import pandas as pd
from IPython.display import display, HTML
import os
import glob

# Create directory for output if it doesn't exist
similarity_threshold = 0.75

output_dir = f'similarity_threshold={similarity_threshold}'
os.makedirs(output_dir, exist_ok=True)

# Load the service program CSV file
service_program_df = pd.read_csv('service_program_data/SPD_SZ_zh.csv')

# Create a cache for the source dataframes to avoid reloading them
source_df_cache = {}

# Iterate through each row of the CSV
for index, row in service_program_df.iterrows():
    # Concatenate the first three columns as service_dimension
    service_dimension = ' '.join([str(row[col]) for col in service_program_df.columns[1:3] 
                            if pd.notna(row[col]) and str(row[col]) != 'nan'])
    
    # Search for similar documents in the vector database
    docs = posts_langchain_chroma.similarity_search_with_relevance_scores(service_dimension, k=300, score_threshold=similarity_threshold)
    # Create a dictionary to group matched rows by source file
    matches_by_source = {}
    
    # Process each matched document
    # each doc is a tuple
    for doc in docs:
        source = doc[0].metadata.get('source', None)
        row_index = doc[0].metadata.get('row', None)
        
        if source and row_index is not None:
            # Convert row to integer if it's a numeric string
            try:
                row_index = int(row_index)
            except (ValueError, TypeError):
                continue
                
            if source not in matches_by_source:
                matches_by_source[source] = []
            
            matches_by_source[source].append(row_index)
    
    # Create a list to store all matching rows from original sources
    all_matched_rows = []
    
    # For each source file, get the matching rows
    for source, row_indices in matches_by_source.items():
        # Load the source file if not in cache
        if source not in source_df_cache:
            try:
                source_df_cache[source] = pd.read_csv(source)
            except Exception as e:
                print(f"Error loading {source}: {e}")
                continue
        
        source_df = source_df_cache[source]
        
        # Get rows from original source file
        for row_idx in row_indices:
            try:
                if 0 <= row_idx < len(source_df):
                    row_data = source_df.iloc[row_idx].to_dict()
                    row_data['original_source'] = source
                    row_data['original_row'] = row_idx
                    all_matched_rows.append(row_data)
            except Exception as e:
                print(f"Error accessing row {row_idx} in {source}: {e}")
    
    # Convert to DataFrame
    if all_matched_rows:
        matches_df = pd.DataFrame(all_matched_rows)
        
        # Create a filename for this service program
        safe_filename = f"service_program_{index}_matches.csv"
        
        # Save to CSV
        matches_df.to_csv(os.path.join(output_dir, safe_filename), index=False)
    
    # Print progress
    if index % 10 == 0:
        print(f"Processed {index} rows")

print(f"Completed! All match files saved to {output_dir} directory.")

# 创建摘要文件
summary_file_path = os.path.join(output_dir, '0.matched_summary.txt')
with open(summary_file_path, 'w', encoding='utf-8') as summary_file:
    summary_file.write(f"Created CSV files with matching posts:\n")
    for i, filename in enumerate(sorted(os.listdir(output_dir))):
        if filename == '0.matched_summary.txt':  # 跳过摘要文件本身
            continue
        try:
            match_df = pd.read_csv(os.path.join(output_dir, filename))
            summary_line = f"- {filename}: {len(match_df)} matched original posts\n"
            summary_file.write(summary_line)
            print(summary_line, end='')
        except:
            summary_line = f"- {filename}: Could not read file\n"
            summary_file.write(summary_line)
            print(summary_line, end='')

print(f"\nSummary saved to {summary_file_path}")

No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75


Processed 0 rows


No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75
No relevant docs were retrieved using the relevance score threshold 0.75


Processed 10 rows
Processed 20 rows
Completed! All match files saved to similarity_threshold=0.75 directory.
- service_program_11_matches.csv: 5 matched original posts
- service_program_15_matches.csv: 7 matched original posts
- service_program_16_matches.csv: 300 matched original posts
- service_program_20_matches.csv: 34 matched original posts
- service_program_21_matches.csv: 1 matched original posts
- service_program_22_matches.csv: 167 matched original posts
- service_program_4_matches.csv: 300 matched original posts

Summary saved to similarity_threshold=0.75/0.matched_summary.txt
