In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and cache files used in the
# cells below are read from and written to paths under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load data
# Read the reference corpus from the shared drive, then drop rows with no
# text, collapse repeated (text, file) pairs to their first occurrence, and
# renumber the index from 0.
df = pd.read_csv('/content/drive/Shareddrives/Lit/all reference.csv')
df_cleaned = (
    df
    .dropna(subset=["text"])
    .drop_duplicates(subset=["text", "file"])
    .reset_index(drop=True)
)

# Chunk text
def chunk_text(text, max_length=500):
    """Split ``text`` into sentence-aligned chunks of at most ``max_length`` characters.

    Sentences are found by splitting on runs of spaces that follow ``.``,
    ``!`` or ``?``. Consecutive sentences are joined with a single space for
    as long as the joined chunk stays within ``max_length``.

    Args:
        text: The document text. ``None``, non-string values, and empty or
            whitespace-only strings produce no chunks.
        max_length: Maximum chunk length in characters, counting the joining
            spaces. A single sentence longer than this is not split; it is
            emitted as one oversized chunk of its own.

    Returns:
        list[str]: Non-empty, whitespace-trimmed chunks in document order.
    """
    if not isinstance(text, str):
        return []

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?]) +', text):
        sentence = sentence.strip()
        if not sentence:
            # Whitespace-only fragments carry no content and must not
            # produce (or pad) a chunk.
            continue

        # Measure the chunk exactly as it would be stored, including the
        # joining space, so the limit is not exceeded by one character.
        candidate = current_chunk + " " + sentence if current_chunk else sentence
        if len(candidate) <= max_length:
            current_chunk = candidate
            continue

        # The sentence does not fit: flush what has accumulated. Nothing has
        # accumulated when the very first sentence is already too long, and
        # in that case no empty chunk may be emitted.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

# Split every cleaned document into chunks, remembering which file each chunk
# came from. The two columns are iterated directly instead of going through
# DataFrame.iterrows, which builds a Series for every row.
chunked_data = [
    {"text": chunk, "file": source_file}
    for doc_text, source_file in zip(df_cleaned["text"], df_cleaned["file"])
    for chunk in chunk_text(doc_text)
    if chunk  # an empty chunk has no terms and could never match a query
]

# Name the columns explicitly so "text" and "file" exist even when no chunks
# were produced (a DataFrame built from an empty list has no columns).
df_chunks = pd.DataFrame(chunked_data, columns=["text", "file"])

# Vectorize
# Fit TF-IDF on the chunk texts with English stop words removed and the
# vocabulary capped at 50,000 terms. Row i of tfidf_matrix corresponds to
# row i of df_chunks.
vectorizer = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_matrix = vectorizer.fit_transform(df_chunks["text"])

# Search


In [None]:
import pickle

# Cache the fitted TF-IDF matrix on the shared drive so it can be reloaded
# later without re-vectorizing the corpus.
# NOTE(review): only the matrix is written here; the fitted `vectorizer` and
# `df_chunks` are not saved by this cell.
with open("/content/drive/Shareddrives/Lit/tfidf_matrix.pkl", "wb") as matrix_file:
    pickle.dump(tfidf_matrix, matrix_file)


In [None]:
import pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reload the cached TF-IDF matrix from the shared drive.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a cache that was written by this notebook on a trusted drive.
# NOTE(review): this restores `tfidf_matrix` only. The `vectorizer` and
# `df_chunks` objects used by the search function below are not loaded here
# and must already exist in the kernel.
with open("/content/drive/Shareddrives/Lit/tfidf_matrix.pkl", "rb") as matrix_file:
    tfidf_matrix = pickle.load(matrix_file)
def retrieve_top_sentences(query, top_k=20):
    """Return the chunks most similar to ``query`` under TF-IDF cosine similarity.

    Reads three objects from the notebook's global scope: the fitted
    ``vectorizer``, the ``tfidf_matrix`` to search, and ``df_chunks``, whose
    rows are looked up by the row positions of ``tfidf_matrix``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of hits to return. Values ``<= 0`` return an
            empty list.

    Returns:
        list[dict]: Up to ``top_k`` hits, best first. Each hit has ``"text"``
        (the chunk with whitespace runs collapsed to single spaces) and
        ``"metadata"`` holding ``"file"`` and ``"score"``. Chunks whose
        similarity is zero share no indexed term with the query and are
        omitted, so the list may be shorter than ``top_k`` or empty.

    Raises:
        ValueError: If ``tfidf_matrix`` and ``df_chunks`` have different row
            counts, which means they were not built from the same chunk list.
    """
    # A non-positive top_k must not fall through to slicing: [-0:] selects
    # every row and a negative value selects an arbitrary tail.
    if top_k <= 0:
        return []

    # Guard against a matrix (e.g. one reloaded from disk) that does not line
    # up with the chunk table; positional lookups would return wrong text.
    if tfidf_matrix.shape[0] != len(df_chunks):
        raise ValueError(
            "tfidf_matrix has %d rows but df_chunks has %d; rebuild them together"
            % (tfidf_matrix.shape[0], len(df_chunks))
        )

    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Row positions ordered from highest to lowest similarity.
    top_indices = cosine_sim.argsort()[::-1][:top_k]

    results = []
    for i in top_indices:
        score = float(cosine_sim[i])
        if score <= 0.0:
            # Scores are in descending order, so every remaining chunk is
            # also a non-match.
            break
        row = df_chunks.iloc[i]  # single lookup per hit
        results.append({
            "text": re.sub(r"\s+", " ", row["text"]).strip(),
            "metadata": {
                "file": row["file"],
                "score": score
            }
        })
    return results



In [None]:
# Ask for a free-text query and rank the corpus chunks against it.
user_query = input("enter a query: ")
results = retrieve_top_sentences(user_query)

# Flatten each hit to the two columns shown in the results table; the
# similarity score kept under "metadata" is not carried over.
data_for_df = [
    {"text": hit["text"], "file": hit["metadata"]["file"]}
    for hit in results
]

# pandas infers the columns from the dictionary keys.
df_results = pd.DataFrame(data_for_df)

# Last expression of the cell: render at most the first 20 rows.
df_results.head(20)

enter a query: Izanagi Keiki


Unnamed: 0,text,file
0,,超越国家模式：西欧移民的公民融合政策.pdf
1,A research report submitted to the Ministry of...,kim2008.pdf
2,"Cornelius, T . Tsuda, P . L. Martin, and J. F ...",kim2008.pdf
3,Skrentny 2004 “S outh Korea: Importing Undocum...,kim2008.pdf
4,"1982 “International Regimes, T ransactions, an...",kim2008.pdf
5,Policy Convergence in the European Union Becau...,超越国家模式：西欧移民的公民融合政策.pdf
6,"A general feature of these policies, which is ...",超越国家模式：西欧移民的公民融合政策.pdf
7,This inclusiveness is due to the postwar human...,超越国家模式：西欧移民的公民融合政策.pdf
8,"Accordingly, the ﬁrst of the EU’s ‘common basi...",超越国家模式：西欧移民的公民融合政策.pdf
9,"This stance has become a platitude, but one sh...",超越国家模式：西欧移民的公民融合政策.pdf
