In [17]:
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
import os

# Load API Key from .env file and configure OpenAI client.
# load_dotenv() runs first so that the os.getenv() lookup below happens after
# the .env file has been processed.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set; nothing here checks for that.
api_key = os.getenv('OPENAI_API_KEY')
# The key is stored on the openai module itself, which get_embedding() uses below.
openai.api_key = api_key

def get_embedding(text, model="text-embedding-3-large"):
    """Get embedding for a given text using OpenAI's embedding model.

    Args:
        text: The string to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the OpenAI embedding model to query.

    Returns:
        A 1-D ``numpy.ndarray`` with the embedding. If the request fails for any
        reason, the error is printed and an all-zero vector is returned instead
        so that a batch run can continue (best-effort behaviour).
    """
    # Default output sizes of the OpenAI embedding models. The zero-vector
    # fallback must have the same length as a real embedding of the requested
    # model, otherwise failed rows end up with a different width than the
    # successful ones when they are written to the same table.
    known_dimensions = {
        "text-embedding-3-large": 3072,
        "text-embedding-3-small": 1536,
        "text-embedding-ada-002": 1536,
    }
    text = text.replace("\n", " ")
    try:
        # NOTE(review): openai.Embedding.create is the interface of the
        # openai package before version 1.0 - confirm the installed version.
        response = openai.Embedding.create(
            input=[text],
            model=model
        )
        embedding = response['data'][0]['embedding']
        return np.array(embedding)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Unknown models keep the historical fallback length of 1024.
        return np.zeros(known_dimensions.get(model, 1024))

def save_embeddings(queries, output_file='keyword_embeddings.csv'):
    """Fetch and save embeddings for each query.

    Args:
        queries: Iterable of query strings to embed via ``get_embedding``.
        output_file: Path of the CSV file to write. An existing file is
            overwritten.

    Returns:
        The ``pandas.DataFrame`` that was written: a ``query`` column followed
        by one ``embedding_<i>`` column per embedding component. With no
        queries the frame (and the file) contain only the ``query`` header.
    """
    embeddings = []
    for query in queries:
        print(f"Fetching embedding for query: {query}")
        embedding = get_embedding(query)
        embeddings.append([query] + embedding.tolist())

    # The column count is taken from the first row; with no rows there is
    # nothing to measure, so fall back to zero embedding columns instead of
    # indexing into an empty list.
    n_dims = len(embeddings[0]) - 1 if embeddings else 0
    columns = ['query'] + [f'embedding_{i}' for i in range(n_dims)]
    df = pd.DataFrame(embeddings, columns=columns)
    df.to_csv(output_file, index=False)
    return df

def main():
    """Embed the hard-coded keyword list and save it to the default output file."""
    queries = [
        "maanviljelijä",
        "käsityöt",
        "kalastus",
        "työläinen",
        "johtaja",
        "karjalaseura",
        "marttaseura",
        "palkinto",
        "menestys",
        "arvostus",
        "USA",
        "Ruotsi",
        "sotavanki",
        "menetys",
        "pettymys",
        "tuomio",
        "sotavankina",
    ]

    # One embedding request per keyword; the results are written as a CSV.
    save_embeddings(queries)


# Run the export when the cell/script is executed directly.
if __name__ == "__main__":
    main()



Fetching embedding for query: maanviljelijä
Fetching embedding for query: käsityöt
Fetching embedding for query: kalastus
Fetching embedding for query: työläinen
Fetching embedding for query: johtaja
Fetching embedding for query: karjalaseura
Fetching embedding for query: marttaseura
Fetching embedding for query: palkinto
Fetching embedding for query: menestys
Fetching embedding for query: arvostus
Fetching embedding for query: USA
Fetching embedding for query: Ruotsi
Fetching embedding for query: sotavanki
Fetching embedding for query: menetys
Fetching embedding for query: pettymys
Fetching embedding for query: tuomio


In [None]:
import pandas as pd
import numpy as np
import openai
from dotenv import load_dotenv
import os

# Load API Key from .env file and configure OpenAI client.
# load_dotenv() runs first so that the os.getenv() lookup below happens after
# the .env file has been processed.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set; nothing here checks for that.
api_key = os.getenv('OPENAI_API_KEY')
# The key is stored on the openai module itself, which get_embedding() uses below.
openai.api_key = api_key

def get_embedding(text, model="text-embedding-3-large"):
    """Get embedding for a given text using OpenAI's embedding model.

    Args:
        text: The string to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the OpenAI embedding model to query.

    Returns:
        A 1-D ``numpy.ndarray`` with the embedding. If the request fails for any
        reason, the error is printed and an all-zero vector is returned instead
        so that a batch run can continue (best-effort behaviour).
    """
    # Default output sizes of the OpenAI embedding models. The zero-vector
    # fallback must have the same length as a real embedding of the requested
    # model, otherwise failed rows end up with a different width than the
    # successful ones when they are written to the same table.
    known_dimensions = {
        "text-embedding-3-large": 3072,
        "text-embedding-3-small": 1536,
        "text-embedding-ada-002": 1536,
    }
    text = text.replace("\n", " ")
    try:
        # NOTE(review): openai.Embedding.create is the interface of the
        # openai package before version 1.0 - confirm the installed version.
        response = openai.Embedding.create(
            input=[text],
            model=model
        )
        embedding = response['data'][0]['embedding']
        return np.array(embedding)
    except Exception as e:
        print(f"An error occurred: {e}")
        # Unknown models keep the historical fallback length of 1024.
        return np.zeros(known_dimensions.get(model, 1024))

def save_embeddings(queries, output_file='keyword_embeddings.csv'):
    """Fetch embeddings for queries not yet in ``output_file`` and append them.

    Args:
        queries: Iterable of query strings. Queries already present in the
            ``query`` column of ``output_file`` are skipped.
        output_file: Path of the CSV file holding the embeddings. It is created
            when it does not exist and rewritten when new rows are added.

    Returns:
        A ``pandas.DataFrame`` with the existing and the newly fetched rows.
        When nothing new was fetched, the existing frame is returned unchanged
        (an empty frame with only a ``query`` column if there is no file yet)
        and the file is not written.

    Raises:
        ValueError: If the file already stores embeddings of a different
            length than the newly fetched ones.
    """
    if os.path.exists(output_file):
        df_existing = pd.read_csv(output_file)
        existing_queries = set(df_existing['query'])
    else:
        # No placeholder frame: the embedding width is not known before the
        # first embedding has been fetched.
        df_existing = None
        existing_queries = set()

    new_embeddings = []
    for query in queries:
        if query in existing_queries:
            print(f"Query '{query}' already exists in the embeddings file. Skipping...")
            continue

        print(f"Fetching embedding for query: {query}")
        embedding = get_embedding(query)
        new_embeddings.append([query] + embedding.tolist())

    if not new_embeddings:
        if df_existing is None:
            return pd.DataFrame(columns=['query'])
        return df_existing

    # Size the columns from the data instead of assuming a fixed width; the
    # length of an embedding depends on the model that produced it.
    n_dims = len(new_embeddings[0]) - 1
    df_new = pd.DataFrame(new_embeddings, columns=['query'] + [f'embedding_{i}' for i in range(n_dims)])

    if df_existing is None or df_existing.empty:
        # Nothing to merge with; this also avoids concatenating an empty frame.
        df_combined = df_new
    else:
        existing_dims = len(df_existing.columns) - 1
        if existing_dims != n_dims:
            # Concatenating would silently pad the shorter rows with NaN.
            raise ValueError(
                f"Embedding size mismatch: {output_file} stores {existing_dims} values per query "
                f"but the new embeddings have {n_dims}."
            )
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)

    df_combined.to_csv(output_file, index=False)
    return df_combined

def main():
    """Embed the hard-coded keyword list and save it to the default output file."""
    queries = [
        "maanviljelijä",
        "käsityöt",
        "kalastus",
        "työläinen",
        "johtaja",
        "karjalaseura",
        "marttaseura",
        "palkinto",
        "menestys",
        "arvostus",
        "USA",
        "Ruotsi",
        "sotavanki",
        "menetys",
        "pettymys",
        "tuomio",
        "sotavankina",
    ]

    # Keywords already stored in the embeddings file are skipped by save_embeddings.
    save_embeddings(queries)


# Run the export when the cell/script is executed directly.
if __name__ == "__main__":
    main()

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

def load_embeddings(input_file='keyword_embeddings.csv'):
    """Read the keyword-embedding CSV into a ``{query: vector}`` dictionary.

    Every row contributes one entry: the key is the value of its ``query``
    column, the value is a float ``numpy`` array built from all remaining
    columns of that row.
    """
    table = pd.read_csv(input_file)
    vectors_by_query = {}
    for _, record in table.iterrows():
        # Everything except the label column belongs to the embedding.
        vectors_by_query[record['query']] = record.drop('query').values.astype(float)
    return vectors_by_query

def load_data_chunk(filepath, chunk_size=1000, start_idx=0):
    """Load a chunk of the dataset starting from the given index.

    Args:
        filepath: Path of the CSV file. It must contain an ``embedding`` column
            whose cells are Python list literals such as ``"[0.1, 0.2]"``.
        chunk_size: Maximum number of data rows to read.
        start_idx: Number of data rows to skip after the header row.

    Returns:
        A ``pandas.DataFrame`` with at most ``chunk_size`` rows in which every
        ``embedding`` cell has been converted to a ``numpy.ndarray``.

    Raises:
        ValueError, SyntaxError: If an ``embedding`` cell is not a plain Python
            literal.
    """
    import ast  # local import so the function does not depend on the cell's import block

    # skiprows starts at 1 so that row 0 (the header) is always kept.
    df_chunk = pd.read_csv(filepath, skiprows=range(1, start_idx + 1), nrows=chunk_size)
    # SECURITY: eval() was replaced by ast.literal_eval(). eval() executes any
    # expression found in the CSV cell, while literal_eval() accepts only
    # literals (lists, numbers, ...), which is all a stored embedding needs.
    df_chunk['embedding'] = df_chunk['embedding'].apply(ast.literal_eval).apply(np.array)
    return df_chunk

def calculate_similarity(embeddings, story_embedding, query):
    """Return the cosine similarity between a story and a query, scaled by 100.

    ``embeddings`` is indexed with ``query`` to obtain the query vector, so a
    query without an entry raises ``KeyError``.
    """
    keyword_vector = embeddings[query]
    # cosine_similarity compares row collections and returns a matrix; each
    # vector is wrapped as a single row and the only cell is read back.
    score_matrix = cosine_similarity([story_embedding], [keyword_vector])
    return score_matrix[0][0] * 100  # Convert to percentage

def initialize_results_file(output_file, queries):
    """Write a results CSV that contains only the header row.

    The header is ``index``, ``combined_text`` and one ``similarity_<query>``
    column per entry of ``queries``, in that order.
    """
    header = ['index', 'combined_text']
    header.extend(f'similarity_{query}' for query in queries)
    pd.DataFrame(columns=header).to_csv(output_file, index=False)

def calculate_similarities_chunk(df_chunk, embeddings, queries, output_file='search_results.csv'):
    """Calculate similarity scores for a chunk of stories with each query.

    Args:
        df_chunk: DataFrame with ``combined_text`` and ``embedding`` columns.
            If it has no ``index`` column, one is created from the frame's own
            index via ``reset_index()``.
        embeddings: Mapping of query string to query embedding.
        queries: Queries for which a ``similarity_<query>`` value is computed.
        output_file: CSV file holding the accumulated results. It is created
            with a header when missing and rewritten on every call.

    Returns:
        The accumulated results (previous rows plus the rows added by this
        call) as a ``pandas.DataFrame``.
    """
    # Load existing results if available
    if os.path.exists(output_file):
        existing_results = pd.read_csv(output_file)
        if 'index' not in existing_results.columns:
            existing_results = existing_results.reset_index()
    else:
        initialize_results_file(output_file, queries)
        existing_results = pd.read_csv(output_file)

    if 'index' not in df_chunk.columns:
        df_chunk = df_chunk.reset_index()

    # Build the lookup once; testing membership against the column's array
    # for every story would scan all stored results each time.
    already_calculated = set(existing_results['index'].tolist())

    new_results = []

    for i in df_chunk.index:
        original_index = df_chunk.loc[i, 'index']
        if original_index in already_calculated:
            print(f"Skipping story index: {original_index} (already calculated)")
            continue

        story_embedding = df_chunk.loc[i, 'embedding']
        similarities = {'index': original_index, 'combined_text': df_chunk.loc[i, 'combined_text']}

        for query in queries:
            similarities[f'similarity_{query}'] = calculate_similarity(embeddings, story_embedding, query)

        # Plain dicts are enough as records for the DataFrame constructor.
        new_results.append(similarities)

    if new_results:
        new_results_df = pd.DataFrame(new_results)
        if existing_results.empty:
            # Concatenating with an empty frame is deprecated in pandas and
            # emits a FutureWarning, so adopt the new rows directly while
            # keeping the column order of the existing header.
            all_columns = list(existing_results.columns) + [
                column for column in new_results_df.columns if column not in existing_results.columns
            ]
            existing_results = new_results_df.reindex(columns=all_columns)
        else:
            existing_results = pd.concat([existing_results, new_results_df], ignore_index=True)

    # Save the updated results
    existing_results.to_csv(output_file, index=False)
    return existing_results

def partial_similarity_calculation(filepath, embeddings, queries, start_idx=0, end_idx=None, chunk_size=1000):
    """Calculate similarities for stories in a specified range using chunk processing.

    Args:
        filepath: CSV file with the story embeddings.
        embeddings: Mapping of query string to query embedding.
        queries: Queries to score every story against.
        start_idx: First data row (0-based, header excluded) to process.
        end_idx: Row at which processing stops (exclusive). ``None`` or a value
            beyond the end of the file means "until the end of the file".
        chunk_size: Maximum number of rows loaded per chunk.

    Returns:
        None. Results are written by ``calculate_similarities_chunk`` to its
        default output file.
    """
    # Count lines in binary mode inside a context manager: the handle is
    # closed again and no text decoding (which depends on the platform's
    # default encoding) is involved.
    # NOTE(review): this counts physical lines; if quoted fields can contain
    # line breaks the value is only an upper bound for the number of stories.
    with open(filepath, 'rb') as handle:
        total_stories = sum(1 for _ in handle) - 1  # Subtract 1 for header
    if end_idx is None or end_idx > total_stories:
        end_idx = total_stories

    for chunk_start in range(start_idx, end_idx, chunk_size):
        chunk_end = min(chunk_start + chunk_size, end_idx)
        print(f"Processing stories from index {chunk_start} to {chunk_end}")
        # Request only the rows up to chunk_end; always asking for chunk_size
        # rows would make the last chunk run past end_idx.
        df_chunk = load_data_chunk(filepath, chunk_end - chunk_start, chunk_start)
        calculate_similarities_chunk(df_chunk, embeddings, queries)

# Driver: score the stories in 'All_Stories_embeddings.csv' against every keyword.
if __name__ == "__main__":
    # Every keyword listed here is looked up in the embeddings loaded below.
    queries = ["maanviljelijä", "käsityöt", "kalastus", "työläinen", "johtaja", "karjalaseura", "marttaseura", "palkinto", "menestys", "arvostus", "USA", "Ruotsi","sotavanki","menetys","pettymys","tuomio"]
    
    # Load previously saved embeddings
    embeddings = load_embeddings('keyword_embeddings.csv')
    
    # Specify the range of stories to process
    start_idx = 0  # Starting index
    end_idx = 89000  # Ending index, adjust as needed (capped at the file's line count by the callee)
    
    # Calculate similarities for the specified range using chunk processing
    partial_similarity_calculation('All_Stories_embeddings.csv', embeddings, queries, start_idx, end_idx, chunk_size=1000)



Processing stories from index 0 to 1000


  existing_results = pd.concat([existing_results, new_results_df], ignore_index=True)


Processing stories from index 1000 to 2000
Processing stories from index 2000 to 3000
Processing stories from index 3000 to 4000
Processing stories from index 4000 to 5000
Processing stories from index 5000 to 6000
Processing stories from index 6000 to 7000
Processing stories from index 7000 to 8000
Processing stories from index 8000 to 9000
Processing stories from index 9000 to 10000
Processing stories from index 10000 to 11000
Processing stories from index 11000 to 12000
Processing stories from index 12000 to 13000
Processing stories from index 13000 to 14000
Processing stories from index 14000 to 15000
Processing stories from index 15000 to 16000
Processing stories from index 16000 to 17000
Processing stories from index 17000 to 18000
Processing stories from index 18000 to 19000
Processing stories from index 19000 to 20000
Processing stories from index 20000 to 21000
Processing stories from index 21000 to 22000
Processing stories from index 22000 to 23000
Processing stories from ind

In [4]:
# Add similarity columns for new keywords to the existing results file
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

def load_embeddings(input_file='keyword_embeddings.csv'):
    """Read the keyword-embedding CSV into a ``{query: vector}`` dictionary.

    Every row contributes one entry: the key is the value of its ``query``
    column, the value is a float ``numpy`` array built from all remaining
    columns of that row.
    """
    table = pd.read_csv(input_file)
    vectors_by_query = {}
    for _, record in table.iterrows():
        # Everything except the label column belongs to the embedding.
        vectors_by_query[record['query']] = record.drop('query').values.astype(float)
    return vectors_by_query

def load_data_chunk(filepath, chunk_size=1000, start_idx=0):
    """Load a chunk of the dataset starting from the given index.

    Args:
        filepath: Path of the CSV file. It must contain an ``embedding`` column
            whose cells are Python list literals such as ``"[0.1, 0.2]"``.
        chunk_size: Maximum number of data rows to read.
        start_idx: Number of data rows to skip after the header row.

    Returns:
        A ``pandas.DataFrame`` with at most ``chunk_size`` rows in which every
        ``embedding`` cell has been converted to a ``numpy.ndarray``.

    Raises:
        ValueError, SyntaxError: If an ``embedding`` cell is not a plain Python
            literal.
    """
    import ast  # local import so the function does not depend on the cell's import block

    # skiprows starts at 1 so that row 0 (the header) is always kept.
    df_chunk = pd.read_csv(filepath, skiprows=range(1, start_idx + 1), nrows=chunk_size)
    # SECURITY: eval() was replaced by ast.literal_eval(). eval() executes any
    # expression found in the CSV cell, while literal_eval() accepts only
    # literals (lists, numbers, ...), which is all a stored embedding needs.
    df_chunk['embedding'] = df_chunk['embedding'].apply(ast.literal_eval).apply(np.array)
    return df_chunk

def calculate_similarity(embeddings, story_embedding, query):
    """Return the cosine similarity between a story and a query, scaled by 100.

    ``embeddings`` is indexed with ``query`` to obtain the query vector, so a
    query without an entry raises ``KeyError``.
    """
    keyword_vector = embeddings[query]
    # cosine_similarity compares row collections and returns a matrix; each
    # vector is wrapped as a single row and the only cell is read back.
    score_matrix = cosine_similarity([story_embedding], [keyword_vector])
    return score_matrix[0][0] * 100  # Convert to percentage

def update_results_with_new_keywords(output_file, embeddings, new_keywords):
    """Update the results file with new keyword similarities.

    Args:
        output_file: Path of the existing results CSV. It is rewritten in place.
        embeddings: Mapping of keyword to keyword embedding.
        new_keywords: Keywords for which a ``similarity_<keyword>`` column is
            added. Keywords that already have such a column are skipped.

    Returns:
        The updated results as a ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If ``output_file`` does not exist.
        KeyError: If a new column has to be computed but the results file has
            no ``embedding`` column holding the story embeddings.
        ValueError, SyntaxError: If an ``embedding`` cell is not a plain Python
            literal.
    """
    import ast  # local import so the function does not depend on the cell's import block

    if not os.path.exists(output_file):
        raise FileNotFoundError(f"The results file {output_file} does not exist.")

    existing_results = pd.read_csv(output_file)

    if 'index' not in existing_results.columns:
        existing_results = existing_results.reset_index()

    # Parsed lazily and only once: turning the embedding strings into lists is
    # the expensive part and is identical for every keyword.
    story_embeddings = None

    for keyword in new_keywords:
        if f'similarity_{keyword}' in existing_results.columns:
            print(f"Keyword '{keyword}' already exists in the results file. Skipping...")
            continue

        if story_embeddings is None:
            if 'embedding' not in existing_results.columns:
                # Fail with an explanation before anything is computed or written.
                raise KeyError(
                    f"The results file {output_file} has no 'embedding' column; "
                    "story embeddings are required to score new keywords."
                )
            # SECURITY: ast.literal_eval() instead of eval(), so that cell
            # contents are parsed as literals and never executed.
            story_embeddings = existing_results['embedding'].apply(ast.literal_eval)

        print(f"Calculating similarities for new keyword: {keyword}")
        existing_results[f'similarity_{keyword}'] = story_embeddings.apply(
            lambda story_embedding: calculate_similarity(embeddings, story_embedding, keyword)
        )

    existing_results.to_csv(output_file, index=False)
    return existing_results

def main():
    """Add similarity columns for the configured new keywords to 'search_results.csv'.

    Raises ``ValueError`` when a keyword has no entry in 'keyword_embeddings.csv'.
    """
    new_queries = ["uusi_sana1", "uusi_sana2"]  # Add new keywords here

    # Keyword vectors saved earlier, keyed by keyword.
    embeddings = load_embeddings('keyword_embeddings.csv')

    # Refuse to continue when a keyword has not been embedded yet.
    missing_queries = []
    for query in new_queries:
        if query not in embeddings:
            missing_queries.append(query)
    if missing_queries:
        raise ValueError(f"The following new queries are missing in the embeddings file: {missing_queries}")

    # Extend the results file in place; the returned frame is not needed here.
    update_results_with_new_keywords('search_results.csv', embeddings, new_queries)
    print("Updated results with new keywords.")


# Run the update when the cell/script is executed directly.
if __name__ == "__main__":
    main()

ValueError: The following new queries are missing in the embeddings file: ['uusi_sana1', 'uusi_sana2']

In [1]:
import pandas as pd
import numpy as np

# Function to wrap text
def wrap_text(text, width):
    """Insert ``<br>`` after every ``width`` characters of ``text``.

    The text is cut at fixed positions, without regard to word boundaries.
    """
    pieces = []
    for start in range(0, len(text), width):
        pieces.append(text[start:start + width])
    return '<br>'.join(pieces)

# Function to prepare the results dataframe
def prepare_results_dataframe(results_df, job_hobbies_queries=None, achievements_social_activities_queries=None):
    """Collapse per-query similarity columns into two category scores.

    Args:
        results_df: DataFrame with ``index``, ``combined_text`` and
            ``similarity_<query>`` columns. The two category columns are added
            to this frame in place.
        job_hobbies_queries: Queries averaged into ``job_hobbies_similarity``.
            Defaults to ``["sotavanki"]``.
        achievements_social_activities_queries: Queries averaged into
            ``achievements_similarity``. Defaults to ``["tuomio"]``.

    Returns:
        A DataFrame with the columns ``index``, ``combined_text``,
        ``job_hobbies_similarity`` and ``achievements_similarity``. A category
        score is the mean of the listed similarity columns that exist in
        ``results_df``; it is 0 when none exist or when the mean is NaN.
    """
    # Define the queries for each category
    if job_hobbies_queries is None:
        job_hobbies_queries = ["sotavanki"]
    if achievements_social_activities_queries is None:
        achievements_social_activities_queries = ["tuomio"]

    def _category_score(queries):
        """Mean over the available similarity columns of one category."""
        # Ensure only specified similarities are used for calculations
        columns = [f'similarity_{q}' for q in queries if f'similarity_{q}' in results_df.columns]
        if not columns or results_df.empty:
            # Nothing to average: avoid taking the mean of an empty selection.
            return pd.Series(0.0, index=results_df.index)
        # One column-wise mean instead of a Python call per row. skipna=False
        # keeps a NaN in any column as a NaN mean, which is then replaced by 0.
        return results_df[columns].mean(axis=1, skipna=False).fillna(0)

    results_df['job_hobbies_similarity'] = _category_score(job_hobbies_queries)
    results_df['achievements_similarity'] = _category_score(achievements_social_activities_queries)

    return results_df[['index', 'combined_text', 'job_hobbies_similarity', 'achievements_similarity']]

# Load the results (the per-query similarity table read from 'search_results.csv')
results_df = pd.read_csv('search_results.csv')

# Prepare the results dataframe: reduce the per-query columns to the two category scores
prepared_results_df = prepare_results_dataframe(results_df)

# Save the prepared results to a CSV file (without the DataFrame's own index)
output_csv_path = 'prepared_search_results.csv'
prepared_results_df.to_csv(output_csv_path, index=False)

print(f"CSV file saved to {output_csv_path}")


CSV file saved to prepared_search_results.csv


In [3]:
# Scatter plot of the prepared similarity scores, coloured by K-means cluster
import pandas as pd
import plotly.express as px
import os
from sklearn.cluster import KMeans

def wrap_text(text, width):
    """Insert ``<br>`` after every ``width`` characters of ``text``.

    The text is cut at fixed positions, without regard to word boundaries.
    """
    pieces = []
    for start in range(0, len(text), width):
        pieces.append(text[start:start + width])
    return '<br>'.join(pieces)

def visualize_results(file_path, output_dir, n_clusters=100):
    """Cluster the prepared similarity scores and save an interactive scatter plot.

    Args:
        file_path: CSV file with the columns ``index``, ``combined_text``,
            ``job_hobbies_similarity`` and ``achievements_similarity``.
        output_dir: Directory for the HTML file; created when missing.
        n_clusters: Requested number of K-means clusters. It is reduced to the
            number of rows when the file has fewer rows than that.

    Returns:
        None. The plot is written to
        ``<output_dir>/semantic_search_results_sotavanki_tuomio.html`` and a
        script that copies a clicked point's text to the clipboard is appended
        to that file.

    Raises:
        ValueError: If the CSV file lacks one of the required columns.
    """
    # Load the results from the CSV file
    results_df = pd.read_csv(file_path)

    # Verify the columns in the dataframe
    required_columns = {'index', 'combined_text', 'job_hobbies_similarity', 'achievements_similarity'}
    if not required_columns.issubset(results_df.columns):
        raise ValueError("The CSV file does not contain the required columns.")

    # Ensure non-negative values for the 'size' parameter
    results_df['achievements_similarity_size'] = results_df['achievements_similarity'].clip(lower=0)

    # Perform K-means clustering. K-means cannot form more clusters than there
    # are samples, so the requested number is capped at the row count.
    effective_clusters = min(n_clusters, len(results_df))
    kmeans = KMeans(n_clusters=effective_clusters, random_state=0)
    results_df['cluster'] = kmeans.fit_predict(results_df[['job_hobbies_similarity', 'achievements_similarity']])

    def _hover_text(row):
        """Build the HTML snippet shown when hovering over one point."""
        story = row['combined_text']
        # An empty CSV cell is read back as NaN (a float), which cannot be
        # sliced by wrap_text; show an empty story instead of failing.
        story = wrap_text(story, 80) if isinstance(story, str) else ''
        return (f"<b> Index:</b> {row['index']}<br>"
                f"<b> Story:</b> {story}<br>"
                f"<b> Job/Hobbies Similarity:</b> {row['job_hobbies_similarity']:.2f}%<br>"
                f"<b> Social Activities Similarity:</b> {row['achievements_similarity']:.2f}%<br>"
                f"<b> Cluster:</b> {row['cluster']}")

    # Create hover text with index, wrapped story text, job hobbies similarity, and social activities similarity
    results_df['hover_text'] = results_df.apply(_hover_text, axis=1)

    # Use the similarity scores for plotting
    fig = px.scatter(
        results_df,
        x='achievements_similarity',
        y='job_hobbies_similarity',
        color='cluster',
        size='achievements_similarity_size',
        hover_data=['index', 'combined_text'],
        title='Semantic Search Results',
        labels={
            'achievements_similarity': 'Achievements/Social Activities Similarity (%)',
            'job_hobbies_similarity': 'Job/Hobbies Similarity (%)'
        },
        height=600
    )

    # Update the hovertemplate to use the custom hover text and display as a block element.
    # customdata replaces the hover_data passed above, so customdata[0] is the hover text.
    fig.update_traces(
        marker=dict(size=8), 
        hovertemplate="<div style='white-space:normal; width:300px;'>%{customdata[0]}<extra></extra></div>",
        customdata=results_df[['hover_text']].values
    )

    fig.update_layout(
        title='Semantic Search Results',
        xaxis_title='Achievements/Social Activities Similarity (%)',
        yaxis_title='Job/Hobbies Similarity (%)',
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell", bordercolor="black"),
        xaxis=dict(range=[0, 100]),  # Ensure x-axis range is consistent
        yaxis=dict(range=[0, 100])   # Ensure y-axis range is consistent
    )

    # Save the figure as an HTML file
    html_file = os.path.join(output_dir, 'semantic_search_results_sotavanki_tuomio.html')
    # exist_ok avoids the gap between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)
    fig.write_html(html_file)

    # Add custom JavaScript for copying to clipboard.
    # The encoding is given explicitly so that appending does not depend on
    # the platform's default encoding.
    with open(html_file, 'a', encoding='utf-8') as f:
        f.write("""
<script>
    document.addEventListener('DOMContentLoaded', function() {
        var plot = document.querySelector('.plotly-graph-div');
        plot.on('plotly_click', function(data) {
            var infotext = data.points.map(function(d) {
                return d.customdata[0].replace(/<[^>]+>/g, '');  // Remove HTML tags for clean clipboard content
            });
            copyToClipboard(infotext.join('\\n\\n'));
        });
    });

    function copyToClipboard(text) {
        var el = document.createElement('textarea');
        el.value = text;
        document.body.appendChild(el);
        el.select();
        document.execCommand('copy');
        document.body.removeChild(el);
        var notification = document.createElement('div');
        notification.innerHTML = 'Copied to clipboard';
        notification.style.position = 'fixed';
        notification.style.bottom = '10px';
        notification.style.left = '10px';
        notification.style.padding = '10px';
        notification.style.backgroundColor = '#5cb85c';
        notification.style.color = 'white';
        notification.style.borderRadius = '5px';
        document.body.appendChild(notification);
        setTimeout(function() {
            document.body.removeChild(notification);
        }, 2000);
    }
</script>
        """)

# File path to the CSV file (the table of prepared category scores)
file_path = 'prepared_search_results.csv'

# Directory to save the visualization
output_dir = 'semantic_search_visualizations'

# Visualize results with K-means clustering; the HTML file is written into output_dir
visualize_results(file_path, output_dir, n_clusters=100)


In [8]:
import pandas as pd
import plotly.express as px
import os

def wrap_text(text, width):
    """Insert ``<br>`` after every ``width`` characters of ``text``.

    The text is cut at fixed positions, without regard to word boundaries.
    """
    pieces = []
    for start in range(0, len(text), width):
        pieces.append(text[start:start + width])
    return '<br>'.join(pieces)

def filter_stories_by_length(df, min_length=600):
    """Keep only the rows whose ``combined_text`` has at least ``min_length`` characters.

    Returns a ``(kept_rows, number_of_removed_rows)`` tuple.
    """
    long_enough = df['combined_text'].str.len() >= min_length
    kept = df[long_enough]
    return kept, len(df) - len(kept)

def visualize_results(file_path, output_dir, min_length=600):
    """Plot the prepared similarity scores of sufficiently long stories.

    Args:
        file_path: CSV file with the columns ``index``, ``combined_text``,
            ``job_hobbies_similarity`` and ``achievements_similarity``.
        output_dir: Directory for the HTML file; created when missing.
        min_length: Stories whose ``combined_text`` has fewer characters than
            this are left out of the plot.

    Returns:
        None. The plot is written to
        ``<output_dir>/semantic_search_results_filtered.html``, a script that
        copies a clicked point's text to the clipboard is appended to that
        file, and the number of excluded stories is printed.

    Raises:
        ValueError: If the CSV file lacks one of the required columns.
    """
    # Load the results from the CSV file
    results_df = pd.read_csv(file_path)

    # Verify the columns in the dataframe
    required_columns = {'index', 'combined_text', 'job_hobbies_similarity', 'achievements_similarity'}
    if not required_columns.issubset(results_df.columns):
        raise ValueError("The CSV file does not contain the required columns.")

    # Filter out stories with fewer than min_length characters
    results_df, excluded_count = filter_stories_by_length(results_df, min_length=min_length)
    # Work on an independent copy: adding columns to the filtered selection
    # itself triggers pandas' SettingWithCopyWarning.
    results_df = results_df.copy()

    # Ensure non-negative values for the 'size' parameter
    results_df['achievements_similarity_size'] = results_df['achievements_similarity'].clip(lower=0)

    # Create hover text with index and wrapped story text
    results_df['hover_text'] = results_df.apply(
        lambda row: (f"<b>Index:</b> {row['index']}<br>"
                     f"<b>Story:</b> {wrap_text(row['combined_text'], 80)}<br>"
                     f"<b>Job/Hobbies Similarity:</b> {row['job_hobbies_similarity']:.2f}%<br>"
                     f"<b>Achievements Similarity:</b> {row['achievements_similarity']:.2f}%"), 
        axis=1
    )

    # Use the similarity scores for plotting
    fig = px.scatter(
        results_df,
        x='achievements_similarity',
        y='job_hobbies_similarity',
        size='achievements_similarity_size',
        hover_data=['index', 'combined_text'],
        title='Semantic Search Results',
        labels={
            'achievements_similarity': 'Achievements/Social Activities Similarity (%)',
            'job_hobbies_similarity': 'Job/Hobbies Similarity (%)'
        },
        height=600
    )

    # Update the hovertemplate to use the custom hover text and display as a block element.
    # customdata replaces the hover_data passed above, so customdata[0] is the hover text.
    fig.update_traces(
        marker=dict(size=8), 
        hovertemplate="<div style='white-space:normal; width:300px;'>%{customdata[0]}<extra></extra></div>",
        customdata=results_df[['hover_text']].values
    )

    fig.update_layout(
        title='Semantic Search Results',
        xaxis_title='Achievements/Social Activities Similarity (%)',
        yaxis_title='Job/Hobbies Similarity (%)',
        hoverlabel=dict(bgcolor="white", font_size=16, font_family="Rockwell", bordercolor="black"),
        xaxis=dict(range=[0, 100]),  # Ensure x-axis range is consistent
        yaxis=dict(range=[0, 100])   # Ensure y-axis range is consistent
    )

    # Save the figure as an HTML file
    html_file = os.path.join(output_dir, 'semantic_search_results_filtered.html')
    # exist_ok avoids the gap between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)
    fig.write_html(html_file)

    # Add custom JavaScript for copying to clipboard.
    # The encoding is given explicitly so that appending does not depend on
    # the platform's default encoding.
    with open(html_file, 'a', encoding='utf-8') as f:
        f.write("""
<script>
    document.addEventListener('DOMContentLoaded', function() {
        var plot = document.querySelector('.plotly-graph-div');
        plot.on('plotly_click', function(data) {
            var infotext = data.points.map(function(d) {
                return d.customdata[0].replace(/<[^>]+>/g, '');  // Remove HTML tags for clean clipboard content
            });
            copyToClipboard(infotext.join('\\n\\n'));
        });
    });

    function copyToClipboard(text) {
        var el = document.createElement('textarea');
        el.value = text;
        document.body.appendChild(el);
        el.select();
        document.execCommand('copy');
        document.body.removeChild(el);
        var notification = document.createElement('div');
        notification.innerHTML = 'Copied to clipboard';
        notification.style.position = 'fixed';
        notification.style.bottom = '10px';
        notification.style.left = '10px';
        notification.style.padding = '10px';
        notification.style.backgroundColor = '#5cb85c';
        notification.style.color = 'white';
        notification.style.borderRadius = '5px';
        document.body.appendChild(notification);
        setTimeout(function() {
            document.body.removeChild(notification);
        }, 2000);
    }
</script>
        """)

    print(f"Number of stories excluded: {excluded_count}")

# File path to the CSV file (the table of prepared category scores)
file_path = 'prepared_search_results.csv'

# Directory to save the visualization
output_dir = 'semantic_search_visualizations'

# Visualize results based on story length; the number of excluded stories is printed
visualize_results(file_path, output_dir)


Number of stories excluded: 41599
