# Installing Required Libraries


In [None]:
!pip install -qU gdown glob pandas numpy sentence-transformers scikit-learn

In [None]:
import pandas as pd
import numpy as np
import os
import json
import gdown
import glob
from google.colab import drive
from google import genai
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Downloading Dataset 1


In [None]:
# Runtime configuration, set in one place. The two API keys are empty
# placeholders in this file — presumably they must be filled in before the
# cells that call the APIs are run.
os.environ.update(
    {
        "GOOGLE_API_KEY": "",
        "LANGSMITH_TRACING": "true",
        "LANGSMITH_API_KEY": "",
    }
)

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

In [None]:
# Shared google-genai client; get_response() uses it for file uploads and
# generate_content calls. No credentials are passed explicitly here —
# presumably the client reads GOOGLE_API_KEY from the environment (TODO:
# confirm against the google-genai documentation).
client = genai.Client()

In [None]:
# Shared Google Drive folder holding the source documents.
file_directory = "https://drive.google.com/drive/folders/1ZlsE-DXR0noMDnyCt83oOhyarSuCnZvC?usp=share_link"
gdown.download_folder(file_directory, quiet=False, use_cookies=False)

# List every entry of the downloaded folder. The hard-coded path presumably
# matches where gdown places the folder in the Colab runtime — verify if the
# working directory or the folder name changes.
doc_paths = glob.glob(os.path.join("/content/Web Scraped Solder Bridging/", "*"))

print(doc_paths)

# Process and Failure Mode Extraction


In [None]:
def _parse_process_defect_pairs(raw_text):
    """Extract the list of [process, sub_defect] pairs from a model reply.

    The prompt asks for a "JSON array of 2-tuples" written with parentheses,
    which is Python literal syntax rather than JSON, and models may also answer
    with real nested JSON arrays or wrap the answer in a Markdown code fence.
    This helper accepts all of those shapes.

    :param raw_text: the raw text of the model response (may be None or empty).
    :return: a list of two-element lists ``[process, sub_defect]``.
    :raises ValueError: if no array can be located or parsed, or if an entry
        is not a 2-element sequence.
    """
    import ast  # local import: only needed for the tuple-syntax fallback

    if not raw_text:
        raise ValueError("Model response contained no text to parse.")

    # Take everything from the first '[' to the LAST ']' so that nested arrays
    # and surrounding prose / code fences do not truncate the payload.
    start = raw_text.find("[")
    end = raw_text.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No array found in model response: {raw_text!r}")
    payload = raw_text[start:end + 1]

    try:
        # Valid JSON (nested lists) is the preferred shape.
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        try:
            # Tuple syntax such as [("A", "B"), ...] is a valid Python literal.
            # Parsing it properly keeps parentheses inside defect names intact.
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                f"Could not parse model response as a list of pairs: {payload!r}"
            ) from err

    pairs = []
    for entry in parsed:
        if not isinstance(entry, (list, tuple)) or len(entry) != 2:
            raise ValueError(f"Expected a (process, sub_defect) pair, got: {entry!r}")
        # Normalise tuples to lists so the return shape matches JSON decoding.
        pairs.append(list(entry))
    return pairs


def get_response(filename, model_name="gemini-1.5-pro"):
    """Upload a document to Gemini and extract its (process, sub-defect) pairs.

    The file is uploaded through the module-level ``client``, the model is
    asked to classify the document into one of four SMT processes and to list
    the sub-defects it discusses, and the reply is parsed into Python lists.
    The raw reply and the parsed result are both printed for inspection.

    :param filename: path of the document to upload.
    :param model_name: Gemini model identifier passed to ``generate_content``.
    :return: a list of two-element lists ``[process, sub_defect]``.
    :raises ValueError: if the reply cannot be parsed into pairs.
    """
    sample_file = client.files.upload(
        file=filename,
    )

    # This prompt instructs the model to:
    # 1) Identify which of the 4 SMT processes the PDF is primarily about.
    # 2) List every sub-defect discussed for that process.
    # 3) Return only an array of (process, sub-defect) pairs, each sub-defect
    #    appearing once.
    modified_prompt = """
Task:
Perform an in-depth analysis of the attached PDF and return only a JSON array in the exact syntax shown below.

Steps to follow
	1.	Determine which of the four SMT processes the document primarily addresses:
    • Solder Paste Printing
    • Component Placement
    • Reflow Soldering
    • Rework
	2.	Identify every sub-defect that the document discusses within the context of that process.

Output format
Return a single JSON array of 2-tuples:
json:
[
  ("<Identified_Process>", "<Sub_Defect>"),
  ("<Identified_Process>", "<Sub_Defect>"),
  ...
]

Rules for the array
	•	Each <Identified_Process> must be one of the four processes listed above—spelled exactly as shown.
	•	A given <Sub_Defect> may appear once and only once in the entire array. The same defect must not be assigned to multiple processes.
	•	Preserve the tuple order as (Process, Sub-Defect).

Important:
• Return only the JSON array—no explanations, no extra text.
• Ensure strict compliance with the uniqueness rule before replying.
  """

    # Send the prompt to the model, along with the uploaded file
    response = client.models.generate_content(
        model=model_name, contents=[sample_file, modified_prompt]
    )

    print(response.text, "\n\n\n\n\n\n")

    resp = _parse_process_defect_pairs(response.text)

    print(resp)

    return resp

In [None]:
# One entry per document: the list of (process, failure mode) pairs that
# get_response() extracted from it.
full_text = []

for doc_path in doc_paths:
    full_text.append(get_response(doc_path))

In [None]:
# Flatten the per-document lists into a single list holding the pairs from
# all documents, preserving document order and in-document order.
full_text_single_list = [pair for doc_list in full_text for pair in doc_list]

print(f"Number of (Process -> Failure_Mode) pairs: {len(full_text_single_list)}")
print(f"Pairs: {full_text_single_list}")

# Removing Duplicates

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model once at module level;
# unify_pairs_cosine() reads this global `model` to encode each pair.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def unify_pairs_cosine(pairs, threshold=0.85):
    """
    Drop near-duplicate (process, mode) pairs using embedding similarity.

    Each pair is embedded as the single string "<process> <mode>" with the
    module-level ``model``. Pairs are visited in input order, and a pair is
    kept only when its cosine similarity to every previously kept pair is
    below ``threshold`` — so the first occurrence of a group of similar pairs
    is the one that survives.

    :param pairs: iterable of [process, mode] or (process, mode)
    :param threshold: float in [0..1]. A higher threshold means a stricter match.

    :return: A list of unique (process, mode) tuples, in first-seen order.
    """
    kept_pairs = []
    kept_vectors = []

    for process, mode in pairs:
        # Process and mode are embedded together as one sentence.
        candidate = model.encode(f"{process} {mode}")

        # any() stops at the first kept embedding that is similar enough.
        already_seen = any(
            cosine_similarity([candidate], [known])[0][0] >= threshold
            for known in kept_vectors
        )
        if already_seen:
            continue

        # Nothing kept so far is close enough: record this pair as new.
        kept_pairs.append((process, mode))
        kept_vectors.append(candidate)

    return kept_pairs


# Illustrative input only: `pairs` shows the expected input shape and the kind
# of near-duplicates the function is meant to merge. It is NOT passed to
# unify_pairs_cosine in this cell.
pairs = [
    ["Reflow Soldering", "Solder Balls"],
    [
        "reflow soldering",
        "Solder Balling/Spattering",
    ],  # very similar => might be merged
    ["Reflow Soldering", "Bridging"],
    ["Reflow Soldering", "Bridge"],
]

# De-duplicate the flattened pairs collected from all documents. The explicit
# 0.80 threshold used here is looser than the function's 0.85 default.
unique_list = unify_pairs_cosine(full_text_single_list, threshold=0.80)

print(f"Number of Process->Failure_mode pairs in unique_list: {len(unique_list)}")
print(f"Pairs: {unique_list}")
# The result is a list of (process, failure_mode) tuples, e.g.:
# [('Reflow Soldering', 'Tomb-stoning'),
#  ('Reflow Soldering', 'Bridging')]

# Creating the DataFrame

In [None]:
# Full FMEA table layout. Only the first two columns are known at this point.
fmea_columns = [
    'SMT Process',
    'Failure Mode',
    'Potential Effects',
    'Potential Causes',
    'Recommended Actions',
    'Effects_Attr',
    'Causes_Attr',
    'Recommended_Actions_Attr',
]

# Build the frame from the (process, failure mode) pairs, then reindex to the
# full layout so the remaining columns exist and hold empty strings.
fmea_df = (
    pd.DataFrame(unique_list, columns=fmea_columns[:2])
    .reindex(columns=fmea_columns, fill_value='')
)

In [None]:
# Preview the first rows of the FMEA table as the notebook cell output.
fmea_df.head()

In [None]:
# Write the FMEA table to CSV without the index column. to_csv is a DataFrame
# method; pandas has no module-level pd.to_csv function.
fmea_df.to_csv("FMEA_with_Processes_and_FMs.csv", index=False)