In [None]:
!pip install streamlit langchain sentence-transformers faiss-cpu transformers
!pip install --upgrade langchain langchain-community openai faiss-cpu tiktoken huggingface_hub

In [13]:
import pandas as pd

# Location of the FIFA players dataset (presumably a Colab session path).
# To analyse a different file, e.g. one uploaded through
# `google.colab.files.upload()`, point DATA_PATH at it.
DATA_PATH = "/content/fifa_eda.csv"

# `df` is the raw players table consumed by the chatbot cells below.
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Foot,International Reputation,Skill Moves,Position,Joined,Contract Valid Until,Height,Weight,Release Clause
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,110500.0,565.0,Left,5.0,4.0,RF,2004,2021-01-01,5.583333,159.0,226500.0
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,77000.0,405.0,Right,5.0,5.0,ST,2018,2022-01-01,6.166667,183.0,127100.0
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,118500.0,290.0,Right,5.0,5.0,LW,2017,2022-01-01,5.75,150.0,228100.0
3,193080,De Gea,27,Spain,91,93,Manchester United,72000.0,260.0,Right,4.0,1.0,GK,2011,2020-01-01,6.333333,168.0,138600.0
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,102000.0,355.0,Right,4.0,4.0,RCM,2015,2023-01-01,5.916667,154.0,196400.0


In [14]:
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

def setup_rag_pipeline(df: pd.DataFrame) -> "RetrievalQA | None":
    """
    Build a LangChain RetrievalQA chain over the player rows of a DataFrame.

    The text columns 'Name', 'Club', 'Nationality', 'Position' and
    'Preferred Foot' are joined (space separated) into one text field per row.
    Each row becomes a document, the documents are embedded with the
    "all-MiniLM-L6-v2" sentence-transformer model and indexed in FAISS, and a
    `google/flan-t5-base` text2text pipeline is used as the answering LLM.

    The input DataFrame is never modified: the combined text column is added
    to an internal copy.

    NOTE(review): only the five columns above end up in the embedded text, so
    questions about other columns (e.g. 'Wage', 'Value') presumably cannot be
    answered from the retrieved context — confirm against DataFrameLoader's
    handling of the remaining columns.

    Args:
        df (pd.DataFrame): Player data. Must contain the columns 'Name',
            'Club', 'Nationality', 'Position' and 'Preferred Foot'.

    Returns:
        RetrievalQA | None: A chain that can be queried with natural language,
        or None (after printing an error) when a required column is missing
        or the DataFrame has no rows.

    Steps:
    - Validate that the required columns exist and that there is data.
    - Combine the text columns into a single column on a copy of the data.
    - Load the rows as documents with a DataFrameLoader.
    - Embed the documents and store them in a FAISS vector store.
    - Wrap a `google/flan-t5-base` pipeline as a LangChain LLM.
    - Return a RetrievalQA chain that ties retriever and LLM together.
    """
    # Columns whose values make up the searchable text of each document
    text_columns_to_combine = ['Name', 'Club', 'Nationality', 'Position', 'Preferred Foot']
    combined_column_name = 'combined_text'

    # Validate BEFORE combining: selecting a missing column would raise
    # KeyError, so a check performed after the assignment could never fail.
    missing_columns = [col for col in text_columns_to_combine if col not in df.columns]
    if missing_columns:
        print(
            f"Error: Failed to create combined text column '{combined_column_name}'. "
            f"Missing columns: {missing_columns}"
        )
        return None

    # An empty frame yields no documents, and a vector store cannot be built from nothing
    if df.empty:
        print(
            f"Error: Failed to create combined text column '{combined_column_name}'. "
            "The DataFrame has no rows."
        )
        return None

    # Work on a copy so the caller's DataFrame does not silently gain a column
    docs_df = df.copy()

    # Join the text columns row by row; missing values are skipped so the
    # literal string "nan" is not embedded as if it were a club or position.
    docs_df[combined_column_name] = docs_df[text_columns_to_combine].apply(
        lambda row: ' '.join(str(value) for value in row if pd.notna(value)),
        axis=1,
    )

    # Load the documents using LangChain's DataFrameLoader
    loader = DataFrameLoader(docs_df, page_content_column=combined_column_name)
    documents = loader.load()

    # Create sentence embeddings using HuggingFace's "all-MiniLM-L6-v2" model
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # Build a FAISS vector store from the embedded documents
    vectorstore = FAISS.from_documents(documents, embeddings)
    retriever = vectorstore.as_retriever()

    # Set up a text generation pipeline with the FLAN-T5 model
    text_gen = pipeline("text2text-generation", model="google/flan-t5-base", max_length=100)

    # Wrap the pipeline in a LangChain-compatible LLM object
    llm = HuggingFacePipeline(pipeline=text_gen)

    # Create the final RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

    return qa_chain


In [16]:
def chatBot(df):
    """
    Run an interactive question-answering loop over the given player data.

    Builds the RAG chain with `setup_rag_pipeline(df)` and then repeatedly
    reads a question from standard input, prints the chain's answer, and
    stops when the user types 'exit' (case-insensitive), sends EOF, or
    presses Ctrl-C.

    Args:
        df: DataFrame forwarded unchanged to `setup_rag_pipeline`.

    Returns:
        None. If the pipeline could not be built (`setup_rag_pipeline`
        returned None) an error is printed and the loop is never started.
    """
    rag_chain = setup_rag_pipeline(df)

    # setup_rag_pipeline signals failure with None; calling a method on it
    # would otherwise crash with AttributeError on the first question.
    if rag_chain is None:
        print("Error: the RAG pipeline could not be created; chatbot not started.")
        return

    print("Chatbot is ready! Type 'exit' to stop.")
    while True:
        try:
            query = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell ends the chat quietly
            print()
            break

        # Ignore blank lines instead of sending an empty question to the model
        if not query:
            continue
        if query.lower() == 'exit':
            break

        # `invoke` replaces the deprecated `Chain.run`; RetrievalQA takes the
        # question under "query" and returns the answer under "result".
        response = rag_chain.invoke({"query": query})["result"]
        print("Bot:", response)

# Start the interactive chat session over the loaded players DataFrame;
# the cell keeps prompting for input until the user types 'exit'.
chatBot(df)

Device set to use cuda:0


Chatbot is ready! Type 'exit' to stop.

You: who is the best player
Bot: Luisinho Al Faisaly Brazil LM Right David Raya Blackburn Rovers Spain GK Right Rafael Sampdoria Brazil GK Right Wiliam Santinho Fluminense Brazil GK Right

You: give me only one best player
Bot: David Luiz

You: why is david luiz best player
Bot: he is a good player

You: for how much did the player acuire for
Bot: a year

You: what was the money Ronaldo received
Bot: a stipend

You: how much money did the player received
Bot: a million dollars

You: who was the player who recevied million dolor
Bot: Marcos Llorente

You: exit
