In [None]:
from crewai import LLM
import os
from dotenv import load_dotenv
load_dotenv()


# load_dotenv() above has already merged the .env entries into os.environ, so
# writing os.getenv(name) back into os.environ is a no-op when the key exists
# and raises an opaque "TypeError: str expected, not NoneType" when it does
# not. Validate the keys explicitly and report every missing one by name.
REQUIRED_API_KEYS = (
    "COHERE_API_KEY",
    "CEREBRAS_API_KEY",
    "GROQ_API_KEY",
    "GOOGLE_AI_STUDIO_API_KEY",
    "HF_TOKEN",
    "ANTHROPIC_API_KEY",
)

missing_api_keys = [name for name in REQUIRED_API_KEYS if os.getenv(name) is None]
if missing_api_keys:
    raise EnvironmentError(
        "Missing required environment variables (set them in .env or the shell): "
        + ", ".join(missing_api_keys)
    )

# LLM handles, one per provider. Each API key is read from the environment at
# construction time; the model string's prefix names the provider route.

# Google AI Studio Gemini, temperature 0.
Gemini = LLM(
    model="gemini/gemini-2.0-flash-exp",
    temperature=0,
    max_tokens=1024,
    api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
)

# Mixtral 8x7B served through Groq.
Mixtral = LLM(
    model="groq/mixtral-8x7b-32768",
    api_key=os.getenv("GROQ_API_KEY"),
    temperature=0.7,
    max_tokens=1024,
)

# QwQ 32B preview, addressed through an explicit Hugging Face inference URL.
QwQ = LLM(
    model="huggingface/Qwen/QwQ-32B-Preview",
    base_url="https://api-inference.huggingface.co/models/Qwen/QwQ-32B-Preview",
    api_key=os.getenv("HF_TOKEN"),
    temperature=0.7,
    max_tokens=1024,
)

# Llama 3.3 70B on Cerebras; the only handle with an 8192-token output budget.
Llama = LLM(
    model="cerebras/llama3.3-70b",
    api_key=os.getenv("CEREBRAS_API_KEY"),
    temperature=0.7,
    max_tokens=8192,
)

In [2]:
from crewai_tools import (
    SerplyWebSearchTool,CodeDocsSearchTool,CSVSearchTool,DirectorySearchTool,MDXSearchTool
)
from dotenv import load_dotenv
load_dotenv()
# SERPLY_API_KEY is presumably what SerplyWebSearchTool authenticates with
# (TODO confirm against the tool's docs). load_dotenv() has already placed it
# in os.environ when it is defined, so copying os.getenv(...) back is
# redundant and fails with "TypeError: str expected, not NoneType" when the
# key is absent. Check for it explicitly instead.
if os.getenv("SERPLY_API_KEY") is None:
    raise EnvironmentError(
        "SERPLY_API_KEY is not set; add it to .env or the shell environment."
    )

# Tool instances shared by the agents defined later in the notebook.
search_tool = SerplyWebSearchTool()
code_search_tool = CodeDocsSearchTool()
directory_search_tool = DirectorySearchTool("generative_ai_project/data")
Markdown_search_tool = MDXSearchTool()

Inserting batches in chromadb:   0%|          | 0/1 [00:03<?, ?it/s]


In [43]:
from crewai import (
    Agent,Crew,Task
    )

# Agent that researches the competition itself. Its goal carries the
# {competition_url} placeholder, which is filled from the kickoff inputs.
# Runs on Gemini with the web-search and directory-search tools.
Competition_overview_agent = Agent(
    llm=Gemini,
    tools=[search_tool, directory_search_tool],
    verbose=True,
    allow_delegation=True,
    role="Kaggle Competition Overview Agent",
    goal="Analyze the competition's rules, overview, description, evaluation metrics, prizes, timeline, and also take note of the competition's submission.csv file schema , visit {competition_url}",
    backstory="""You’re a seasoned data scientist and Kaggle Grandmaster, celebrated for your unmatched expertise in analyzing and navigating competition landscapes. Over the years, you’ve fine-tuned your ability to extract critical insights from complex competition rules and datasets, ensuring no detail escapes your scrutiny. Your precision in interpreting submission schemas and evaluation metrics has helped teams achieve top leaderboard positions. With a blend of analytical prowess and a passion for empowering data teams, you’re the architect of project success, laying a strong foundation for innovation and strategy.""",
)
# Agent that cleans the dataset. Its goal carries the {dataset} placeholder,
# which is filled from the kickoff inputs. Runs on the Llama handle with code
# execution enabled in "safe" mode.
Data_Preprocessing_agent = Agent(
    role="Data Preprocessing Agent",
    goal="You navigate through all the records of the dataset {dataset} to look for null values, inconsistent data formats, check for outliers, drop duplicates, apply necessary imputation, and ensure effective conversion of nominal or ordinal columns into numerical format using different encoding techniques.",
    backstory="""You’re a masterful data wrangler with a reputation for turning chaotic, raw datasets into analytical masterpieces. Drawing from a vast repertoire of experiences across industries, you excel at detecting and resolving data anomalies, ensuring datasets meet the highest standards of quality and consistency. Your ability to harmonize data—be it through handling null values, encoding categorical variables, or addressing outliers—forms the backbone of reliable machine learning models. Known for your diligence and innovative solutions, you approach every dataset with the mindset of crafting a flawless input for analytical success.""",
    llm=Llama,
    verbose=True,
    allow_delegation=True,
    allow_code_execution=True,
    code_execution_mode="safe",
    max_iter=20,
    # The Agent field for retry-on-error is `max_retry_limit`; the previous
    # `max_retries` keyword is not an Agent attribute, so the value 5 never
    # took effect.
    max_retry_limit=5,
    max_execution_time=600,
    memory=True,
    cache=True,
    respect_context_window=True,
    tools=[Markdown_search_tool, directory_search_tool],
)

# Agent that trains and selects the model. Runs on the QwQ handle with code
# execution enabled and no extra tools.
modeling_agent = Agent(
    role="Machine Learning Engineer",
    goal="Evaluate the dataset, identify the nature of the target variable, address class imbalance (using techniques like SMOTE), perform stratified cross-validation, and train the best model based on performance.",
    backstory="""As a seasoned machine learning engineer with over a decade of hands-on experience, you have a reputation for solving complex predictive challenges across domains. From fine-tuning cutting-edge models to addressing class imbalances with techniques like SMOTE, your work has consistently delivered state-of-the-art results. You excel at building pipelines that integrate seamlessly with production systems, employing rigorous cross-validation to ensure robust and reproducible performance. Fueled by an insatiable curiosity and a drive for continuous learning, you’ve become a trusted expert in transforming data into actionable insights through machine learning excellence.""",
    llm=QwQ,
    verbose=True,
    max_iter=20,
    # The Agent field for retry-on-error is `max_retry_limit`; the previous
    # `max_retries` keyword is not an Agent attribute, so the value 5 never
    # took effect.
    max_retry_limit=5,
    max_execution_time=600,
    memory=True,
    allow_delegation=True,
    allow_code_execution=True,
    cache=True,
)
# Agent that writes the final submission file. Runs on the Mixtral handle
# with code execution enabled and no extra tools.
submission_agent = Agent(
    role="Submission Specialist",
    goal="Generate the submission.csv file based on the trained model's predictions, ensuring it adheres to the format specified in the competition guidelines.",
    backstory="""A meticulous data handler with expertise in creating accurate and well-formatted submission files for Kaggle data science competitions. With years of experience, you ensure every submission follows the kaggle competition’s strict guidelines, eliminating any errors and guaranteeing it meets the quality required to win. You combine precision and consistency to prepare submission files that showcase the results of complex models with the highest standards of accuracy.""",
    llm=Mixtral,
    verbose=True,
    # `max_itr` was a typo for `max_iter`, so the 20-iteration cap used by the
    # other agents was never applied to this one.
    max_iter=20,
    # The Agent field for retry-on-error is `max_retry_limit`, not
    # `max_retries`; the previous keyword was ignored.
    max_retry_limit=5,
    allow_delegation=True,
    allow_code_execution=True,
    cache=True,
)

In [44]:
from crewai import Task

# Competition research task, handled by the overview agent. The result is
# written to Competition_overview.md.
competition_overview_task = Task(
    agent=Competition_overview_agent,
    description=(
        "Analyze the Kaggle competition's overview, description, evaluation metric, "
        "submission format, and guidelines."
    ),
    expected_output=(
        "Detailed analysis of competition rules, overview, description, evaluation, "
        "and submission schema in a markdown format"
    ),
    output_file="Competition_overview.md",
)

# Data cleaning task, handled by the preprocessing agent. No output_file is
# set, so the result is only available as the task's output.
data_preprocessing_task = Task(
    agent=Data_Preprocessing_agent,
    # What the agent is asked to do: imputation, outlier treatment,
    # de-duplication, format normalisation and categorical encoding.
    description=(
        "Perform thorough data preprocessing on the dataset to ensure it is clean, consistent, and ready for modeling. "
        "This includes detecting and handling null or missing values using appropriate imputation techniques, such as mean, "
        "median, or mode imputation, or advanced methods like KNN imputation. The task also involves identifying and treating "
        "outliers using methods like the IQR method or Z-score analysis, ensuring that the data distribution remains valid and "
        "non-skewed. Duplicate records will be detected and removed to ensure data integrity. Inconsistent data formats will "
        "be addressed, ensuring that numerical columns are properly formatted and that categorical variables are correctly "
        "encoded using techniques such as one-hot encoding, label encoding, or ordinal encoding based on the nature of the data. "
        "This task ensures that the dataset is fully preprocessed, cleaned, and transformed, enabling it to be used in model training "
        "and analysis with no further data issues."
    ),
    # What a finished answer should look like.
    expected_output=(
        "A cleaned and preprocessed dataset with missing values imputed, outliers removed or adjusted, duplicates eliminated, "
        "and categorical variables properly encoded. The dataset will be ready for further analysis or model training, with all "
        "data integrity issues resolved."
    ),
)


# Model training and selection task, handled by the modeling agent.
modeling_task = Task(
    description=(
        "Train a variety of machine learning models on the preprocessed dataset. The task involves selecting "
        "appropriate algorithms for the problem, including but not limited to decision trees, random forests, "
        "support vector machines, and gradient boosting models. The task also addresses class imbalance issues "
        "by applying Synthetic Minority Over-sampling Technique (SMOTE) to generate synthetic samples for the "
        "underrepresented class. Furthermore, stratified cross-validation will be employed to ensure that the "
        "evaluation of model performance is robust, with equal representation of classes in each fold. The goal is "
        "to determine the best-performing model based on key performance metrics such as accuracy, precision, recall, "
        "F1-score, and area under the ROC curve (AUC). The best model will be selected and saved as a serialized "
        "model file for future use in prediction tasks."
    ),
    expected_output=(
        "A trained machine learning model, evaluated using multiple metrics including accuracy, precision, recall, "
        "F1-score, and AUC. The final model will be the one with the highest performance metrics, and it will be "
        "saved as 'MLmodel.pkl'. The output will also include a summary of performance metrics for each model tested."
    ),
    # output_file stores the task's textual answer. Pointing it at
    # 'MLmodel.pkl' would put text in a .pkl path and could overwrite the
    # pickled model the agent is asked to save under that name, so the
    # written summary goes to its own report file instead.
    output_file="modeling_report.md",
    agent=modeling_agent,
)

# Final task, handled by the submission agent. The result is written to
# submission.csv.
submission_task = Task(
    agent=submission_agent,
    description=(
        "Generate the submission.csv file based on the trained model's predictions."
    ),
    expected_output=(
        "submission.csv file adhering to the competition's format."
    ),
    output_file="submission.csv",
)

In [45]:
from crewai import Process
# Assemble the four agents and their tasks into one crew that runs the tasks
# in list order (Process.sequential).
# NOTE(review): with memory=True the recorded run logs repeated rag_storage
# "short_term"/"entities" errors — confirm the memory embedder has working
# credentials, or configure one explicitly.
crew = Crew(
    process=Process.sequential,
    agents=[
        Competition_overview_agent,
        Data_Preprocessing_agent,
        modeling_agent,
        submission_agent,
    ],
    tasks=[
        competition_overview_task,
        data_preprocessing_task,
        modeling_task,
        submission_task,
    ],
    memory=True,
    verbose=True,
)



In [53]:
import os
from pathlib import Path

import pandas as pd

# Resolve the data directory relative to the project (the same
# "generative_ai_project/data" folder the directory search tool points at)
# rather than a machine-specific absolute Windows path. Set KAGGLE_DATA_DIR
# to override the location.
DATA_DIR = Path(os.getenv("KAGGLE_DATA_DIR", "generative_ai_project/data"))

# Number of training rows serialised into the prompt. The previous
# `df.head(len(df))` is the entire frame, so the whole CSV was interpolated
# into the {dataset} placeholder of the preprocessing agent's goal; the
# recorded run ends in an LLM request timeout, which an oversized prompt
# likely contributes to. Raise this value only if the model's context window
# allows it.
MAX_PROMPT_ROWS = 50

df = pd.read_csv(DATA_DIR / "train.csv")

# Values substituted into the {competition_url} and {dataset} placeholders
# when the crew is kicked off.
kaggle_inputs = {
    'competition_url': 'https://www.kaggle.com/competitions/playground-series-s4e12',
    'dataset': df.head(MAX_PROMPT_ROWS).to_json(orient='records'),
}

In [55]:
from IPython.display import Markdown,display

# Run the crew with the prepared inputs, then render the raw text of the
# result as Markdown in the cell output.
result = crew.kickoff(
    inputs=kaggle_inputs,
)
display(
    Markdown(result.raw)
)

2024-12-25 20:46:35,520 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
2024-12-25 20:46:37,858 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'


[1m[95m# Agent:[00m [1m[92mKaggle Competition Overview Agent[00m
[95m## Task:[00m [92mAnalyze the Kaggle competition's overview, description, evaluation metric, submission format, and guidelines.[00m


[1m[95m# Agent:[00m [1m[92mKaggle Competition Overview Agent[00m
[95m## Using tool:[00m [92mGoogle Search[00m
[95m## Tool Input:[00m [92m
"{\"search_query\": \"Kaggle Playground Series Season 4, Episode 12\"}"[00m
[95m## Tool Output:[00m [92m

Search results: Title: Regression with an Insurance Dataset
Link: https://www.kaggle.com/competitions/playground-series-s4e12
Description: The goal of the Tabular Playground Series is to provide the Kaggle community with a variety of fairly light-weight challenges that can be used to learn and ...
---
Title: Leaderboard - Regression with an Insurance Dataset
Link: https://www.kaggle.com/competitions/playground-series-s4e12/leaderboard
Description: Regression with an Insurance Dataset. Playground Series - Season 4, Episode

2024-12-25 20:47:02,937 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
2024-12-25 20:47:06,576 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'


[1m[95m# Agent:[00m [1m[92mKaggle Competition Overview Agent[00m
[95m## Task:[00m [92mAnalyze the Kaggle competition's overview, description, evaluation metric, submission format, and guidelines.[00m


[1m[95m# Agent:[00m [1m[92mKaggle Competition Overview Agent[00m
[95m## Using tool:[00m [92mGoogle Search[00m
[95m## Tool Input:[00m [92m
"{\"search_query\": \"Kaggle Playground Series Season 4, Episode 12\"}"[00m
[95m## Tool Output:[00m [92m

Search results: Title: Regression with an Insurance Dataset
Link: https://www.kaggle.com/competitions/playground-series-s4e12
Description: The goal of the Tabular Playground Series is to provide the Kaggle community with a variety of fairly light-weight challenges that can be used to learn and ...
---
Title: Leaderboard - Regression with an Insurance Dataset
Link: https://www.kaggle.com/competitions/playground-series-s4e12/leaderboard
Description: Regression with an Insurance Dataset. Playground Series - Season 4, Episode

2024-12-25 20:47:43,752 - 8492 - rag_storage.py-rag_storage:90 - ERROR: Error during short_term save: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Fee

2024-12-25 20:48:25,249 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during short_term search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'
2024-12-25 20:48:28,808 - 8492 - rag_storage.py-rag_storage:119 - ERROR: Error during entities search: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'


[1m[95m# Agent:[00m [1m[92mData Preprocessing Agent[00m
[95m## Task:[00m [92mPerform thorough data preprocessing on the dataset to ensure it is clean, consistent, and ready for modeling. This includes detecting and handling null or missing values using appropriate imputation techniques, such as mean, median, or mode imputation, or advanced methods like KNN imputation. The task also involves identifying and treating outliers using methods like the IQR method or Z-score analysis, ensuring that the data distribution remains valid and non-skewed. Duplicate records will be detected and removed to ensure data integrity. Inconsistent data formats will be addressed, ensuring that numerical columns are properly formatted and that categorical variables are correctly encoded using techniques such as one-hot encoding, label encoding, or ordinal encoding based on the nature of the data. This task ensures that the dataset is fully preprocessed, cleaned, and transformed, enabling it to be us

2024-12-25 21:02:10,853 - 8492 - llm.py-llm:170 - ERROR: LiteLLM call failed: litellm.Timeout: APITimeoutError - Request timed out. 
error_str: Request timed out.


Timeout: litellm.Timeout: APITimeoutError - Request timed out. 
error_str: Request timed out.