In [3]:
from openai import OpenAI
import os

# Typhoon API

In [4]:
# Prefer the OPENTYPHOON_API_KEY environment variable so a real key never has
# to be pasted into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so behaviour is unchanged when the variable is unset.
OPENTYPHOON_API_KEY = os.environ.get("OPENTYPHOON_API_KEY", "**ADD KEY**")

# OpenAI SDK client pointed at the Typhoon endpoint instead of the default host.
client = OpenAI(
    api_key=OPENTYPHOON_API_KEY,
    base_url="https://api.opentyphoon.ai/v1",
)

# Chat example

In [5]:
%%time
stream = client.chat.completions.create(
    model="typhoon-v1.5-instruct",
    messages=[
        {
            "role": "system",
            "content": "You are the best data scientist in inventory mangaement"
        },
        {
            "role": "user",
            "content": "Give me an example of inventory management"
        }
    ],
    max_tokens=300,
    temperature=0.8,
    top_p=1,
    stream=False,
)

CPU times: user 13.3 ms, sys: 9.41 ms, total: 22.8 ms
Wall time: 4.26 s


In [6]:
# The request used stream=False, so `stream` is one complete response object;
# display the text of its first (and only requested) choice.
stream.choices[0].message.content

'Sure, let\'s consider a simple example of inventory management for an online retail store. There are multiple products that need to be tracked and managed. The goal is to maintain sufficient stock levels while minimizing storage costs, reducing waste, and maximizing sales.\n\nHere is a table with some example data:\n| Product ID | Product Name | Quantity | Reorder Point | Safety Stock |\n| --- | --- | --- | --- | --- |\n| 1 | Smartphone | 10 | 5 | 3 |\n| 2 | Tablet | 20 | 10 | 5 |\n| 3 | Headphones | 15 | 8 | 4 |\n\nIn this table, "Product ID" is a unique identifier for each product, "Product Name" is the name of the product, "Quantity" is the current stock level, "Reorder Point" is the minimum stock level required before reordering, and "Safety Stock" is the buffer stock kept to protect against unexpected demand or delays in receiving new stock.\n\nTo manage the inventory effectively, the retailer needs to monitor the sales and reorder when the quantity falls below the reorder point.

# LangChain

In [7]:
# NOTE(review): the first import rebinds the name `OpenAI`, which the notebook's
# first cell imported from the `openai` package. Re-running the Typhoon client
# cell after this one would therefore use LangChain's class instead. Neither
# `OpenAI` nor `LLMChain` is used by the visible cells that follow - consider
# removing that import.
from langchain import OpenAI, LLMChain
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

In [8]:
# The key itself is defined in the Typhoon API cell above; the former
# `OPENTYPHOON_API_KEY = OPENTYPHOON_API_KEY` self-assignment did nothing and
# has been removed.

# Typhoon endpoint; the agent classes defined later read this module-level name.
base_url = "https://api.opentyphoon.ai/v1"

# LangChain chat model wrapper for the same Typhoon model used above.
llm = ChatOpenAI(
    model="typhoon-v1.5-instruct",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=OPENTYPHOON_API_KEY,
    base_url=base_url,
)

In [9]:
# Display the chat model's repr to check the configured model name and endpoint.
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1352b6790>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x134ce1490>, model_name='typhoon-v1.5-instruct', temperature=0.0, openai_api_key=SecretStr('**********'), openai_api_base='https://api.opentyphoon.ai/v1', openai_proxy='')

In [10]:
# Build the two-turn conversation: an instruction for the model followed by
# the text it should act on.
translation_instruction = SystemMessage(content="Translate the following from English into Thai")
text_to_translate = HumanMessage(content="dog!")
messages = [translation_instruction, text_to_translate]


In [11]:
# Send the system + human messages to the model and show only the reply text.
llm.invoke(messages).content

'สุนัข!'

# LangChain Table Agent

In [12]:
import pandas as pd
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [13]:
class pandas_code_Agent():
    """LangChain pandas-DataFrame agent bundled with a default prompt.

    The agent is built once in ``__init__`` from a chat model, a DataFrame and
    a prefix (system-style) prompt; ``create_code`` then invokes it.
    """

    def __init__(self, dataframe, prefix_prompt, input_prompt, llm=None) -> None:
        """Build the underlying DataFrame agent.

        Args:
            dataframe: DataFrame handed to ``create_pandas_dataframe_agent``.
            prefix_prompt: Text passed as the agent's ``prefix``.
            input_prompt: Default prompt used by ``create_code``.
            llm: Optional chat model to use. When omitted, a ``ChatOpenAI``
                client for ``typhoon-v1.5-instruct`` is created from the
                notebook-level ``OPENTYPHOON_API_KEY`` and ``base_url``
                (the original behaviour).
        """
        if llm is None:
            llm = ChatOpenAI(
                model="typhoon-v1.5-instruct",
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2,
                api_key=OPENTYPHOON_API_KEY,
                base_url=base_url,
            )
        self.llm = llm
        self.dataframe = dataframe
        self.prefix_prompt = prefix_prompt
        self.input_prompt = input_prompt
        self.agent = create_pandas_dataframe_agent(
            self.llm,
            self.dataframe,
            verbose=True,
            agent_type=AgentType.OPENAI_FUNCTIONS,
            prefix=self.prefix_prompt,
            # SECURITY(review): this opt-in lets the agent run model-generated
            # Python in this kernel. Only use it with trusted data and prompts.
            allow_dangerous_code=True,
        )

    def create_code(self, input_prompt=None):
        """Invoke the agent and return its raw result.

        Args:
            input_prompt: Optional prompt for this call only; when ``None``
                the prompt given at construction time is used.

        Returns:
            Whatever ``self.agent.invoke`` returns (in this notebook's recorded
            runs, a dict with ``'input'`` and ``'output'`` keys).
        """
        prompt = self.input_prompt if input_prompt is None else input_prompt
        return self.agent.invoke(prompt)

In [14]:
# Load the CSV (path is relative to the notebook's working directory) into the
# DataFrame that is handed to the pandas agent below.
df = pd.read_csv('./stock_full.csv')

In [15]:
# Agent over `df` that is asked for time-based feature extraction.
# ("scientist" was misspelled in the prefix prompt that is sent to the model.)
pandas_agent = pandas_code_Agent(dataframe=df,prefix_prompt="you are the best data scientist, that give output ONLY A CODE",
                               input_prompt="Do feature extraction from time in this data frame")

In [16]:
# Run the agent on its stored prompt; the agent was created with verbose=True,
# so the executor trace is echoed below the cell.
answer = pandas_agent.create_code()



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo perform feature extraction on the time-related features in the given DataFrame, you can use the following Python code:

```python
import pandas as pd
from datetime import datetime

# Assuming 'df' is your DataFrame
df = pd.DataFrame({'day': [17, 10, 31, 29, 30],
                   'onth': [6, 8, 7, 9, nan],
                   'target': [0, 1.16162, 0, 0, 97.4578]})

# Extracting year and month names
df['year'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).year) \
   .fillna(method='ffill').astype(int)
df['month_name'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).strftime('%B')) \
   .fillna(method='ffill')

# Printing the updated DataFrame
print(df)
```

This code will create a new DataFrame with additional columns for the extracted features: 'year' and 'onth_name'. The 'year' column is created by filling the missing values with the previous non-missing value's year. The 'onth_name' column is created by conv

In [17]:
# Inspect the full result returned by the agent.
answer

{'input': 'Do feature extraction from time in this data frame',
 'output': "To perform feature extraction on the time-related features in the given DataFrame, you can use the following Python code:\n\n```python\nimport pandas as pd\nfrom datetime import datetime\n\n# Assuming 'df' is your DataFrame\ndf = pd.DataFrame({'day': [17, 10, 31, 29, 30],\n                   'onth': [6, 8, 7, 9, nan],\n                   'target': [0, 1.16162, 0, 0, 97.4578]})\n\n# Extracting year and month names\ndf['year'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).year) \\\n   .fillna(method='ffill').astype(int)\ndf['month_name'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).strftime('%B')) \\\n   .fillna(method='ffill')\n\n# Printing the updated DataFrame\nprint(df)\n```\n\nThis code will create a new DataFrame with additional columns for the extracted features: 'year' and 'onth_name'. The 'year' column is created by filling the missing values with the previous non-missing value's year. The

In [18]:
# Only the generated text; this is what is later passed to the selection agent.
answer['output']

"To perform feature extraction on the time-related features in the given DataFrame, you can use the following Python code:\n\n```python\nimport pandas as pd\nfrom datetime import datetime\n\n# Assuming 'df' is your DataFrame\ndf = pd.DataFrame({'day': [17, 10, 31, 29, 30],\n                   'onth': [6, 8, 7, 9, nan],\n                   'target': [0, 1.16162, 0, 0, 97.4578]})\n\n# Extracting year and month names\ndf['year'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).year) \\\n   .fillna(method='ffill').astype(int)\ndf['month_name'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).strftime('%B')) \\\n   .fillna(method='ffill')\n\n# Printing the updated DataFrame\nprint(df)\n```\n\nThis code will create a new DataFrame with additional columns for the extracted features: 'year' and 'onth_name'. The 'year' column is created by filling the missing values with the previous non-missing value's year. The 'onth_name' column is created by converting the month number to a string r

# LangChain Agent

In [19]:
class normal_Agent():
    """Plain chat wrapper: one fixed system prompt plus a per-call user message.

    The misspelled parameter/attribute names (``system_prmopt``,
    ``user_prmopt``) are kept as-is so existing keyword callers keep working.
    """

    def __init__(self, system_prmopt, llm=None) -> None:
        """Store the system prompt and the chat model.

        Args:
            system_prmopt: System message text sent with every request.
            llm: Optional chat model to use. When omitted, a ``ChatOpenAI``
                client for ``typhoon-v1.5-instruct`` is created from the
                notebook-level ``OPENTYPHOON_API_KEY`` and ``base_url``
                (the original behaviour).
        """
        if llm is None:
            llm = ChatOpenAI(
                model="typhoon-v1.5-instruct",
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2,
                api_key=OPENTYPHOON_API_KEY,
                base_url=base_url,
            )
        self.llm = llm
        self.system_prmopt = system_prmopt

    def create_code(self, user_prmopt):
        """Send the system prompt plus ``user_prmopt`` and return the reply text.

        Args:
            user_prmopt: Content of the human message for this call.

        Returns:
            The ``content`` attribute of the model's response.
        """
        reply = self.llm.invoke([
            SystemMessage(content=self.system_prmopt),
            HumanMessage(content=user_prmopt),
        ])
        return reply.content
    
    

In [20]:
# Few-shot system prompt: one worked Input/Output pair showing the model that
# only the feature-extraction statements should be returned.
# Fix: the example statements for `thickness_mean` were not valid Python
# (missing `=`, and an unclosed parenthesis in the Input copy), so the
# demonstration itself taught the model broken code.
# NOTE(review): the example's prose mentions 'year'/'month_name' while its code
# derives weight/thickness features - confirm which one the example should show.
select_agent = normal_Agent("""From the text that I given. You have to give me only a feature extraction code.
                            Input:
                            The feature extraction code provided is written in Python using the Pandas library. It extracts two time-related features from the given DataFrame: 'year' and 'month_name'. Here's the code snippet again for reference:

                            ```python
                            import pandas as pd
                            from datetime import datetime

                            # Assuming 'df' is your DataFrame
                            df = pd.DataFrame({'day': [17, 10, 31, 29, 30],
                                               'onth': [6, 8, 7, 9, nan],
                                               'target': [0, 1.16162, 0, 0, 97.4578]})

                            # Extracting
                            df['Weight (kg.)/Length (m.)'] = df['Weight (kg.)']/df['Length (m.)']
                            df['thickness_mean'] = df['thickness'].mean()

                            # Printing the updated DataFrame
                            print(df)
                            ```
                            
                            Output:
                            df['Weight (kg.)/Length (m.)'] = df['Weight (kg.)']/df['Length (m.)']
                            df['thickness_mean'] = df['thickness'].mean()""")

In [21]:
# Pass the pandas agent's text through the few-shot selector; print() is used so
# the newlines in the reply are rendered instead of shown as escapes.
print(select_agent.create_code(answer['output']))

Thank you for providing the feature extraction code. Based on the code you provided, I will now extract the relevant lines of code that perform the feature extraction.

Here are the lines of code that extract the 'year' and 'month_name' features from the given DataFrame:

```python
df['year'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).year) \
   .fillna(method='ffill').astype(int)
df['month_name'] = df['month'].apply(lambda x: datetime(2000+x, 1, 1).strftime('%B')) \
   .fillna(method='ffill')
```

These lines of code will create a new DataFrame with additional columns for the extracted features: 'year' and 'month_name'. The 'year' column is created by filling the missing values with the previous non-missing value's year. The 'month_name' column is created by converting the month number to a string representation of the month name, filling the missing values with the previous non-missing value's month name.


# Full pipeline

In [22]:
# Read the DataFrame for the pipeline (same CSV file that was loaded earlier in
# the notebook).
df = pd.read_csv('./stock_full.csv')

In [23]:
# Create the data scientist agents ("scientist" was misspelled in every prefix
# prompt sent to the model).
# NOTE(review): pandas_agent_2 and pandas_agent_3 are configured identically
# here - confirm whether they were meant to differ.
pandas_agent_1 = pandas_code_Agent(dataframe=df,prefix_prompt="you are the best data scientist, that expert in Machine Learning",
                               input_prompt="Do feature extraction from time in this data frame")
pandas_agent_2 = pandas_code_Agent(dataframe=df,prefix_prompt="you are the best data scientist, that give output ONLY A CODE",
                               input_prompt="Do feature extraction from time in this data frame")
pandas_agent_3 = pandas_code_Agent(dataframe=df,prefix_prompt="you are the best data scientist, that give output ONLY A CODE",
                               input_prompt="Do feature extraction from time in this data frame")


# Feature-selection agent (plain chat wrapper with a one-line system prompt).
select_code_agent = normal_Agent("you are Machine learning Engineer")



# rule base to get all code

#
# select_feature_agent


In [26]:
# Persona shared by all three data-scientist agents.
expert_prefix = "You are an expert data scientist with deep knowledge in feature engineering, data preprocessing, and machine learning. (GIVE ONLY CODE OUTPUT, USE ONLY NUMPY, PANDAS, SCIKIT-LEARN)"

# Task 1: general feature relevance and preprocessing.
relevance_task = (
    "Your task is to analyze the given dataset, represented as a pandas DataFrame from `pd.read_csv`, and extract the most relevant features to improve the performance of a machine learning model. "
    "Consider techniques such as removing irrelevant or redundant features, handling missing values, encoding categorical variables, and scaling numerical features."
)

# Task 2: correlation analysis, dimensionality reduction, feature creation.
transform_task = (
    "Your task is to enhance the given dataset's usability for machine learning models by identifying the most important features and applying appropriate transformations. The dataset is loaded into a pandas DataFrame using `pd.read_csv`.\n"
    "Consider the following steps:\n"
    "1. **Correlation Analysis**: Calculate and analyze correlations between features to identify highly correlated pairs. Suggest removing or combining features with high collinearity.\n"
    "2. **Dimensionality Reduction**: If applicable, apply dimensionality reduction techniques (e.g., PCA, LDA) to reduce the feature space while retaining the most informative features.\n"
    "3. **Feature Creation**: Generate new features from existing ones by applying transformations (e.g., polynomial features, logarithmic transformations) that could potentially improve model performance."
)

# Task 3: dtype optimisation, outlier handling, normalisation, scaling advice.
cleaning_task = (
    "Your task is to thoroughly clean and prepare the given dataset for feature extraction and machine learning model training. The dataset is loaded into a pandas DataFrame using `pd.read_csv`.\n"
    "Follow these steps:\n"
    "1. **Data Type Optimization**: Review the data types of all features and optimize them to reduce memory usage (e.g., convert object types to categories if they have a low cardinality).\n"
    "2. **Outlier Detection and Treatment**: Detect and handle outliers in numerical features using appropriate methods (e.g., IQR, Z-score) to minimize their impact on the model.\n"
    "3. **Normalization**: Normalize or standardize numerical features to bring them to a common scale, particularly if the data will be used with distance-based models or gradient descent methods.\n"
    "4. **Feature Scaling for Different Model Types**: Provide suggestions on different scaling methods that may be more suitable for different types of machine learning models (e.g., tree-based vs. linear models)."
)

# One agent per task, all over the same DataFrame and persona.
pandas_agent_1 = pandas_code_Agent(
    dataframe=df,
    prefix_prompt=expert_prefix,
    input_prompt=relevance_task,
)
pandas_agent_2 = pandas_code_Agent(
    dataframe=df,
    prefix_prompt=expert_prefix,
    input_prompt=transform_task,
)
pandas_agent_3 = pandas_code_Agent(
    dataframe=df,
    prefix_prompt=expert_prefix,
    input_prompt=cleaning_task,
)

# Plain chat agent intended for the feature-selection step.
select_code_agent = normal_Agent("you are Machine learning Engineer")



# rule base to get all code

#
# select_feature_agent

In [30]:
%%time
# Run the three agents one after another. Each result is bound as soon as its
# call returns, so earlier answers survive if a later call fails.
answer1 = pandas_agent_1.create_code()
answer2 = pandas_agent_2.create_code()
answer3 = pandas_agent_3.create_code()



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mFirst, let's import the necessary libraries and load the dataset. Then we will handle missing values, encode categorical variables, and scale numerical features. Finally, we will select the most relevant features based on their correlation with the target variable.

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

df = pd.read_csv('data.csv')

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
df['ยี่ห้อ'] = le.fit_transform(df['ยี่ห้อ'])
df['AZ'] = le.fit_transform(df['AZ'])
df['ลูกที่'] = le.fit_transform(df['ลูกที่'])
df['Colour'] = le.fit_transform(df['Colour'])
df['day'] = le.fit_transform(df['day'])
df['month'] = le.fit_transform(df['month'])

# Scale numerical features
scaler = StandardScaler()
df[['Weight (kg.)', 'Length (

In [38]:
# Show agent 1's generated text with its newlines rendered.
print(answer1['output'])

First, let's import the necessary libraries and load the dataset. Then we will handle missing values, encode categorical variables, and scale numerical features. Finally, we will select the most relevant features based on their correlation with the target variable.

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

df = pd.read_csv('data.csv')

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Encode categorical variables
le = LabelEncoder()
df['ยี่ห้อ'] = le.fit_transform(df['ยี่ห้อ'])
df['AZ'] = le.fit_transform(df['AZ'])
df['ลูกที่'] = le.fit_transform(df['ลูกที่'])
df['Colour'] = le.fit_transform(df['Colour'])
df['day'] = le.fit_transform(df['day'])
df['month'] = le.fit_transform(df['month'])

# Scale numerical features
scaler = StandardScaler()
df[['Weight (kg.)', 'Length (m.)', 'kg./m.']] = scaler.fit_transform(df[['Weight (kg.)', '

In [35]:
# Reduce agent 1's answer to feature-extraction code via the few-shot `select_agent`.
# NOTE(review): `select_code_agent`, created in the pipeline cell, is not used
# here - confirm which of the two agents is intended.
print(select_agent.create_code(answer1['output']))

The most relevant features are: ['ยี่ห้อ', 'AZ', 'ลูกที่', 'Colour', 'day']


In [39]:
# Show agent 2's generated text with its newlines rendered.
print(answer2['output'])

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('data.csv')

# Correlation Analysis
corr_matrix = df.corr()
print(corr_matrix)

# Identify highly correlated pairs
highly_correlated_pairs = [(corr_matrix.columns[i], corr_matrix.columns[j]) for i in range(len(corr_matrix.columns))
                            for j in range(i) if abs(corr_matrix.iloc[i, j]) > 0.95]

print(highly_correlated_pairs)

# Dimensionality Reduction
features = df.drop(['target'], axis=1)
labels = df['target']

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

pca = PCA(n_components=2)
features_reduced = pca.fit_transform(features_scaled)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(features_reduced, labels)

# Feature Creation
df['new_feature'] = df['feature'].apply(lambda x: x**2)

# Split dataset into training and testing

In [36]:
# Reduce agent 2's answer to feature-extraction code via the few-shot `select_agent`.
print(select_agent.create_code(answer2['output']))

The feature extraction code provided is written in Python using the Pandas, NumPy, and Scikit-learn libraries. It performs the following tasks:

1. Loads a dataset from a CSV file named 'data.csv'.
2. Calculates the correlation matrix of the dataset.
3. Identifies highly correlated pairs of features with a correlation coefficient greater than 0.95.
4. Performs dimensionality reduction using Principal Component Analysis (PCA) with 2 components.
5. Scales the data using StandardScaler.
6. Trains a logistic regression model on the reduced and scaled features.
7. Creates a new feature by squaring an existing feature.
8. Splits the dataset into training and testing sets with a test size of 0.2.
9. Evaluates the model's performance on the testing set using the accuracy score metric.


In [40]:
# Show agent 3's generated text with its newlines rendered.
print(answer3['output'])

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Step 1: Data Type Optimization
df = df.astype('float64') # Convert object types to float64

# Step 2: Outlier Detection and Treatment
numerical_features = ['AZ', 'ลูกที่', 'thickness', 'Weight (kg.)', 'Length (m.)', 'kg./m.', 'day', 'onth']
for col in numerical_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    # Handle outliers with Z-score method
    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
    df.loc[z_scores > 3, col] = np.nan
    
    # Impute missing values with median
    df[col].fillna(df[col].median(), inplace=True)

# Step 3: Normalization
scaler = StandardScaler()
df[['AZ', 'ลูกที่', 'thickness', 'Weight (kg.)', 'Leng

In [37]:
# Reduce agent 3's answer to feature-extraction code via the few-shot `select_agent`.
print(select_agent.create_code(answer3['output']))

```python
        ('imputer', SimpleImputer(strategy='median'), ['AZ', 'ลูกที่', 'thickness', 'Weight (kg.)', 'Length (m.)', 'kg./m.']),
        ('scaler', scaler, ['AZ', 'ลูกที่', 'thickness', 'Weight (kg.)', 'Length (m.)', 'kg./m.']),
        ('encoder', StandardScaler(), ['day', 'onth'])
    ])
],
    ('model', Pipeline([
        ('feature_selection', SelectKBest(chi2, k=5)),
        ('regression', LinearRegression())
    ]))
)

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")
```
