In [5]:
#
#
# Train the model 
#
#
# train_model.py
import pandas as pd  # handles and processes tabular data
from sklearn.ensemble import RandomForestClassifier  # ML model from sklearn.ensemble: a collection of decision trees
from sklearn.multioutput import MultiOutputClassifier  # wrapper that lets a classifier predict multiple targets
import joblib  # saves and loads Python objects; used here to store trained models

def train_graph_model(
    data_path="Recommendation_graph_data.csv",
    model_path="graph_recommender_model.pkl",
):
    """Train the graph-type recommender and persist it to disk.

    Reads the training table from ``data_path``, fits one random forest per
    label column (via ``MultiOutputClassifier``) and writes the fitted model
    to ``model_path`` with joblib.

    Parameters
    ----------
    data_path : str, optional
        CSV file holding the training data. It must contain the feature
        columns ``num_num``, ``num_cat`` and ``has_dt``; every other column
        is used as a label.
    model_path : str, optional
        Destination file for the serialized model.

    Returns
    -------
    None
        The function prints a confirmation message once the model is saved.
    """
    # Feature columns: kept in one place so X and y are always split on the
    # same names.
    feature_cols = ["num_num", "num_cat", "has_dt"]

    df = pd.read_csv(data_path)
    X = df[feature_cols]
    # Everything that is not a feature is a label column.
    y = df.drop(columns=feature_cols)

    # 100 trees per label; a fixed random_state keeps training reproducible.
    model = MultiOutputClassifier(
        RandomForestClassifier(n_estimators=100, random_state=42)
    )
    model.fit(X, y)

    joblib.dump(model, model_path)
    print("Model trained and saved!")

# Train only when this code is executed directly; importing it as a module
# must not start a training run.
if __name__ == "__main__":
    train_graph_model()

Model trained and saved!


In [1]:
import pandas as pd #handle data
import joblib #load pretrained models

def analyze_columns(df, min_dt_fraction=0.5):
    """Summarise the column types of ``df`` as the model's three features.

    Each column is classified as numeric, datetime or categorical:

    * numeric dtype -> counted in ``num_num``;
    * otherwise the column's non-null values are parsed with
      ``pd.to_datetime(errors="coerce")``; the column is treated as datetime
      when at least one value parses AND the parsed share of non-null values
      is at least ``min_dt_fraction``;
    * anything else -> counted in ``num_cat``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame restricted to the columns of interest.
    min_dt_fraction : float, optional
        Minimum share of non-null values that must parse as datetimes for a
        column to count as datetime. Pass ``0`` to get the previous
        "a single parseable value is enough" rule.

    Returns
    -------
    tuple
        ``(num_num, num_cat, has_dt)`` where ``has_dt`` is ``1`` if any
        datetime column was found and ``0`` otherwise. Datetime columns are
        not added to either count.
    """
    import warnings  # local import: only needed to quiet the parsing probe

    num_num = 0
    num_cat = 0
    has_dt = 0

    for col in df.columns:
        series = df[col]

        if pd.api.types.is_numeric_dtype(series.dtype):
            num_num += 1
            continue

        # Probe only the non-null values so missing data does not dilute the
        # parsed share.
        non_null = series.dropna()
        with warnings.catch_warnings():
            # Parsing free text is expected to fail here; pandas' format
            # inference warnings would only be noise.
            warnings.simplefilter("ignore", UserWarning)
            parsed = pd.to_datetime(non_null, errors="coerce")

        n_parsed = int(parsed.notna().sum())
        # A lone date-like string in an otherwise textual column must not
        # turn the whole column into a datetime column.
        if n_parsed > 0 and n_parsed >= min_dt_fraction * len(non_null):
            has_dt = 1
        else:
            num_cat += 1

    return num_num, num_cat, has_dt


def get_user_columns(dataset_path):
    """Prompt the user for the columns of a CSV dataset to analyse.

    Prints the available column names, then keeps asking for a
    comma-separated list until at least one entered name matches a column.

    Parameters
    ----------
    dataset_path : str
        Path of the CSV file whose header defines the valid column names.

    Returns
    -------
    list of str
        The valid column names in the order they were first entered, without
        repeats. Unknown names are reported and left out.
    """
    df = pd.read_csv(dataset_path)
    print("\nAvailable columns:", df.columns.tolist())

    while True:
        raw = input("Enter columns (comma-separated): ")
        # Trim each entry and drop empty ones (e.g. from a trailing comma).
        requested = [name.strip() for name in raw.split(",")]
        requested = [name for name in requested if name]

        # dict.fromkeys removes repeats while keeping first-seen order; a
        # repeated name would otherwise select the same column twice and
        # produce a frame with duplicate labels.
        valid_cols = list(
            dict.fromkeys(name for name in requested if name in df.columns)
        )
        ignored = list(
            dict.fromkeys(name for name in requested if name not in df.columns)
        )

        if ignored:
            # Tell the user instead of dropping their input silently.
            print("Ignoring unknown columns:", ignored)

        if valid_cols:
            return valid_cols
        print("Error: No valid columns selected. Try again.")

def main():
    """Recommend graph types for user-selected columns of the dataset.

    Loads the saved model, asks the user which columns to use, turns those
    columns into the ``num_num`` / ``num_cat`` / ``has_dt`` features and
    prints every graph type the model predicts as suitable.

    Raises
    ------
    ValueError
        If the model returns a different number of outputs than there are
        graph names, since the names could then not be matched to outputs.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    model = joblib.load("graph_recommender_model.pkl")

    # Get user input
    dataset_path = "Datasets/titanic.csv"  # User database
    selected_cols = get_user_columns(dataset_path)

    # Analyze only the columns the user picked
    df = pd.read_csv(dataset_path)[selected_cols]
    num_num, num_cat, has_dt = analyze_columns(df)

    # Single-row frame whose column names match the training features
    input_df = pd.DataFrame(
        [[num_num, num_cat, has_dt]],
        columns=["num_num", "num_cat", "has_dt"],
    )

    pred = model.predict(input_df)

    # Assumes this order matches the label columns of the training CSV
    # -- TODO confirm against Recommendation_graph_data.csv.
    graphs = ["Bar", "Line", "Scatter", "Area", "Histogram", "Box",
              "Funnel", "Tree", "Pie"]

    # zip() would silently truncate on a mismatch and mislabel the outputs.
    if len(pred[0]) != len(graphs):
        raise ValueError(
            f"Model returned {len(pred[0])} outputs but {len(graphs)} "
            "graph names are defined"
        )

    recommendations = [g for g, p in zip(graphs, pred[0]) if p == 1]

    print("\nRecommended Graph Types:")
    if recommendations:
        print("- " + "\n- ".join(recommendations))
    else:
        print("No suitable graphs found")

# Run the interactive recommendation flow only when executed directly,
# not when this code is imported as a module.
if __name__ == "__main__":
    main()


Available columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Enter columns (comma-separated):  Age,Fare



Recommended Graph Types:
- Scatter
