In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

def clean_data(data):
    data = data.copy()

    numeric_columns = ['min_sip', 'min_lumpsum', 'expense_ratio', 'fund_size_cr',
                       'fund_age_yr', 'sortino', 'alpha', 'sd', 'beta', 'sharpe',
                       'risk_level', 'rating', 'returns_1yr', 'returns_3yr', 'returns_5yr']

    for column in numeric_columns:
        if column in data.columns:
            data[column] = pd.to_numeric(data[column], errors='coerce')
            data[column].fillna(data[column].median(), inplace=True)

    data = pd.get_dummies(data, columns=['amc_name', 'category', 'sub_category'], drop_first=True)
    return data

def calculate_return(yearly_return, months):
    return yearly_return * (months / 12)

def feature_engineering_and_grid_search(data, months):
    print(f"Feature engineering and grid search for {months}-month returns...")

    basic_features = ['expense_ratio', 'fund_size_cr', 'fund_age_yr',
                      'sortino', 'alpha', 'sd', 'beta', 'sharpe', 'risk_level']

    X = data[basic_features]

    if months <= 12:
        y = data['returns_1yr'].apply(lambda x: calculate_return(x, months))
    elif months <= 36:
        y = data['returns_3yr'].apply(lambda x: calculate_return(x, months))
    else:
        y = data['returns_5yr'].apply(lambda x: calculate_return(x, months))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    selector = SelectKBest(score_func=f_regression, k='all')
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = selector.transform(X_test_scaled)

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'min_child_weight': [1, 5, 10]
    }

    grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42),
                               param_grid=param_grid, cv=3, scoring='r2', verbose=1)
    grid_search.fit(X_train_selected, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

    y_pred_test = best_model.predict(X_test_selected)

    r2 = r2_score(y_test, y_pred_test)
    mse = mean_squared_error(y_test, y_pred_test)
    rmse = np.sqrt(mse)

    print(f"R² Score: {r2:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")

    return best_model, selector, scaler, r2, mse, rmse

def predict_returns(data, months, recommendations=5):
    print(f"Predicting {months}-month returns...")
    processed_data = clean_data(data)

    try:
        model, selector, scaler, r2, mse, rmse = feature_engineering_and_grid_search(processed_data, months)

        all_data_scaled = scaler.transform(processed_data[['expense_ratio', 'fund_size_cr', 'fund_age_yr',
                                                           'sortino', 'alpha', 'sd', 'beta', 'sharpe', 'risk_level']])
        all_data_selected = selector.transform(all_data_scaled)
        predictions = model.predict(all_data_selected)

        results = pd.DataFrame({
            'fund_name': data['scheme_name'],
            f'predicted_{months}m_return': predictions,
            'risk_level': data['risk_level'],
            'expense_ratio': data['expense_ratio']
        })

        top_recommendations = results.sort_values(f'predicted_{months}m_return', ascending=False).head(recommendations)

        print("Prediction completed.")
        return top_recommendations, r2, mse, rmse

    except Exception as e:
        print(f"Error during prediction: {e}")
        return None

def run_analyzer():
    print("Welcome to the Investment Return Analyzer")

    try:
        data = pd.read_csv('/content/sample_data/comprehensive_mutual_funds_data.csv')
        print("Data loaded")

        while True:
            print("\nChoose prediction period:")
            print("1: One month")
            print("3: Three months")
            print("6: Six months")
            print("12: One year")

            months = int(input("Enter number of months (1, 3, 6, 12): "))
            recommendations = int(input("Enter number of recommendations (1-10): "))

            top_recommendations, r2, mse, rmse = predict_returns(data, months, recommendations)

            if top_recommendations is not None:
                print(top_recommendations)
                print(f"R² Score: {r2:.2f}")
                print(f"Mean Squared Error: {mse:.2f}")
                print(f"Root Mean Squared Error: {rmse:.2f}")

            continue_analysis = input("Analyze another period? (yes/no): ").strip().lower()
            if continue_analysis != 'yes':
                break

    except FileNotFoundError:
        print("Data file not found")

    except Exception as e:
        print(f"Unexpected error: {e}")




run_analyzer()


Welcome to the Investment Return Analyzer
Data loaded

Choose prediction period:
1: One month
3: Three months
6: Six months
12: One year
Enter number of months (1, 3, 6, 12): 6
Enter number of recommendations (1-10): 5
Predicting 6-month returns...
Feature engineering and grid search for 6-month returns...
Error during prediction: 'super' object has no attribute '__sklearn_tags__'
Unexpected error: cannot unpack non-iterable NoneType object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].median(), inplace=True)
