In [1]:
import numpy as np
import pandas as pd

## Data retribval and mapping of category

In [2]:
data = pd.read_csv('./data_with_sub_categories.csv')
data.head()

Unnamed: 0,scheme_name,Scheme Codes,expense_ratio,risk_level,returns_1yr,returns_3yr,returns_5yr,Sub_Category
0,Aditya Birla SL Active Debt Multi-Mgr FoF-Dir ...,100033,0.27,0.2,4.0,6.5,6.9,FoFs Domestic
1,Aditya Birla SL Arbitrage Fund,100034,0.36,0.0,5.6,4.8,5.5,Arbitrage Mutual Funds
2,Aditya Birla SL Asset Allocator FoF-Dir Growth,100037,0.53,0.4,2.0,18.9,9.7,FoFs Domestic
3,Aditya Birla SL Balanced Advantage Fund,100038,0.61,0.5,4.5,18.6,9.7,Dynamic Asset Allocation or Balanced Advantage
4,Aditya Birla SL Banking&Financial Services-Dir...,100041,1.17,0.5,5.3,24.6,9.2,Sectoral / Thematic Mutual Funds


In [3]:
# Create a dictionary to map indices to scheme_codes
index_to_scheme_code = dict(zip(data.index, data['Scheme Codes']))

# Example: Get the scheme_code corresponding to index 0
print("Scheme Code for index 0:", index_to_scheme_code)

Scheme Code for index 0: {0: 100033, 1: 100034, 2: 100037, 3: 100038, 4: 100041, 5: 100042, 6: 100043, 7: 100044, 8: 100046, 9: 100047, 10: 100048, 11: 100049, 12: 100051, 13: 100052, 14: 100053, 15: 100054, 16: 100055, 17: 100056, 18: 100057, 19: 100058, 20: 100059, 21: 100060, 22: 100061, 23: 100062, 24: 100063, 25: 100064, 26: 100065, 27: 100066, 28: 100067, 29: 100956, 30: 100970, 31: 100971, 32: 100972, 33: 101313, 34: 101314, 35: 101315, 36: 101316, 37: 101317, 38: 101318, 39: 101591, 40: 112210, 41: 112211, 42: 112212, 43: 112213, 44: 112214, 45: 112215, 46: 112216, 47: 112217, 48: 112277, 49: 112278, 50: 112322, 51: 112323, 52: 112354, 53: 112355, 54: 112369, 55: 112679, 56: 112680, 57: 112681, 58: 112682, 59: 112683, 60: 112684, 61: 112712, 62: 118484, 63: 109255, 64: 109264, 65: 109269, 66: 110606, 67: 110607, 68: 110608, 69: 111585, 70: 111589, 71: 111590, 72: 111709, 73: 111710, 74: 113036, 75: 119333, 76: 119334, 77: 119340, 78: 119354, 79: 119399, 80: 119400, 81: 119415, 

In [4]:
unique_values = data['Sub_Category'].unique()
len(unique_values)

36

In [5]:
category_mapping = {unique_values[i]: i+1 for i in range(len(unique_values))}
category_mapping

{'FoFs Domestic': 1,
 'Arbitrage Mutual Funds': 2,
 'Dynamic Asset Allocation or Balanced Advantage': 3,
 'Sectoral / Thematic Mutual Funds': 4,
 'Banking and PSU Mutual Funds': 5,
 'Corporate Bond Mutual Funds': 6,
 'Credit Risk Funds': 7,
 'Dividend Yield Funds': 8,
 'Dynamic Bond': 9,
 'Large & Mid Cap Funds': 10,
 'Aggressive Hybrid Mutual Funds': 11,
 'Equity Savings Mutual Funds': 12,
 'Flexi Cap Funds': 13,
 'Floater Mutual Funds': 14,
 'Focused Funds': 15,
 'Large Cap Mutual Funds': 16,
 'Gilt Mutual Funds': 17,
 'Medium to Long Duration Funds': 18,
 'Liquid Mutual Funds': 19,
 'Low Duration Funds': 20,
 'Medium Duration Funds': 21,
 'Mid Cap Mutual Funds': 22,
 'Money Market Funds': 23,
 'Index Funds': 24,
 'Value Funds': 25,
 'Conservative Hybrid Mutual Funds': 26,
 'Ultra Short Duration Funds': 27,
 'Short Duration Funds': 28,
 'Small Cap Mutual Funds': 29,
 'ELSS Mutual Funds': 30,
 'Childrens Funds': 31,
 'Multi Asset Allocation Mutual Funds': 32,
 'Multi Cap Funds': 33,
 

## User Input 

In [6]:
def calculate_expense_ratio(investment_budget):
    # Define expense ratio range for the Indian market (0.01 to 0.05)
    min_expense_ratio = 0.01
    max_expense_ratio = 0.05
    
    # Normalize investment budget between 0 and 1
    normalized_budget = investment_budget / 10000  # Assuming maximum investment budget is 500000 INR
    
    # Calculate normalized expense ratio within the range
    normalized_expense_ratio = min_expense_ratio + (max_expense_ratio - min_expense_ratio) * normalized_budget
    
    # Apply bias to expense ratio
    biased_expense_ratio = normalized_expense_ratio
    
    # If expense ratio is greater than 0.5, set it to 0.5
    if biased_expense_ratio > 0.5:
        biased_expense_ratio = 0.5
    
    return biased_expense_ratio

In [7]:
# Works only on yearly income
def calculate_risk(age, income):
    if age <= 30:
        if income >= 1000000:
            return 6
        elif income >= 700000:
            return 5
        elif income >= 500000:
            return 4
        else:
            return 3
    elif age <= 40:
        if income >= 1200000:
            return 5
        elif income >= 800000:
            return 4
        elif income >= 600000:
            return 3
        else:
            return 2
    elif age <= 50:
        if income >= 1500000:
            return 4
        elif income >= 1000000:
            return 3
        elif income >= 800000:
            return 2
        else:
            return 1
    else:
        if income >= 2000000:
            return 3
        elif income >= 1500000:
            return 2
        elif income >= 1000000:
            return 1
        else:
            return 0



In [8]:
def normalize_risk(risk):
    # Normalize risk value between 0 and 1
    normalized_risk = risk / 6.0  # Dividing by the maximum possible value
    
    # Apply bias to expense ratio
    biased_normalized_risk = normalized_risk
    
    # If expense ratio is greater than 0.5, set it to 0.5
    if biased_normalized_risk > 0.5:
        biased_normalized_risk = 0.5
    
    return biased_normalized_risk
    

In [None]:
def get_user_inputs():
    age = int(input("Enter your age: "))
    income = float(input("Enter your income: "))
    investment_budget = float(input("Enter your investment budget: "))
    
    subcategories = []
    while True:
        subcategory = input("Enter a subcategory (or type 'done' to finish): ")
        if subcategory.lower() == 'done':
            break
        subcategories.append(float(subcategory))  # Convert input to float and append to list
    
    investment_term = input("Enter the investment term (short term, midterm, or high term): ")
    investment_term_map = {'short term': 1, 'midterm': 3, 'high term': 5}
    investment_term_numeric = investment_term_map.get(investment_term.lower(), None)
    
    return age, income, investment_budget, subcategories, investment_term_numeric

# Example usage:
age, income, investment_budget, subcategories, investment_term = get_user_inputs()
print("Age:", age)
print("Income:", income)
print("Investment Budget:", investment_budget)
print("Subcategories:", subcategories)
print("Investment Term:", investment_term)




Enter your age:  25
Enter your income:  1000000


In [None]:
def calculate_user_vector(age, income, investment_budget):
    expense_ratio = calculate_expense_ratio(investment_budget)
    risk = calculate_risk(age, income)
    normalised_risk = normalize_risk(risk)
    return [expense_ratio, normalised_risk]

In [None]:
vector = calculate_user_vector(age, income, investment_budget)
print("Vector:", vector)

## Filtering 

In [None]:
# Assuming you have already created the category_mapping dictionary
# Reverse the dictionary to map integers back to category names
reverse_category_mapping = {v: k for k, v in category_mapping.items()}

# Assuming you have already created the reverse_category_mapping dictionary
# Assuming you have an array of values called values

values = subcategories # Example array of values

# Create an empty list to store the category names
category_names = []

# Loop through the array of values and get the corresponding category names
for value in values:
    category_name = reverse_category_mapping.get(value, "Unknown")
    category_names.append(category_name)

print(category_names)

This is the new dataset after filtering 

In [None]:
filtered_df = data[data['Sub_Category'].isin(category_names)]
filtered_df

In [None]:
columns_to_drop = ['scheme_name', 'Scheme Codes', 'returns_3yr', 'returns_5yr']
first_year_returns_data = filtered_df.drop(columns=columns_to_drop)
first_year_returns_data

In [None]:
columns_to_drop = ['scheme_name', 'Scheme Codes', 'returns_1yr', 'returns_5yr']
third_year_returns_data = filtered_df.drop(columns=columns_to_drop)
third_year_returns_data

In [None]:
columns_to_drop = ['scheme_name', 'Scheme Codes', 'returns_1yr', 'returns_3yr']
fifth_year_returns_data = filtered_df.drop(columns=columns_to_drop)
fifth_year_returns_data

In [None]:
if investment_term == 1:
    # Copy data1 to a new DataFrame
    returns_data = first_year_returns_data.copy()
    user_data = {
    'expense_ratio': [vector[0]],
    'risk_level': [vector[1]],
    'returns_1yr': [0.75]
}
elif investment_term == 3:
    returns_data = third_year_returns_data.copy()
    user_data = {
    'expense_ratio': [vector[0]],
    'risk_level': [vector[1]],
    'returns_3yr': [0.75]
}
elif investment_term == 5:
    returns_data = fifth_year_returns_data.copy()
    user_data = {
    'expense_ratio': [vector[0]],
    'risk_level': [vector[1]],
    'returns_5yr': [0.75]
}
else:
    print("Invalid value. Please enter 1 or 3 or 5.")


In [None]:
returns_data

In [None]:
columns_to_drop = ['Sub_Category']
returns_data =  returns_data.drop(columns=columns_to_drop)
returns_data

In [None]:
# Assuming df is your DataFrame
index_to_row_number = {index: row_number for row_number, index in enumerate(returns_data.index)}

# Example usage:
print(index_to_row_number)

### normalisation

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler to the data and transform the data
returns_data_normalized = scaler.fit_transform(returns_data)

# Convert the scaled data back to a DataFrame
returns_data_normalized = pd.DataFrame(returns_data_normalized, columns=returns_data.columns)
first_year=returns_data_normalized.copy()

In [None]:
returns_data_normalized

In [None]:
user_df = pd.DataFrame(user_data)

# Append the user input data to the original DataFrame
returns_data_with_user = pd.concat([returns_data_normalized, user_df], ignore_index=True)

# Now your data DataFrame contains the user input data as a new row
print("Updated DataFrame:")
returns_data_with_user

## Cosine Similarity 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Calculate cosine similarity
returns_data_similarity = cosine_similarity(returns_data_with_user)

# Convert cosine similarity array to DataFrame for readability
returns_data_similarity = pd.DataFrame(returns_data_similarity, columns=returns_data_with_user.index, index=returns_data_with_user.index)

print("Cosine Similarity Matrix:")
returns_data_similarity

In [None]:
user_similarity=returns_data_similarity[len(returns_data_similarity)-1]

In [None]:
user_similarity

In [None]:
import numpy as np

# Exclude the last element of the array
array_without_last = user_similarity[:-1]

# Get the indices of the top 10 largest values
top_10_indices = np.argpartition(-array_without_last, kth=10)[:10]

# Print the indices of the top 10 largest values
print("Indices of the top 10 largest values:")
print(top_10_indices)

In [None]:
# Assuming you have already created the category_mapping dictionary
# Reverse the dictionary to map integers back to category names
reverse_category_mapping = {v: k for k, v in index_to_row_number.items()}

# Assuming you have already created the reverse_category_mapping dictionary
# Assuming you have an array of values called values

# Create an empty list to store the category names
scheme_index = []

# Loop through the array of values and get the corresponding category names
for value in top_10_indices:
    index = reverse_category_mapping.get(value, "Unknown")
    scheme_index.append(index)

print(scheme_index)

In [None]:
scheme_codes = []
for key in scheme_index:
    code = index_to_scheme_code.get(key)
    scheme_codes.append(code)

print(scheme_codes)

## Prediction

In [None]:
import requests
import pandas as pd
from prophet import Prophet

In [None]:
# Function create a dataframe by fetching the current data.import requests
def fetch_scheme_nav_data(scheme_code):
    api_url = f"https://api.mfapi.in/mf/{scheme_code}"
    response = requests.get(api_url)
    
    if response.status_code == 200:
        json_data = response.json()
        nav_data = json_data.get('data', [])
        
        if nav_data:
            nav_df = pd.DataFrame(nav_data)
            nav_df['date'] = pd.to_datetime(nav_df['date'], format='%d-%m-%Y')
            nav_df['nav'] = pd.to_numeric(nav_df['nav'], errors='coerce')
            nav_df['date'] = nav_df['date'].dt.strftime('%d-%m-%Y')
            nav_df = nav_df[::-1]
            return nav_df
        else:
            print("No NAV data available for the scheme.")
            return None
    else:
        print("Failed to fetch JSON data:", response.status_code)
        return None

In [None]:
# Function to predcit and return the predcited dataframe
def predict_nav(nav_df):
    
    # Check if the dataframe is small
    if len(nav_df) < 100:
        #print("Input dataframe is too small. Returning null dataframe.")
        return pd.DataFrame(columns=['ds', 'yhat'])
    # Step 1: Preprocess the data
    # Ensure 'date' column is in datetime format
    nav_df['ds'] = pd.to_datetime(nav_df['date'], format='%d-%m-%Y')
    # Rename 'nav' column to 'y' as required by Prophet
    nav_df.rename(columns={'nav': 'y'}, inplace=True)

    # Step 2: Train a forecasting model
    model = Prophet()
    model.fit(nav_df)

    # Step 3: Make predictions for the next 5 years
    future = model.make_future_dataframe(periods=365*5, freq='D')
    forecast = model.predict(future)
    
    # Filter out dates beyond the range of the original data
    future_dates = forecast[forecast['ds'] > nav_df['ds'].max()]['ds']

    # Create predicted_values DataFrame with only future dates and predicted NAV values
    predicted_values = forecast[forecast['ds'].isin(future_dates)][['ds', 'yhat']].copy()

    return predicted_values

In [None]:
# Function which will return the scores based on the predicted value
def calculate_returns(predicted_nav_df):
    
    
    if predicted_nav_df.empty:  # Check if the dataframe is empty
        return {}  # Return an empty dataframe if it's empty
    # Step 1: Sort the dataframe by date
    
    predicted_nav_df_sorted = predicted_nav_df.sort_values(by='ds')

    # Step 2: Calculate 1-year, 3-year, and 5-year returns
    def calculate_returns(nav_values, dates, years):
        returns = {}
        for year in years:
            # Calculate the NAV at the beginning and end of the period
            end_nav = nav_values.iloc[-1]
            start_date = max(dates) - pd.DateOffset(years=year)
            start_nav = nav_values[dates >= start_date].iloc[0]
                        # If start_nav is zero, replace it with 1
            if start_nav == 0:
                start_nav = 1
            # Calculate the return using the CAGR formula
            cagr = (end_nav / start_nav) ** (1/year) - 1
            returns[f"{year}-Year Return"] = cagr * 100  # Convert to percentage
        return returns

    # Extract NAV values and dates
    nav_values = predicted_nav_df_sorted['yhat']
    dates = predicted_nav_df_sorted['ds']

    # Calculate returns
    years = [1, 3, 5]
    returns = calculate_returns(nav_values, dates, years)

    return returns

In [None]:
# Function to create a dataframe with funds and scores
def calculate_scores_for_scheme_codes(scheme_codes, year):
    # Create an empty list to store the results
    results = []

    # Iterate over the list of scheme codes
    for scheme_code in scheme_codes:
        # Fetch NAV data for the scheme
        nav_df = fetch_scheme_nav_data(scheme_code)

        if nav_df is not None:
            # Predict NAV values
            predicted_nav_df = predict_nav(nav_df)

            # Calculate returns
            returns = calculate_returns(predicted_nav_df)
            #print(type(returns_dict))
            if not returns:
                results.append({
                    'Scheme Code': scheme_code,
                    '1-Year Return': -1,
                    '3-Year Return': -1,
                    '5-Year Return': -1
                })
            else:
                # Append scheme code and returns to the results list as a dictionary
                results.append({
                    'Scheme Code': scheme_code,
                    '1-Year Return': returns['1-Year Return'],
                    '3-Year Return': returns['3-Year Return'],
                    '5-Year Return': returns['5-Year Return']
                })
        else:
            print(f"Failed to fetch NAV data for scheme code {scheme_code}. Skipping...")

    # Create DataFrame from the list of dictionaries
    results_df = pd.DataFrame(results)

    returns = {}
    for index, row in results_df.iterrows():
        scheme_code = row['Scheme Code']
        if year == 1:
            returns[scheme_code] = row['1-Year Return']
        elif year == 3:
            returns[scheme_code] = row['3-Year Return']
        elif year == 5:
            returns[scheme_code] = row['5-Year Return']
        else:
            print("Invalid year! Please provide 1, 3, or 5 for year.")
            return None
    return returns


In [None]:
scores_df = calculate_scores_for_scheme_codes(scheme_codes,investment_term)
scores_df

In [None]:
def print_top_5_scores(scores_df):
    # Sort the dictionary by values in descending order
    sorted_scores = sorted(scores_df.items(), key=lambda x: x[1], reverse=True)
    
    # Print the top 5 key-value pairs
    print("Top 5 Key-Value Pairs:")
    for i, (key, value) in enumerate(sorted_scores[:5]):
        print(f"{i+1}. Key: {key}, Value: {value}")

In [None]:
print_top_5_scores(scores_df)