In [58]:
import pandas as pd
import glob
import os
# Directory that holds the raw CSV exports to be combined.
folder_path = '/Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data'

# Collect every '*.csv' directly inside folder_path (glob here is non-recursive).
csv_pattern = os.path.join(folder_path, '*.csv')
csv_files = glob.glob(csv_pattern)
# Function to process files based on their identifier (1, 2, 3)
def process_files(files, identifier):
    """Load each CSV in ``files``, tag it with a fund type, and concatenate.

    The fund type is derived from the file's base name: the first of
    'Debt', 'Equity', 'Hybrid', 'Commodity' (checked in that order) that
    occurs in the name is written to a 'Fund Type' column. A file whose
    name contains none of them is still included, but no 'Fund Type'
    column is added to its frame.

    Args:
        files: Paths of the CSV files to read.
        identifier: Label used only in the progress messages.

    Returns:
        One DataFrame holding the rows of all files with a fresh integer
        index, or None when ``files`` is empty.
    """
    # Order matters: it mirrors the precedence used when a name contains
    # more than one keyword.
    fund_types = ('Debt', 'Equity', 'Hybrid', 'Commodity')
    dataframes_list = []

    for file in files:
        df = pd.read_csv(file)

        # Classify on the base name only; testing the full path would let a
        # directory called e.g. "Debt_funds" mislabel every file inside it.
        file_name = os.path.basename(file)
        fund_type = next((ft for ft in fund_types if ft in file_name), None)
        if fund_type is not None:
            df['Fund Type'] = fund_type

        print(f"Processed file: {file}")
        dataframes_list.append(df)

    # Nothing to combine: report it and signal the caller with None.
    if not dataframes_list:
        print(f"No files with '{identifier}' in the name were found.")
        return None

    combined_df = pd.concat(dataframes_list, ignore_index=True)
    print(f"Combined DataFrame {identifier}: Length = {len(combined_df)}")
    return combined_df

# Group the CSVs by the digit that appears in their file name. The test is made
# against the base name only, so a digit somewhere in the directory path cannot
# pull every file into a group.
files_with_one = [file for file in csv_files if '1' in os.path.basename(file)]
files_with_two = [file for file in csv_files if '2' in os.path.basename(file)]
files_with_three = [file for file in csv_files if '3' in os.path.basename(file)]

# Build one concatenated DataFrame per group (None when a group has no files).
df_1 = process_files(files_with_one, '1')
df_2 = process_files(files_with_two, '2')
df_3 = process_files(files_with_three, '3')

# Merge the three groups on the common column (assumed to be the first column).
if df_1 is not None and df_2 is not None and df_3 is not None:
    combined_df = pd.merge(df_1, df_2, on=df_1.columns[0], how='outer')
    combined_df = pd.merge(combined_df, df_3, on=combined_df.columns[0], how='outer')

    # Step 1: pandas renames non-key columns that exist on both sides of a
    # merge with '_x' / '_y' suffixes. Drop every suffixed column; only
    # columns that came through a merge without a suffix remain.
    columns_to_drop = [col for col in combined_df.columns if col.endswith(('_x', '_y'))]
    combined_df = combined_df.drop(columns=columns_to_drop)

    # Step 2: Exclude any column whose name contains the substring 'Other'.
    columns_to_keep = [col for col in combined_df.columns if 'Other' not in col]
    combined_df = combined_df[columns_to_keep]

    print("Combined DataFrame after dropping '_x', '_y' and excluding '%Other%' columns:")
    # print(combined_df.columns)
else:
    # NOTE: combined_df is not assigned on this path.
    print("One or more DataFrames were empty, skipping the merge.")

Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Equity_1.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Hybrid_1.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Debt_1.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Commodity_1.csv
Combined DataFrame 1: Length = 1461
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Equity_2.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Hybrid_2.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Debt_2.csv
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Commodity_2.csv
Combined DataFrame 2: Length = 1461
Processed file: /Users/souravm/Documents/fund_analysis/mutualfund_analysis/Raw_data/MF_Equity_3.csv
Processed file: /Users/sou

In [57]:
# Print a summary of the merged frame: index range, columns, non-null counts, dtypes.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1467 entries, 0 to 1466
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Name                   1467 non-null   object 
 1   Expense Ratio          1467 non-null   float64
 2   Absolute Returns - 3M  1467 non-null   float64
 3   Absolute Returns - 6M  1467 non-null   float64
 4   Absolute Returns - 1Y  1467 non-null   float64
 5   CAGR 3Y                1467 non-null   float64
 6   CAGR 5Y                1467 non-null   float64
 7   CAGR 10Y               1467 non-null   float64
 8   Alpha                  1467 non-null   float64
 9   Volatility             1467 non-null   float64
 10  Category St Dev        1467 non-null   float64
 11  SEBI Risk Category     1467 non-null   object 
 12  % Debt Holding         1467 non-null   float64
 13  % Equity Holding       1467 non-null   float64
 14  % Largecap Holding     1467 non-null   float64
 15  % Mi