In [24]:
from zipfile import ZipFile
# Unpack the bundled data archive into the Resources directory.
# The context manager closes the archive handle as soon as extraction finishes,
# and the handle is no longer bound to `zip`, which shadowed the built-in zip().
with ZipFile('Resources/archive.zip') as archive:
    archive.extractall('Resources')

In [25]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

## Read In CSV data for Mutual Fund prices A-Z


In [26]:
# Load the "MutualFund Prices - A-E" CSV and preview its first rows.
df_AE = pd.read_csv(filepath_or_buffer=Path("Resources/MutualFund Prices - A-E.csv"))
df_AE.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,AAAAX,2007-07-31,10.02
1,AAAAX,2007-08-01,9.98
2,AAAAX,2007-08-02,10.01
3,AAAAX,2007-08-03,9.9
4,AAAAX,2007-08-06,9.93


In [27]:
# Load the "MutualFund Prices - F-K" CSV and preview its first rows.
df_FK = pd.read_csv(filepath_or_buffer=Path("Resources/MutualFund Prices - F-K.csv"))
df_FK.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,FAAAX,2013-11-20,10.08
1,FAAAX,2013-11-21,10.13
2,FAAAX,2013-11-22,10.17
3,FAAAX,2013-11-25,10.18
4,FAAAX,2013-11-26,10.2


In [28]:
# Load the "MutualFund Prices - L-P" CSV and preview its first rows.
df_LP = pd.read_csv(filepath_or_buffer=Path("Resources/MutualFund Prices - L-P.csv"))
df_LP.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,LAACX,2013-09-03,10.0
1,LAACX,2013-09-04,10.05
2,LAACX,2013-09-05,9.99
3,LAACX,2013-09-06,10.06
4,LAACX,2013-09-09,10.15


In [29]:
# Load the "MutualFund Prices - Q-Z" CSV and preview its first rows.
df_QZ = pd.read_csv(filepath_or_buffer=Path("Resources/MutualFund Prices - Q-Z.csv"))
df_QZ.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,QAACX,2003-02-12,10.03
1,QAACX,2003-02-13,10.03
2,QAACX,2003-02-14,10.18
3,QAACX,2003-02-18,10.35
4,QAACX,2003-02-19,10.28


In [30]:
# Concatenate the four price frames into a single DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source frame contributes its own 0-based labels, leaving duplicate index
# values that make label-based lookups ambiguous.
mutual_fund_df = pd.concat([df_AE, df_FK, df_LP, df_QZ], ignore_index=True)
mutual_fund_df.shape

(75657739, 3)

In [31]:
# Inspect the dtype of each column in the combined price frame.
mutual_fund_df.dtypes

fund_symbol       object
price_date        object
nav_per_share    float64
dtype: object

In [33]:
# Replace the string price_date column with its parsed datetime equivalent.
mutual_fund_df["price_date"] = pd.to_datetime(arg=mutual_fund_df["price_date"])

In [34]:
# Re-check the column dtypes after the price_date conversion.
mutual_fund_df.dtypes

fund_symbol              object
price_date       datetime64[ns]
nav_per_share           float64
dtype: object

# Read in Mutual Fund information CSV


In [60]:
# Load the "MutualFunds" CSV and preview its first rows.
mutualFunds = pd.read_csv(filepath_or_buffer=Path("Resources/MutualFunds.csv"))
mutualFunds.head()

Unnamed: 0,fund_symbol,quote_type,region,fund_short_name,fund_long_name,currency,initial_investment,subsequent_investment,fund_category,fund_family,...,peer_environment_avg,peer_environment_max,social_score,peer_social_min,peer_social_avg,peer_social_max,governance_score,peer_governance_min,peer_governance_avg,peer_governance_max
0,AAAAX,MutualFund,US,DWS RREEF Real Assets Fund - Cl,DWS RREEF Real Assets Fund - Class A,USD,1000.0,50.0,World Allocation,DWS,...,5.05,10.58,7.43,5.98,9.07,11.3,5.43,4.26,7.14,8.11
1,AAAEX,MutualFund,US,AllianzGI Health Sciences Fund,Virtus AllianzGI Health Sciences Fund Class P,USD,1000000.0,,Health,Virtus,...,1.43,3.27,12.96,9.52,12.87,15.08,8.4,4.96,7.68,10.3
2,AAAFX,MutualFund,US,,American Century One Choice Blend+ 2015 Portfo...,USD,2500.0,50.0,Target-Date 2015,American Century Investments,...,,,,,,,,,,
3,AAAGX,MutualFund,US,Thrivent Large Cap Growth Fund,Thrivent Large Cap Growth Fund Class A,USD,2000.0,50.0,Large Growth,Thrivent Funds,...,2.7,5.81,10.13,7.25,10.14,11.97,8.03,5.3,7.54,8.9
4,AAAHX,MutualFund,US,,American Century One Choice Blend+ 2015 Portfo...,USD,5000000.0,,Target-Date 2015,American Century Investments,...,,,,,,,,,,


In [61]:
# Count the distinct values in each object-dtype column; a count of 1 marks a
# constant column that is a candidate for dropping.
is_object_column = mutualFunds.dtypes == "object"
mutualFunds.loc[:, is_object_column].nunique()

fund_symbol              23783
quote_type                   1
region                       1
fund_short_name           3044
fund_long_name            6644
currency                     1
fund_category              119
fund_family                310
exchange_code                1
exchange_name                1
exchange_timezone            1
management_name           1388
management_bio            1300
management_start_date     2316
investment_strategy       2245
inception_date            4499
investment_type              3
size_type                    3
top10_holdings            2347
returns_as_of_date          51
esg_peer_group              86
dtype: int64

In [62]:
# Distinct-value count for every object-dtype column.
mutualFunds_counts = mutualFunds.loc[:, mutualFunds.dtypes == "object"].nunique()
# Names of the columns that hold exactly one distinct value.
mutualFunds_counts_one = [
    column_name
    for column_name, distinct_count in mutualFunds_counts.items()
    if distinct_count == 1
]
print(mutualFunds_counts_one)

['quote_type', 'region', 'currency', 'exchange_code', 'exchange_name', 'exchange_timezone']


In [63]:
# Drop the columns that contain only one distinct value.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for columns that have already been removed.
mutualFunds = mutualFunds.drop(columns=mutualFunds_counts_one, errors="ignore")
mutualFunds

Unnamed: 0,fund_symbol,fund_short_name,fund_long_name,initial_investment,subsequent_investment,fund_category,fund_family,management_name,management_bio,management_start_date,...,peer_environment_avg,peer_environment_max,social_score,peer_social_min,peer_social_avg,peer_social_max,governance_score,peer_governance_min,peer_governance_avg,peer_governance_max
0,AAAAX,DWS RREEF Real Assets Fund - Cl,DWS RREEF Real Assets Fund - Class A,1000.0,50.0,World Allocation,DWS,John Vojticek,Co-Head of Liquid Real Assets / Chief Investme...,2015-01-15,...,5.05,10.58,7.43,5.98,9.07,11.30,5.43,4.26,7.14,8.11
1,AAAEX,AllianzGI Health Sciences Fund,Virtus AllianzGI Health Sciences Fund Class P,1000000.0,,Health,Virtus,Christopher Chin,,2020-08-27,...,1.43,3.27,12.96,9.52,12.87,15.08,8.40,4.96,7.68,10.30
2,AAAFX,,American Century One Choice Blend+ 2015 Portfo...,2500.0,50.0,Target-Date 2015,American Century Investments,Scott A. Wilson,"Mr. Wilson, Vice President and Portfolio Manag...",2021-03-10,...,,,,,,,,,,
3,AAAGX,Thrivent Large Cap Growth Fund,Thrivent Large Cap Growth Fund Class A,2000.0,50.0,Large Growth,Thrivent Funds,Lauri Brunner,Ms. Brunner has been with Thrivent Financial s...,2018-09-30,...,2.70,5.81,10.13,7.25,10.14,11.97,8.03,5.30,7.54,8.90
4,AAAHX,,American Century One Choice Blend+ 2015 Portfo...,5000000.0,,Target-Date 2015,American Century Investments,Scott A. Wilson,"Mr. Wilson, Vice President and Portfolio Manag...",2021-03-10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23778,ZVNIX,Fidelity Advisor Small Cap Valu,Fidelity Advisor Small Cap Value Fund Class M,50000.0,500.0,Large Growth,Fidelity Investments,Derek Janssen,Derek Janssen is portfolio manager of the Fide...,2015-08-31,...,2.70,5.81,10.53,7.25,10.14,11.97,7.25,5.30,7.54,8.90
23779,VHYAX,Capital World Bond Fund - Class,American Funds Capital World Bond Fund Class 5...,3000.0,1.0,Large Value,American Funds,Thomas HÃ¸gh,Thomas H. HÃ¸gh is a fixed income portfolio ma...,2016-02-25,...,5.00,9.11,10.83,7.04,10.48,12.42,7.84,5.83,7.71,9.30
23780,VIAAX,Capital World Growth and Income,American Funds Capital World Growth and Income...,3000.0,1.0,Foreign Large Growth,American Funds,Michael Alfonso Cohen,Michael Cohen is an equity portfolio manager a...,2016-02-25,...,3.87,6.22,8.53,7.25,8.96,11.96,7.70,5.79,7.42,8.78
23781,VIHAX,Templeton China World Cl R6,Templeton China World Fund Class R6,3000.0,1.0,Foreign Large Value,Franklin Templeton Investments,Michael B. Lai,He joined Franklin Templeton in August 2019. P...,2016-02-25,...,5.35,11.49,9.55,6.59,9.24,11.66,8.79,5.79,8.10,10.85


In [64]:
# Discard every row that has a missing value in at least one column, modifying
# mutualFunds in place. The surviving rows keep their original index labels.
# NOTE(review): many columns shown above contain NaN, so how="any" may leave
# very few rows — confirm this is intended.
mutualFunds.dropna(axis="index", how="any", inplace=True)
mutualFunds

Unnamed: 0,fund_symbol,fund_short_name,fund_long_name,initial_investment,subsequent_investment,fund_category,fund_family,management_name,management_bio,management_start_date,...,peer_environment_avg,peer_environment_max,social_score,peer_social_min,peer_social_avg,peer_social_max,governance_score,peer_governance_min,peer_governance_avg,peer_governance_max
