In [1]:
from zipfile import ZipFile
zip = ZipFile('Resources/archive.zip')
zip.extractall('Resources')

In [2]:
import pandas as pd
import hvplot.pandas
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import findspark
from pyspark.sql import SparkSession


In [3]:
#Display all the columns (to see which to drop)
pd.set_option("display.max_columns", None)

## Read In CSV data for Mutual Fund prices A-Z


In [4]:
# Initialize spark
findspark.init()

# Create a SparkSession
#spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
spark = SparkSession.builder.appName("SparkSQL").config("spark.driver.memory", "2g").getOrCreate()

In [5]:
#Read in CSV data 
#MutualFund prices A-E
df_AE= pd.read_csv(
    Path("Resources/MutualFund Prices - A-E.csv")
)
df_AE.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,AAAAX,2007-07-31,10.02
1,AAAAX,2007-08-01,9.98
2,AAAAX,2007-08-02,10.01
3,AAAAX,2007-08-03,9.9
4,AAAAX,2007-08-06,9.93


In [6]:
#Read in CSV data 
#MutualFund prices A-E
AE_path = "Resources/MutualFund Prices - A-E.csv"
df_AE= spark.read.csv(AE_path, header=True)
df_AE.head(5)

[Row(fund_symbol='AAAAX', price_date='2007-07-31', nav_per_share='10.02'),
 Row(fund_symbol='AAAAX', price_date='2007-08-01', nav_per_share='9.98'),
 Row(fund_symbol='AAAAX', price_date='2007-08-02', nav_per_share='10.01'),
 Row(fund_symbol='AAAAX', price_date='2007-08-03', nav_per_share='9.9'),
 Row(fund_symbol='AAAAX', price_date='2007-08-06', nav_per_share='9.93')]

In [7]:
#Read in CSV data 
#MutualFund prices F-K
df_FK= pd.read_csv(
    Path("Resources/MutualFund Prices - F-K.csv")
)
df_FK.head()

Unnamed: 0,fund_symbol,price_date,nav_per_share
0,FAAAX,2013-11-20,10.08
1,FAAAX,2013-11-21,10.13
2,FAAAX,2013-11-22,10.17
3,FAAAX,2013-11-25,10.18
4,FAAAX,2013-11-26,10.2


# Read in Mutual Fund informaiton csv


If we drop rows that have an `NaN`, then all data is removed. Instead, let's look at the columns that have missing data.

We plan to find the number of columns that have above a certain percentage of `NaN` and then remove those columns. We have chosen 60% so far.

# Binning!

Binning `fund_category`

Binning `fund_family_type_count`

Binning `esg_peer_group`

# Merge mutualFunds and mutual_fund_df on index= "fund_symbol"

## Read In CSV data for ETF prices and ETF information


In [26]:
# Read in ETF data 
# ETF Prices
ETF_prices_path = 'Resources/ETF prices.csv'
df_ETF_prices= spark.read.csv(ETF_prices_path, header=True, inferSchema=True)

# ETF Prices
ETFs_path = 'Resources/ETFs.csv'
df_ETFs= spark.read.csv(ETFs_path, header=True, inferSchema=True)


In [40]:
df_ETF_prices.head(5)

[Row(fund_symbol='AAA', price_date=datetime.date(2020, 9, 9), open=25.1, high=25.12, low=25.07, close=25.07, adj_close=24.85, volume=17300),
 Row(fund_symbol='AAA', price_date=datetime.date(2020, 9, 10), open=25.06, high=25.07, low=25.05, close=25.07, adj_close=24.85, volume=23500),
 Row(fund_symbol='AAA', price_date=datetime.date(2020, 9, 11), open=25.04, high=25.05, low=25.02, close=25.03, adj_close=24.81, volume=33400),
 Row(fund_symbol='AAA', price_date=datetime.date(2020, 9, 14), open=25.01, high=25.06, low=25.01, close=25.02, adj_close=24.8, volume=13100),
 Row(fund_symbol='AAA', price_date=datetime.date(2020, 9, 15), open=25.02, high=25.03, low=25.01, close=25.01, adj_close=24.79, volume=12100)]

In [28]:
df_ETFs.head(5)

[Row(fund_symbol='AAAU', quote_type='ETF', region='US', fund_short_name='DWS RREEF Real Assets Fund - Cl', fund_long_name='DWS RREEF Real Assets Fund - Class A', currency='USD', fund_category=None, fund_family='DWS', exchange_code='PCX', exchange_name='NYSEArca', exchange_timezone='America/New_York', avg_vol_3month=239238, avg_vol_10day=255410, total_net_assets=384448608, day50_moving_average=17.807, day200_moving_average=17.818, week52_high_low_change=2.75, week52_high_low_change_perc=0.14146, week52_high=19.44, week52_high_change=-1.72, week52_high_change_perc=-0.08848, week52_low=16.69, week52_low_change=1.03, week52_low_change_perc=0.06171, investment_strategy='"The investment seeks total return in excess of inflation through capital growth and current income. The fund will invest at least 80% of its net assets, plus the amount of any borrowings for investment purposes, in a combination of investments that the Advisor believes offer exposure to ""real assets."" It generally invests

In [29]:
df_ETF_prices.count()

3866030

In [33]:
df_ETFs.count()

2310

In [13]:
# Create temporary views for each ETF file/datafram
df_ETF_prices.createOrReplaceTempView('etfprices')
df_ETFs.createOrReplaceTempView('etfs')

In [22]:
spark.sql('SHOW COLUMNS in etfs').show()

+--------------------+
|            col_name|
+--------------------+
|         fund_symbol|
|          quote_type|
|              region|
|     fund_short_name|
|      fund_long_name|
|            currency|
|       fund_category|
|         fund_family|
|       exchange_code|
|       exchange_name|
|   exchange_timezone|
|      avg_vol_3month|
|       avg_vol_10day|
|    total_net_assets|
|day50_moving_average|
|day200_moving_ave...|
|week52_high_low_c...|
|week52_high_low_c...|
|         week52_high|
|  week52_high_change|
+--------------------+
only showing top 20 rows



In [17]:
spark.sql('SHOW COLUMNS in etfprices').show()

+-----------+
|   col_name|
+-----------+
|fund_symbol|
| price_date|
|       open|
|       high|
|        low|
|      close|
|  adj_close|
|     volume|
+-----------+



In [38]:
spark.sql('''
          DESCRIBE table etfs
          '''
).show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|         fund_symbol|   string|   NULL|
|          quote_type|   string|   NULL|
|              region|   string|   NULL|
|     fund_short_name|   string|   NULL|
|      fund_long_name|   string|   NULL|
|            currency|   string|   NULL|
|       fund_category|   string|   NULL|
|         fund_family|   string|   NULL|
|       exchange_code|   string|   NULL|
|       exchange_name|   string|   NULL|
|   exchange_timezone|   string|   NULL|
|      avg_vol_3month|   string|   NULL|
|       avg_vol_10day|   string|   NULL|
|    total_net_assets|   string|   NULL|
|day50_moving_average|   string|   NULL|
|day200_moving_ave...|   string|   NULL|
|week52_high_low_c...|   string|   NULL|
|week52_high_low_c...|   string|   NULL|
|         week52_high|   string|   NULL|
|  week52_high_change|   string|   NULL|
+--------------------+---------+-------+
only showing top

In [39]:
spark.sql('''
          DESCRIBE table etfprices
          '''
).show()

+-----------+---------+-------+
|   col_name|data_type|comment|
+-----------+---------+-------+
|fund_symbol|   string|   NULL|
| price_date|   string|   NULL|
|       open|   string|   NULL|
|       high|   string|   NULL|
|        low|   string|   NULL|
|      close|   string|   NULL|
|  adj_close|   string|   NULL|
|     volume|   string|   NULL|
+-----------+---------+-------+

