In [None]:
# Stock classification using k-means clustering

### Introducing the k-means algorithm
The term k-means was first used by MacQueen in 1967, although the idea dates back to Steinhaus in 1957. K-means is an unsupervised classification (clustering) algorithm that groups objects into k groups based on their characteristics.

Clustering is done by minimizing the sum of distances between each object and the centroid of its group or cluster. Quadratic distance is often used. The algorithm consists of three steps:

1. Initialization: once the number of groups, k, has been chosen, k centroids are established in the data space, for example, choosing them randomly.
2. Assign objects to centroids: each data object is assigned to its nearest centroid.
3. Centroid update: the position of the centroid of each group is updated, taking as the new centroid the position of the average of the objects belonging to said group.

    Steps 2 and 3 are repeated until the centroids do not move, or move below a threshold distance at each step.

### Clustering of stocks by return and volatility
We analyze the S&P 500 index to cluster stocks based on return and volatility. This index comprises 500 large-cap US companies from various sectors, traded on NYSE or Nasdaq. Due to its representation of the US’s largest publicly traded firms, it serves as a suitable dataset for algorithmic k-means clustering.

In [1]:
#Import the libraries that we are going to need to carry out the analysis:
import numpy as np 
import pandas as pd
import pandas_datareader as dr
import yfinance as yf

from pylab import plot,show
from matplotlib import pyplot as plt
import plotly.express as px

from numpy.random import rand
from scipy.cluster.vq import kmeans,vq
from math import sqrt
from sklearn.cluster import KMeans 
from sklearn import preprocessing

Then, we create a null matrix with different combinations of moving averages using a Pandas DataFrame:

### Load Data
We calculate the annual average return and volatility for each company by obtaining their adjusted closing prices during 01/02/2020–12/02/2022 and inserting them into a dataframe, which is then annualized (assuming 252 market days per year).

In [5]:
# Define the url
sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Read in the url and scrape ticker data
data_table = pd.read_html(sp500_url)
tickers = data_table[0]['Symbol'].values.tolist()
tickers = [s.replace('\n', '') for s in tickers]
tickers = [s.replace('.', '-') for s in tickers]
tickers = [s.replace(' ', '') for s in tickers]

# Download prices
prices_list = []
print(prices_list)
for ticker in tickers:
    try:
        prices = dr.DataReader(ticker,'yahoo','01/01/2020')['Adj Close']
        prices = pd.DataFrame(prices)
        prices.columns = [ticker]
        prices_list.append(prices)
    except:
        pass
    prices_df = pd.concat(prices_list,axis=1)
prices_df.sort_index(inplace=True)



# Create an empity dataframe
returns = pd.DataFrame()

# Define the column Returns
returns['Returns'] = prices_df.pct_change().mean() * 252

# Define the column Volatility
returns['Volatility'] = prices_df.pct_change().std() * sqrt(252)

[]


In [14]:
import datetime
import yfinance as yf

# Read tickers from the file
with open('nifty50_symbols.txt', 'r') as file:
    tickers = [line.strip() for line in file]

print(tickers)


['ADANIPORTS.NS', 'ASIANPAINT.NS', 'AXISBANK.NS', 'BAJAJ-AUTO.NS', 'BAJFINANCE.NS', 'BAJAJFINSV.NS', 'BHARTIARTL.NS', 'BRITANNIA.NS', 'CIPLA.NS', 'COALINDIA.NS', 'DIVISLAB.NS', 'DRREDDY.NS', 'EICHERMOT.NS', 'GRASIM.NS', 'HCLTECH.NS', 'HDFC.NS', 'HDFCBANK.NS', 'HEROMOTOCO.NS', 'HINDALCO.NS', 'HINDUNILVR.NS', 'ICICIBANK.NS', 'IOC.NS', 'INDUSINDBK.NS', 'INFY.NS', 'ITC.NS', 'JSWSTEEL.NS', 'KOTAKBANK.NS', 'LT.NS', 'M&M.NS', 'MARUTI.NS', 'NESTLEIND.NS', 'NTPC.NS', 'ONGC.NS', 'POWERGRID.NS', 'RELIANCE.NS', 'SHREECEM.NS', 'SBIN.NS', 'SBILIFE.NS', 'SUNPHARMA.NS', 'TCS.NS', 'TATACONSUM.NS', 'TATAMOTORS.NS', 'TATASTEEL.NS', 'TECHM.NS', 'TITAN.NS', 'ULTRACEMCO.NS', 'UBL.NS', 'WIPRO.NS']


In [21]:
# Download prices
prices_list = []
for ticker in tickers:
    try:
        prices = dr.DataReader(ticker,'yahoo','01/01/2020')['Adj Close']
        prices = pd.DataFrame(prices)
        prices.columns = [ticker]
        prices_list.append(prices)
    finally:
        print(prices_list)
#     except:
#         pass
#     prices_df = pd.concat(prices_list,axis=1)
# prices_df.sort_index(inplace=True)

# prices_df.head()



[]


TypeError: string indices must be integers