## Import statements

In [35]:
# Import Statements

import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

In [3]:
# Importing the warnings package to suppress the warnings
import warnings
warnings.filterwarnings("ignore")

## Load the csv data

In [26]:
# Read the stock CSV into a DataFrame, using the `ticker` column as the row index
stock_data_df = pd.read_csv(Path("./Data_Files") / "stock_data.csv", index_col="ticker")

In [27]:
# Preview the first five rows of the loaded data
stock_data_df.head(n=5)

Unnamed: 0_level_0,year,gross_profit_margin,ebit,return_on_sales,return_on_assets,return_on_equity,ebitda,eps
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BYDIY,2022,-94485750000.0,7854485000.0,0.073279,0.032593,0.072464,4937601000.0,41.0
ONTO,2023,-70950000.0,261100000.0,0.320027,0.063444,0.069771,193888000.0,2.47
COHR,2023,-1923534000.0,732102000.0,0.141877,-0.018923,-0.052021,50415000.0,-1.89
BYDIF,2022,-94485750000.0,7854485000.0,0.073279,0.032593,0.072464,4937601000.0,0.82
OLED,2023,305677000.0,303993000.0,0.527373,0.121639,0.140276,260591000.0,0.0043


## Prepare the data

Scale the data and make a new DataFrame

In [28]:
# Build a new frame without the `year` column, leaving only the columns that will be scaled
stock_data_to_be_scaled = stock_data_df.drop('year', axis="columns")

In [14]:
# Review the first rows of the dataframe that will be scaled
stock_data_to_be_scaled.head(3)

Unnamed: 0_level_0,gross_profit_margin,ebit,return_on_sales,return_on_assets,return_on_equity,ebitda,eps
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BYDIY,-94485750000.0,7854485000.0,0.073279,0.032593,0.072464,4937601000.0,41.0
ONTO,-70950000.0,261100000.0,0.320027,0.063444,0.069771,193888000.0,2.47
COHR,-1923534000.0,732102000.0,0.141877,-0.018923,-0.052021,50415000.0,-1.89


In [29]:
# Fit a StandardScaler on the metric columns and transform them in one pass
scaled_data = (
    StandardScaler()
    .fit(stock_data_to_be_scaled)
    .transform(stock_data_to_be_scaled)
)

# Wrap the scaled array in a DataFrame, reusing the original column labels
stock_data_scaled_df = pd.DataFrame(data=scaled_data, columns=stock_data_to_be_scaled.columns)

In [30]:
# Re-attach the tickers from the original data and use them as the index of the scaled dataframe
stock_data_scaled_df = (
    stock_data_scaled_df
    .assign(ticker=stock_data_df.index)
    .set_index('ticker')
)

# The `year` column is not added back, so the scaled dataframe holds only the scaled metric columns

In [31]:
# Review the first rows of the scaled data rather than displaying the entire frame
stock_data_scaled_df.head(10)

Unnamed: 0_level_0,gross_profit_margin,ebit,return_on_sales,return_on_assets,return_on_equity,ebitda,eps
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BYDIY,-0.150833,0.09169,0.050301,-0.239918,0.0837,0.10766,0.127035
ONTO,-0.119761,0.070826,0.230962,0.18742,0.072785,0.094481,0.087592
COHR,-0.120371,0.07212,0.100527,-0.953512,-0.420803,0.094083,0.083128
BYDIF,-0.150833,0.09169,0.050301,-0.239918,0.0837,0.10766,0.085903
OLED,-0.119637,0.070944,0.382774,0.993537,0.358524,0.094667,0.085068
MKSI,-0.120048,0.073326,0.23336,-3.488196,-3.228205,0.096093,0.05685
OMRNY,-0.148802,0.486603,0.123331,0.333607,0.200936,0.441183,0.466075
AMKR,-0.121257,0.074871,0.191798,0.044687,0.158048,0.097004,0.086558
SOTGY,-0.126315,0.08839,0.143391,0.084224,0.236858,0.107035,0.107615
FN,-0.120387,0.071091,0.095555,1.043288,0.474134,0.094814,0.092014


## K-means using the Original Data

### Find the best value for k

In [24]:
# Candidate cluster counts to evaluate
# range(1, 11) excludes its upper bound, so the values are 1 through 10
k = [*range(1, 11)]

In [25]:
# Start with an empty list that will collect one inertia value per k
inertia = list()

In [32]:
# Compute the inertia for each candidate value of k
# The list is reset here so that re-running this cell does not append a second
# set of values and leave `inertia` longer than `k`
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the scaled data in `stock_data_scaled_df`
# 3. Append the model.inertia_ to the inertia list
# NOTE(review): n_init is left at the scikit-learn default, which differs between releases - consider pinning it
inertia = []
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(stock_data_scaled_df)
    inertia.append(k_model.inertia_)

In [33]:
# Pair each k with its inertia value for the Elbow curve
elbow_data = dict(k=k, inertia=inertia)

# Turn the paired values into a DataFrame
df_elbow = pd.DataFrame(data=elbow_data)

# Display the Elbow curve data
df_elbow

Unnamed: 0,k,inertia
0,1,343.0
1,2,145.423561
2,3,86.180925
3,4,70.601998
4,5,19.650797
5,6,16.30426
6,7,10.819423
7,8,10.660169
8,9,6.866002
9,10,6.075525


In [36]:
# Draw inertia against k as a line chart, with one x-axis tick per tested k,
# so the elbow can be identified visually
elbow_curve_original_data = df_elbow.hvplot.line(
    title="Elbow Curve for the Original Data",
    x="k",
    y="inertia",
    xticks=k,
)

elbow_curve_original_data

### Build the K-Means model with 5 clusters

Based on the elbow curve above, the optimal number of clusters is probably 3 or 5, so we're going to build the model with 5 clusters.

In [37]:
# Set up the K-Means model with the chosen number of clusters (5)
# The random state is fixed so that repeated runs give the same result
model = KMeans(
    random_state=0,
    n_clusters=5,
)

In [38]:
# Train the K-Means model on the scaled stock data
model.fit(X=stock_data_scaled_df)

In [39]:
# Assign each stock in the scaled data to one of the fitted clusters
kmeans_predictions = model.predict(X=stock_data_scaled_df)

# Show the cluster label assigned to each row
print(kmeans_predictions)

[3 3 3 3 0 2 3 3 3 0 3 3 0 3 3 3 3 0 3 3 3 0 3 0 0 3 0 3 0 3 0 3 0 0 2 3 1
 3 3 3 3 3 4 3 3 3 3 3 3]


In [40]:
# Build a new dataframe from the scaled data with the predicted clusters in a `cluster` column
# (`assign` returns a new frame, so `stock_data_scaled_df` itself is left unchanged)
stock_data_scaled_with_predictions_df = stock_data_scaled_df.assign(
    cluster=kmeans_predictions
)

# Preview the first rows of the result
stock_data_scaled_with_predictions_df.head()

Unnamed: 0_level_0,gross_profit_margin,ebit,return_on_sales,return_on_assets,return_on_equity,ebitda,eps,cluster
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BYDIY,-0.150833,0.09169,0.050301,-0.239918,0.0837,0.10766,0.127035,3
ONTO,-0.119761,0.070826,0.230962,0.18742,0.072785,0.094481,0.087592,3
COHR,-0.120371,0.07212,0.100527,-0.953512,-0.420803,0.094083,0.083128,3
BYDIF,-0.150833,0.09169,0.050301,-0.239918,0.0837,0.10766,0.085903,3
OLED,-0.119637,0.070944,0.382774,0.993537,0.358524,0.094667,0.085068,0


In [47]:
# Scatter plot of scaled EPS against scaled gross profit margin using hvPlot
# Points are grouped by the cluster label found using K-Means
# The ticker is passed in `hover_cols` to identify the stock represented by each data point
scatter_plot_original_data = stock_data_scaled_with_predictions_df.hvplot.scatter(
    by="cluster",
    x="gross_profit_margin",
    y="eps",
    hover_cols="ticker",
).opts(
    title='Scatter Plot of the Clusters Based on the Original Data',
    yformatter="%.0f",
)

scatter_plot_original_data