In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC as svc
from sklearn import preprocessing
from sklearn.cluster import KMeans as kgroup

In [2]:
df = pd.read_csv("stock_XY_train.csv")
x = df.iloc[:, 0:-1]
y = df.iloc[:,-1]

#organize dataframe
sector=x.loc[:, "Sector"]
x.drop(labels=["Sector"], axis=1, inplace=True)
x.insert(2,"Sector",sector)
x = x.drop(['Unnamed: 0'], axis=1)

#making dummy val for string data
sectors=x["Sector"].unique()
sectorsDict=dict(zip(sectors,range(1,len(sectors)+1)))
stonks=x["Ticker"].unique()
stonksDict=dict(zip(stonks,range(1,len(stonks)+1)))

x["Ticker"]=x["Ticker"].map(stonksDict)
x["Sector"]=x["Sector"].map(sectorsDict)

x.tail()

Unnamed: 0,Ticker,Sector,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,...,5Y Dividend per Share Growth (per Share),3Y Dividend per Share Growth (per Share),Receivables growth,Inventory Growth,Asset Growth,Book Value per Share Growth,Debt Growth,R&D Expense Growth,SG&A Expenses Growth,Yr
12374,4472,3,,,,,,,,,...,,,,,,,,,,16
12375,4831,3,4891000.0,,,4675000.0,,,,4072000.0,...,,,,,,,,,,16
12376,4221,6,9115000000.0,0.0354,5518600000.0,3596400000.0,0.0,983100000.0,2036400000.0,1560000000.0,...,0.0826,0.0843,0.0349,0.0,0.022,0.0229,0.0668,0.0,0.0699,15
12377,4715,1,164995000.0,0.0,0.0,164995000.0,143633000.0,29285000.0,172918000.0,-7923000.0,...,0.0,0.0,0.0,0.0,0.6,0.0274,0.0,0.4398,0.049,14
12378,3052,3,6532229000.0,0.1777,0.0,6532229000.0,0.0,3008006000.0,3144757000.0,3387472000.0,...,0.0778,0.0717,0.0,0.0,0.1964,0.1946,-0.0063,0.0,0.1417,17


In [3]:
#narrow down to top 20 cols based on amount of nulls
empAmount=x.isnull().sum(axis=0).tolist()
empAmount.sort()
top_20=empAmount[20]

#selecting columns to use based on numbers of nulls 
Nempty_cols=np.array(x.isnull().sum(axis=0).tolist())
useCol=np.where(Nempty_cols<=top_20)[0]
x=x.iloc[:,useCol]

#filling in null values based on mean of each cols
cols=x.columns.tolist()
col_mean=x.loc[:,cols].mean(axis=0).tolist()
replaceDict=dict(zip(cols,col_mean))
x=x.fillna(value=replaceDict)

x.tail()

Unnamed: 0,Ticker,Sector,Revenue,Gross Profit,Operating Income,Net Income Com,EPS,EPS Diluted,Gross Margin,Earnings Before Tax Margin,...,Receivables,Total assets,Total liabilities,Retained earnings (deficit),Total shareholders equity,Operating Cash Flow,Investing Cash flow,Financing Cash Flow,Net cash flow / Change in cash,Yr
12374,4472,3,5002385000.0,1915068000.0,579177600.0,376628200.0,-10452.598897,-10578.98441,0.479493,-7.079486,...,967498900.0,21330210000.0,16900890000.0,2018798000.0,3586480000.0,1040167000.0,-700332700.0,4090554.0,155712000.0,16
12375,4831,3,4891000.0,4675000.0,4072000.0,376628200.0,0.83,0.83,0.9558,0.8186,...,1431000.0,21330210000.0,16900890000.0,102000.0,3586480000.0,2000000.0,-700332700.0,-3000000.0,-1000000.0,16
12376,4221,6,9115000000.0,3596400000.0,1560000000.0,749900000.0,2.14,2.13,0.3946,0.1311,...,962900000.0,20535900000.0,12759300000.0,3138300000.0,7774100000.0,1679700000.0,-1482800000.0,-239700000.0,-42800000.0,15
12377,4715,1,164995000.0,164995000.0,-7923000.0,-17416000.0,-0.36,-0.36,1.0,-0.1044,...,3006000.0,369144000.0,159672000.0,-467212000.0,209472000.0,117715000.0,117853000.0,3723000.0,239291000.0,14
12378,3052,3,6532229000.0,6532229000.0,3387472000.0,2164161000.0,1.2771,1.2618,1.0,0.5181,...,0.0,139631600000.0,123801900000.0,5612873000.0,15801240000.0,5840798000.0,-24304370000.0,19311260000.0,816769100.0,17


In [4]:
#scaling features
scaler=preprocessing.MaxAbsScaler()
x_scaled=scaler.fit_transform(x)
num_row=x.shape[0]
num_col=x.shape[1]
print(f'rows={num_row}')
print(f'columns={num_col}')
print(x_scaled.shape)
print(x_scaled[0].shape)
print(x_scaled[0:3])

rows=12379
columns=21
(12379, 21)
(21,)
[[ 2.06996481e-04  9.09090909e-02  1.39235958e-06  1.42807686e-07
  -7.14577718e-05 -4.84108771e-05 -2.06339596e-07 -2.06339596e-07
   3.49842773e-04 -4.97874135e-04  1.46974220e-05  9.49397590e-07
   6.33786063e-07  1.34228441e-07 -4.33444300e-05  7.10228726e-06
  -7.18541667e-06 -5.21978022e-07  2.46249855e-05  5.78238866e-06
   8.82352941e-01]
 [ 4.13992962e-04  1.81818182e-01  1.01310892e-02  1.39670244e-02
   2.88079513e-02  8.38899644e-03  2.14977981e-08  2.13996346e-08
   4.70269419e-03  1.81708331e-05  9.63364993e-04  4.69156627e-03
   3.45006174e-03  2.63355986e-03  2.31799103e-02  1.42768937e-02
   6.05729167e-03 -3.52087912e-02  1.24811039e-03  5.16194332e-05
   8.23529412e-01]
 [ 6.20989443e-04  2.72727273e-01  1.92532937e-04  3.28511771e-04
  -9.16297252e-05 -4.92912302e-05 -1.57061539e-09 -1.57061539e-09
   5.82084552e-03 -3.66986133e-06  1.55799186e-04  0.00000000e+00
   4.89287557e-05  3.93198841e-05 -5.37542819e-04  1.78781832e-0

In [5]:
#reorganized data again  and turn it into a dataframe
values=[]
for i in range(len(cols)):
    values.append(x_scaled[:,i])
print(values[0])
data=dict(zip(cols,values))
x_clean=pd.DataFrame(data)
print(x_clean.info())
x_clean.head()

[2.06996481e-04 4.13992962e-04 6.20989443e-04 ... 8.73732147e-01
 9.75988408e-01 6.31753260e-01]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12379 entries, 0 to 12378
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Ticker                          12379 non-null  float64
 1   Sector                          12379 non-null  float64
 2   Revenue                         12379 non-null  float64
 3   Gross Profit                    12379 non-null  float64
 4   Operating Income                12379 non-null  float64
 5   Net Income Com                  12379 non-null  float64
 6   EPS                             12379 non-null  float64
 7   EPS Diluted                     12379 non-null  float64
 8   Gross Margin                    12379 non-null  float64
 9   Earnings Before Tax Margin      12379 non-null  float64
 10  Cash and cash equivalents       12379 non-null  float64
 

Unnamed: 0,Ticker,Sector,Revenue,Gross Profit,Operating Income,Net Income Com,EPS,EPS Diluted,Gross Margin,Earnings Before Tax Margin,...,Receivables,Total assets,Total liabilities,Retained earnings (deficit),Total shareholders equity,Operating Cash Flow,Investing Cash flow,Financing Cash Flow,Net cash flow / Change in cash,Yr
0,0.000207,0.090909,1e-06,1.428077e-07,-7.1e-05,-4.8e-05,-2.063396e-07,-2.063396e-07,0.00035,-0.000498,...,9.493976e-07,6.337861e-07,1.342284e-07,-4.3e-05,7e-06,-7e-06,-5.21978e-07,2.5e-05,6e-06,0.882353
1,0.000414,0.181818,0.010131,0.01396702,0.028808,0.008389,2.14978e-08,2.139963e-08,0.004703,1.8e-05,...,0.004691566,0.003450062,0.00263356,0.02318,0.014277,0.006057,-0.03520879,0.001248,5.2e-05,0.823529
2,0.000621,0.272727,0.000193,0.0003285118,-9.2e-05,-4.9e-05,-1.570615e-09,-1.570615e-09,0.005821,-4e-06,...,0.0,4.892876e-05,3.931988e-05,-0.000538,0.000179,0.000219,-0.0008655714,-0.000121,-9e-06,1.0
3,0.000828,0.272727,1.9e-05,7.340531e-05,9e-05,3.9e-05,6.576952e-09,6.576952e-09,0.013455,4.6e-05,...,0.0,4.813635e-05,4.358869e-05,0.000103,0.000116,1.7e-05,-0.000248978,5.9e-05,2e-06,0.882353
4,0.001035,0.363636,0.00171,0.003019653,0.003532,0.001731,4.976888e-08,4.957255e-08,0.006025,1.9e-05,...,0.001326887,0.0001973267,0.0001285108,0.009211,0.00111,0.000573,-0.001953093,-6.7e-05,0.000144,0.941176


In [40]:
#organizing my data into clusters
clusters=kgroup(n_clusters=3).fit(x_clean)
groups=clusters.labels_

In [43]:
print(len(groups))
print(np.unique(groups))
group0=np.where(groups==0)
group1=np.where(groups==1)
group2=np.where(groups==2)
print(group0)
print(len(group0[0]))
print(len(group1[0]))
print(len(group2[0]))

12379
[0 1 2]
(array([ 2917,  2937,  2945, ..., 12376, 12377, 12378]),)
4675
3488
4216


In [51]:

x0=x_clean.iloc[group0[0]]
x1=x_clean.iloc[group1[0]]
x2=x_clean.iloc[group2[0]]

y0=y.iloc[group0[0]]
y1=y.iloc[group1[0]]
y2=y.iloc[group2[0]]

x0_train=x0.iloc[:int(len(x0)/2)]
x1_train=x1.iloc[:int(len(x1)/2)]
x2_train=x2.iloc[:int(len(x2)/2)]

x0_test=x0.iloc[int(len(x0)/2):]
x1_test=x1.iloc[int(len(x1)/2):]
x2_test=x2.iloc[int(len(x2)/2):]

y0_train=y0.iloc[:int(len(x0)/2)]
y1_train=y1.iloc[:int(len(x1)/2)]
y2_train=y2.iloc[:int(len(x2)/2)]

y0_test=y0.iloc[int(len(x0)/2):]
y1_test=y1.iloc[int(len(x1)/2):]
y2_test=y2.iloc[int(len(x2)/2):]

In [None]:
def clean(x,y):

In [69]:
def makeModel(x,y,c=1,ker="rbf",n=3):
    models=[]
    if(ker=="poly"):
        for i in range(len(x)):
            clf=svc(C=c,kernel=ker,degree=n)
            model=clf.fit(x[i],y[i])
            print(model.score(x[i],y[i]))
            models.append(model)
    else:
        for i in range(len(x)):
            clf=svc(C=c,kernel=ker)
            model=clf.fit(x[i],y[i])
            print(model.score(x[i],y[i]))
            models.append(model)
    return models

def modelTest(x,y,models):
    for i in range(len(models)):
        print(f"cluster{i}={models[i].score(x[i],y[i])}")

In [70]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=1)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5447154471544715
0.5538990825688074
0.642314990512334
cluster0=0.5447154471544715
cluster1=0.5538990825688074
cluster2=0.642314990512334


In [71]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=2)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5464270432178006
0.5533256880733946
0.6461100569259962
cluster0=0.5464270432178006
cluster1=0.5533256880733946
cluster2=0.6461100569259962


In [72]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=3)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5507060333761232
0.5538990825688074
0.6461100569259962
cluster0=0.5507060333761232
cluster1=0.5538990825688074
cluster2=0.6461100569259962


In [73]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=4)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.553701326486949
0.5567660550458715
0.6451612903225806
cluster0=0.553701326486949
cluster1=0.5567660550458715
cluster2=0.6451612903225806


In [74]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=6)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5631151048352588
0.5567660550458715
0.6513282732447818
cluster0=0.5631151048352588
cluster1=0.5567660550458715
cluster2=0.6513282732447818


In [77]:
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=9)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.6050492083868207
0.5602064220183486
0.655123339658444
cluster0=0.6050492083868207
cluster1=0.5602064220183486
cluster2=0.655123339658444


In [79]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=1,ker="poly")
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5464270432178006
0.5533256880733946
0.5592979127134725
cluster0=0.5464270432178006
cluster1=0.5533256880733946
cluster2=0.5592979127134725
CPU times: user 803 ms, sys: 0 ns, total: 803 ms
Wall time: 800 ms


In [80]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=3,ker="poly")
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5498502353444588
0.555045871559633
0.6162239089184061
cluster0=0.5498502353444588
cluster1=0.555045871559633
cluster2=0.6162239089184061
CPU times: user 801 ms, sys: 4 ms, total: 805 ms
Wall time: 802 ms


In [83]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=8,ker="poly")
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.5515618314077878
0.5561926605504587
0.6446869070208728
cluster0=0.5515618314077878
cluster1=0.5561926605504587
cluster2=0.6446869070208728
CPU times: user 811 ms, sys: 0 ns, total: 811 ms
Wall time: 810 ms


In [85]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=8,ker="poly",n=8)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.6337184424475824
0.599197247706422
0.6427893738140418
cluster0=0.6337184424475824
cluster1=0.599197247706422
cluster2=0.6427893738140418
CPU times: user 1.13 s, sys: 0 ns, total: 1.13 s
Wall time: 1.12 s


In [86]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=8,ker="poly",n=10)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.6401369276850664
0.6215596330275229
0.6503795066413662
cluster0=0.6401369276850664
cluster1=0.6215596330275229
cluster2=0.6503795066413662
CPU times: user 1.39 s, sys: 0 ns, total: 1.39 s
Wall time: 1.38 s


In [87]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=8,ker="poly",n=16)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.6499786050492083
0.6290137614678899
0.6612903225806451
cluster0=0.6499786050492083
cluster1=0.6290137614678899
cluster2=0.6612903225806451
CPU times: user 8.43 s, sys: 3.76 ms, total: 8.43 s
Wall time: 8.43 s


In [88]:
%%time
models=makeModel([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],c=8,ker="poly",n=num_col+1)
modelTest([x0_train,x1_train,x2_train], [y0_train,y1_train,y2_train],models)

0.660676080445015
0.6387614678899083
0.6665085388994307
cluster0=0.660676080445015
cluster1=0.6387614678899083
cluster2=0.6665085388994307
CPU times: user 2min 6s, sys: 0 ns, total: 2min 6s
Wall time: 2min 6s
