Skip to content

Commit 586c550

Browse files
committed
Add k means script
1 parent 1cb5f86 commit 586c550

File tree

9 files changed

+154
-21
lines changed

9 files changed

+154
-21
lines changed

huntsDT/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Hunt's algorithm to build Decision Trees
2+
3+
Builds a decision tree by choosing each split based on the Gini index. It also outputs the intermediate Gini values to explain why the algorithm chose a given attribute.

huntsDT/huntDS.py

Whitespace-only changes.

huntsDT/huntDT_driver.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import argparse
2+
from huntDS import *
3+
4+
def main(argv=None):
    """Parse the command-line options of the Hunt's algorithm walkthrough.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. When ``None`` (the default), argparse reads
            ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse on ``-h`` or on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Hunt's algorithm walkthrough. By Roundofthree.")
    parser.add_argument("-v", action='store_true', help="Print the intermediate GINI indexes.")
    parser.add_argument("--table_csv", type=str, help="File path to a .csv file with the records.")
    # NOTE(review): the parsed options (-v, --table_csv) are validated here
    # but nothing consumes them yet; no tree-building call is visible in this
    # driver -- confirm what should be invoked from huntDS.
    parser.parse_args(argv)


if __name__ == '__main__':
    main()

k-means/Kmeans-driver.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from Kmeans import *
2+
import sys
3+
import csv
4+
5+
def _read_int(prompt):
    """Prompt until the user types a valid integer and return it."""
    while True:
        try:
            return int(input(prompt).strip())
        except ValueError:
            prompt = "Please enter a whole number: "


def _read_point(prompt):
    """Prompt for comma-separated integers.

    Returns an empty list when the input cannot be parsed, so that the
    caller's length check rejects it and asks again.
    """
    try:
        return [int(value) for value in input(prompt).strip().split(",")]
    except ValueError:
        return []


def main():
    """Run an interactive K-Means session on the records of a CSV file.

    The CSV path is taken from the first command-line argument. Every row
    is parsed as a list of integers. The user is then asked for ``k`` and
    for ``k`` initial centroids, and iterations are run (printing the table
    after each one) until an iteration reports no changes.

    Raises:
        SystemExit: with status 1 when the argument is missing or the CSV
            contains no usable records.
        ValueError: when a CSV cell is not an integer.
    """
    # Only the input table is needed; the usage text never mentioned a
    # second argument and nothing was ever written to an output file.
    if len(sys.argv) < 2:
        print("\nError")
        print("\nUsage: > Kmeans-driver.py <table.csv>")
        sys.exit(1)

    input_file = sys.argv[1]

    # newline='' is what the csv module expects for files it reads; blank
    # rows (e.g. a trailing empty line) are skipped instead of becoming
    # zero-attribute records.
    with open(input_file, 'r', newline='') as csv_file:
        records = [
            [int(value) for value in row]
            for row in csv.reader(csv_file, delimiter=',')
            if row
        ]

    if not records:
        print(f"\nError: no records found in {input_file}")
        sys.exit(1)

    n_records = len(records)
    n_attributes = len(records[0])
    if any(len(record) != n_attributes for record in records):
        print(f"\nError: every record must have {n_attributes} attributes")
        sys.exit(1)

    # ask for k
    k = _read_int("Enter k: ")
    while not 1 <= k <= n_records:
        k = _read_int(f"k must be between 1 and {n_records}: ")

    # ask for k initial centroids
    print(f"Enter {k} points with {n_attributes} attributes, each separated by a single comma:")
    centroids = []
    for i in range(k):
        point = _read_point(f"{i}-> ")
        while len(point) != n_attributes:
            point = _read_point(f"Try again: {i}-> ")
        centroids.append(point)

    kmeans = KMeans(records=records, k=k, centroids=centroids)
    while True:
        n_changes = kmeans.run_iteration()
        kmeans.print_table()
        if n_changes == 0:
            break


if __name__ == '__main__':
    main()
45+

k-means/Kmeans.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import numpy as np
2+
3+
# Euclid distance
4+
def euclid(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Args:
        a: Sequence of numeric coordinates.
        b: Sequence of numeric coordinates with the same length as ``a``.

    Returns:
        The square root of the summed squared coordinate differences, as a
        float (``0.0`` for two empty points).

    Raises:
        ValueError: if ``a`` and ``b`` have different lengths.
    """
    # Imported locally because the module only imports numpy at file level.
    from math import sqrt

    if len(a) != len(b):
        raise ValueError(
            f"points must have the same length, got {len(a)} and {len(b)}"
        )
    return sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
11+
12+
# Manhattan distance
13+
def manhattan(a, b):
    """Return the Manhattan (L1) distance between points ``a`` and ``b``.

    Args:
        a: Sequence of numeric coordinates.
        b: Sequence of numeric coordinates with the same length as ``a``.

    Returns:
        The sum of the absolute coordinate differences (``0`` for two empty
        points).

    Raises:
        ValueError: if ``a`` and ``b`` have different lengths.
    """
    if len(a) != len(b):
        raise ValueError(
            f"points must have the same length, got {len(a)} and {len(b)}"
        )
    return sum(abs(x - y) for x, y in zip(a, b))
20+
21+
class KMeans:
    """Step-by-step K-Means over a small in-memory table of records.

    Each record is a list of numeric attributes. On construction a trailing
    "cluster id" column is appended to every record in place; it holds
    ``None`` until the first call to :meth:`run_iteration` assigns a cluster.

    Attributes:
        iteration_n: 1-based number of the next iteration to run.
        records: The records, each ending with its cluster id column.
        k: Number of clusters requested by the caller.
        centroids: Current centroids, updated in place by each iteration.
        distance: Callable ``(a, b) -> number`` used to compare points.
    """

    def __init__(self, records=None, k=0, centroids=None, metric="EUCLID"):
        """Store the data set and pick the distance function.

        Args:
            records: List of records (lists of numbers). Mutated in place:
                a cluster id column is appended to each record.
            k: Number of clusters.
            centroids: List of initial centroid points.
            metric: ``"EUCLID"`` selects ``euclid``; any other value
                selects ``manhattan``.
        """
        self.iteration_n = 1
        # None defaults avoid sharing one list between instances.
        self.records = [] if records is None else records
        self.k = k
        self.centroids = [] if centroids is None else centroids
        self.distance = euclid if metric == "EUCLID" else manhattan
        for r in self.records:
            # Cluster id column. None (not 0) marks "unassigned" so the
            # first iteration counts every assignment as a change.
            r.append(None)

    def run_iteration(self):
        """Assign each record to its nearest centroid, then recompute means.

        Returns:
            The number of records whose cluster id changed in this
            iteration; ``0`` means the assignment has converged.

        Raises:
            ValueError: if there are no centroids.
        """
        if not self.centroids:
            raise ValueError("at least one centroid is required")

        n_clusters = len(self.centroids)
        n_changes = 0
        for p in self.records:
            point = p[:-1]  # strip the cluster id column
            # min() keeps the first of equally distant centroids.
            chosen = min(
                range(n_clusters),
                key=lambda i: self.distance(point, self.centroids[i]),
            )
            if p[-1] != chosen:
                n_changes += 1
                p[-1] = chosen

        # Recompute every centroid as the mean of the records assigned to it.
        n_attributes = len(self.centroids[0])
        counts = [0] * n_clusters
        sums = [[0] * n_attributes for _ in range(n_clusters)]
        for p in self.records:
            cluster = p[-1]
            counts[cluster] += 1
            for j, value in enumerate(p[:-1]):
                sums[cluster][j] += value

        for centroid, total, count in zip(self.centroids, sums, counts):
            if count == 0:
                # An empty cluster has no mean; keep its centroid in place
                # instead of dividing by zero.
                continue
            for j in range(len(centroid)):
                centroid[j] = total[j] / count

        self.iteration_n += 1
        return n_changes

    def print_table(self):
        """Print one line per record: point, distances, and cluster id.

        Distances are measured to the current centroids, in centroid order.
        """
        for r in self.records:
            point = r[:-1]
            coordinates = ", ".join(str(value) for value in point)
            distances = "".join(
                f"{self.distance(point, c)} | " for c in self.centroids
            )
            print(f"({coordinates}) -> {distances}Cluster {r[-1]}")

k-means/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# K-Means
2+
3+
Given k and k manually selected initial centroids, runs K-Means and displays the resulting table after each iteration.
4+
2.32 MB
Loading

watermark_removal/light_watermark_remover.ipynb

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
},
3131
{
3232
"cell_type": "code",
33-
"execution_count": 77,
33+
"execution_count": 85,
3434
"metadata": {},
3535
"outputs": [],
3636
"source": [
@@ -40,19 +40,19 @@
4040
},
4141
{
4242
"cell_type": "code",
43-
"execution_count": 78,
43+
"execution_count": 86,
4444
"metadata": {},
4545
"outputs": [],
4646
"source": [
4747
"# image = \"./images/test1.jpg\"\n",
48-
"image = \"./images/test2-nored.jpg\"\n",
49-
"highlighted_image = \"./images/test2.jpg\"\n",
50-
"output = \"./images/removed2.png\""
48+
"image = \"./images/test3-nored.jpg\"\n",
49+
"highlighted_image = \"./images/test3.jpg\"\n",
50+
"output = \"./images/removed3.png\""
5151
]
5252
},
5353
{
5454
"cell_type": "code",
55-
"execution_count": 79,
55+
"execution_count": 87,
5656
"metadata": {},
5757
"outputs": [],
5858
"source": [
@@ -81,7 +81,7 @@
8181
},
8282
{
8383
"cell_type": "code",
84-
"execution_count": 80,
84+
"execution_count": 89,
8585
"metadata": {},
8686
"outputs": [],
8787
"source": [
@@ -91,7 +91,7 @@
9191
"# test1 \n",
9292
"# x, y, w, h = 600, 1400, 250, 35 \n",
9393
"# test2 \n",
94-
"x, y, w, h = 580, 1480, 470, 70 \n",
94+
"x, y, w, h = 50, 1480, 470, 70 \n",
9595
"# test3 \n",
9696
"# x, y, w, h = 600, 1400, 250, 35 \n",
9797
"\n",
@@ -129,7 +129,7 @@
129129
},
130130
{
131131
"cell_type": "code",
132-
"execution_count": 81,
132+
"execution_count": 90,
133133
"metadata": {},
134134
"outputs": [],
135135
"source": [
@@ -144,7 +144,7 @@
144144
},
145145
{
146146
"cell_type": "code",
147-
"execution_count": 82,
147+
"execution_count": 91,
148148
"metadata": {},
149149
"outputs": [],
150150
"source": [
@@ -173,7 +173,7 @@
173173
},
174174
{
175175
"cell_type": "code",
176-
"execution_count": 83,
176+
"execution_count": 92,
177177
"metadata": {},
178178
"outputs": [],
179179
"source": [
@@ -201,7 +201,7 @@
201201
"\n",
202202
" display(\"Output2\", fI)\n",
203203
"\n",
204-
" # predict the color from the left and top neighbor \n",
204+
" # predict the color from the left and top neighbor \n",
205205
" for row in range(y, y+h):\n",
206206
" for col in range(x, x+w):\n",
207207
" if fI[row, col][0] == 0 and fI[row, col][1] == 0 and fI[row, col][2] == 0:\n",
@@ -211,6 +211,7 @@
211211
"\n",
212212
" display(\"Output test\", fI)\n",
213213
"\n",
214+
" # blur again \n",
214215
" fI[y:y+h, x:x+w, :] = cv.medianBlur(fI[y:y+h, x:x+w, :], 7) \n",
215216
" fI[fI<0] = 0\n",
216217
" fI[fI>255] = 255\n",
@@ -222,7 +223,7 @@
222223
},
223224
{
224225
"cell_type": "code",
225-
"execution_count": 84,
226+
"execution_count": 93,
226227
"metadata": {
227228
"tags": []
228229
},

watermark_removal/watermark_remover.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
},
3131
{
3232
"cell_type": "code",
33-
"execution_count": 14,
33+
"execution_count": 28,
3434
"metadata": {},
3535
"outputs": [],
3636
"source": [
@@ -40,7 +40,7 @@
4040
},
4141
{
4242
"cell_type": "code",
43-
"execution_count": 15,
43+
"execution_count": 29,
4444
"metadata": {},
4545
"outputs": [],
4646
"source": [
@@ -51,7 +51,7 @@
5151
},
5252
{
5353
"cell_type": "code",
54-
"execution_count": 16,
54+
"execution_count": 30,
5555
"metadata": {},
5656
"outputs": [],
5757
"source": [
@@ -80,7 +80,7 @@
8080
},
8181
{
8282
"cell_type": "code",
83-
"execution_count": 17,
83+
"execution_count": 31,
8484
"metadata": {},
8585
"outputs": [],
8686
"source": [
@@ -135,7 +135,7 @@
135135
},
136136
{
137137
"cell_type": "code",
138-
"execution_count": 22,
138+
"execution_count": 32,
139139
"metadata": {},
140140
"outputs": [
141141
{
@@ -164,7 +164,7 @@
164164
},
165165
{
166166
"cell_type": "code",
167-
"execution_count": 23,
167+
"execution_count": 33,
168168
"metadata": {},
169169
"outputs": [],
170170
"source": [
@@ -193,7 +193,7 @@
193193
},
194194
{
195195
"cell_type": "code",
196-
"execution_count": 24,
196+
"execution_count": 34,
197197
"metadata": {},
198198
"outputs": [],
199199
"source": [
@@ -226,7 +226,7 @@
226226
},
227227
{
228228
"cell_type": "code",
229-
"execution_count": 27,
229+
"execution_count": 35,
230230
"metadata": {},
231231
"outputs": [],
232232
"source": [

0 commit comments

Comments
 (0)