Add k means script

RoundofThree · RoundofThree · commit 586c55052f59 · 2021-05-10T00:43:21.000+02:00
diff --git a/huntsDT/README.md b/huntsDT/README.md
@@ -0,0 +1,3 @@
+# Hunt's algorithm to build Decision Trees 
+
+Builds a decision tree by choosing splits according to the Gini index. It also outputs the Gini values to explain why the algorithm chose each attribute.
diff --git a/huntsDT/huntDS.py b/huntsDT/huntDS.py
diff --git a/huntsDT/huntDT_driver.py b/huntsDT/huntDT_driver.py
@@ -0,0 +1,12 @@
+import argparse
+from huntDS import * 
+
+def main():
+    """Parse the command-line options of the Hunt's algorithm walkthrough.
+
+    Recognised options are ``-v`` (a flag) and ``--table_csv`` (a file path).
+    The parsed verbosity flag is stored in a local and not used further.
+    """
+    cli = argparse.ArgumentParser(
+        description="Hunt's algorithm walkthrough. By Roundofthree."
+    )
+    cli.add_argument(
+        "-v",
+        action='store_true',
+        help="Print the intermediate GINI indexes.",
+    )
+    cli.add_argument(
+        "--table_csv",
+        type=str,
+        help="File path to a .csv file with the records.",
+    )
+    options = cli.parse_args()
+    # Parsed here; nothing in this function reads it afterwards.
+    verbose = options.v
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/k-means/Kmeans-driver.py b/k-means/Kmeans-driver.py
@@ -0,0 +1,45 @@
+from Kmeans import *
+import sys
+import csv
+
+def _read_records(path):
+    """Read the CSV at *path* into a list of integer rows, skipping blank rows."""
+    with open(path, 'r', newline='') as csv_file:
+        reader = csv.reader(csv_file, delimiter=',')
+        return [[int(value) for value in row] for row in reader if row]
+
+def _prompt_k(n_records):
+    """Ask for k until an integer between 1 and *n_records* is entered."""
+    prompt = "Enter k: "
+    while True:
+        try:
+            k = int(input(prompt))
+        except ValueError:
+            k = 0  # not a number: treat as out of range and ask again
+        if 1 <= k <= n_records:
+            return k
+        prompt = f"k must be between 1 and {n_records}: "
+
+def _prompt_point(label, n_attributes):
+    """Ask for one comma-separated point until it has *n_attributes* integers."""
+    prompt = f"{label}-> "
+    while True:
+        try:
+            point = [int(value) for value in input(prompt).strip().split(",")]
+        except ValueError:
+            point = []  # non-numeric entry: force a retry
+        if len(point) == n_attributes:
+            return point
+        prompt = f"Try again: {label}-> "
+
+def main():
+    """Run K-Means interactively on the records of a CSV file.
+
+    Reads the records from the file named by the first command-line
+    argument, asks for k and for k initial centroids on standard input, then
+    runs iterations and prints the table after each one until an iteration
+    changes no cluster assignment.
+
+    Exits with status 1 when the input file argument is missing or the file
+    holds no records.
+    """
+    if len(sys.argv) < 2:
+        print("\nError")
+        print("\nUsage: > Kmeans-driver.py <table.csv> [output_file]")
+        sys.exit(1)
+
+    # NOTE: a second argument (output file) is still accepted but nothing is
+    # written to it; only the input file is used.
+    records = _read_records(sys.argv[1])
+    if not records:
+        print("\nError: no records found in the input file")
+        sys.exit(1)
+
+    n_records = len(records)
+    n_attributes = len(records[0])
+    # ask for k
+    k = _prompt_k(n_records)
+    # ask for k initial centroids, labelled with the 0-based cluster index
+    print(f"Enter {k} points with {n_attributes} attributes each, values separated by a single comma:")
+    centroids = [_prompt_point(i, n_attributes) for i in range(k)]
+
+    kmeans = KMeans(records=records, k=k, centroids=centroids)
+    while True:
+        n_changes = kmeans.run_iteration()
+        kmeans.print_table()
+        if n_changes == 0:
+            break
+    
+
+# Run the driver only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()
+    
diff --git a/k-means/Kmeans.py b/k-means/Kmeans.py
@@ -0,0 +1,68 @@
+import numpy as np 
+
+# Euclid distance
+def euclid(a, b):
+    """Return the Euclidean distance between points *a* and *b*.
+
+    Args:
+        a: sequence of numbers.
+        b: sequence of numbers with the same length as *a*.
+
+    Returns:
+        The square root of the sum of squared coordinate differences.
+
+    Raises:
+        ValueError: if *a* and *b* have different lengths.
+    """
+    # Imported here because the module only imports numpy at the top.
+    import math
+
+    if len(a) != len(b):
+        raise ValueError("points must have the same number of attributes")
+    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
+
+# Manhattan distance
+def manhattan(a, b):
+    """Return the Manhattan distance between points *a* and *b*.
+
+    Args:
+        a: sequence of numbers.
+        b: sequence of numbers with the same length as *a*.
+
+    Returns:
+        The sum of the absolute coordinate differences.
+
+    Raises:
+        ValueError: if *a* and *b* have different lengths.
+    """
+    if len(a) != len(b):
+        raise ValueError("points must have the same number of attributes")
+    return sum(abs(x - y) for x, y in zip(a, b))
+
+class KMeans:
+    """Step-by-step K-Means clustering over a list of numeric records.
+
+    Each record is a list of attribute values. The constructor appends one
+    trailing column to every record, in place, holding the index of the
+    cluster the record currently belongs to.
+    """
+
+    def __init__(self, records=None, k=0, centroids=None, metric="EUCLID"):
+        """Store the data set and the initial centroids and pick the metric.
+
+        Args:
+            records: list of records (lists of numbers). Mutated in place:
+                a cluster-id column, initially 0, is appended to each one.
+            k: number of clusters.
+            centroids: list of initial centroid points. Updated in place by
+                run_iteration().
+            metric: "EUCLID" selects euclid(); any other value selects
+                manhattan().
+        """
+        self.iteration_n = 1
+        # None instead of a mutable default, so instances never share lists.
+        self.records = [] if records is None else records
+        self.k = k
+        self.centroids = [] if centroids is None else centroids
+        if metric == "EUCLID":
+            self.distance = euclid
+        else:
+            self.distance = manhattan
+        for r in self.records:
+            r.append(0)  # cluster id col
+
+    def run_iteration(self):
+        """Assign every record to its nearest centroid, then move the centroids.
+
+        Returns:
+            The number of records whose cluster changed in this iteration.
+            Every record counts as changed on the first iteration, because
+            the initial cluster id is only a placeholder.
+
+        Raises:
+            ValueError: if there are no centroids.
+        """
+        if not self.centroids:
+            raise ValueError("at least one centroid is required")
+        n_clusters = len(self.centroids)
+        n_changes = 0
+        # assignment step: nearest centroid wins, first one on ties
+        for p in self.records:
+            point = p[:-1]
+            distances = [self.distance(point, c) for c in self.centroids]
+            chosen = distances.index(min(distances))
+            if self.iteration_n == 1 or chosen != p[-1]:
+                n_changes += 1
+            p[-1] = chosen
+        # update step: each centroid becomes the mean of its cluster
+        n_attributes = len(self.centroids[0])
+        n_sum = [0] * n_clusters
+        cumulative_sum = [[0] * n_attributes for _ in range(n_clusters)]
+        for p in self.records:
+            cluster = p[-1]
+            n_sum[cluster] += 1  # once per record, not once per attribute
+            for j, value in enumerate(p[:-1]):  # skip the cluster id column
+                cumulative_sum[cluster][j] += value
+        for i, c in enumerate(self.centroids):
+            if n_sum[i] == 0:
+                continue  # empty cluster: keep its centroid where it is
+            for j in range(len(c)):
+                c[j] = cumulative_sum[i][j] / n_sum[i]
+        self.iteration_n += 1
+        return n_changes
+
+    def print_table(self):
+        """Print one line per record: point, distance to each centroid, cluster.
+
+        Distances are measured against the current centroids, i.e. the ones
+        already moved by the latest run_iteration() call.
+        """
+        for r in self.records:
+            point = r[:-1]
+            out = "(" + ", ".join(str(value) for value in point) + ")"
+            out += " -> "
+            for c in self.centroids:
+                out += str(self.distance(point, c))
+                out += " | "
+            out += "Cluster "
+            out += str(r[-1])
+            print(out)
diff --git a/k-means/README.md b/k-means/README.md
@@ -0,0 +1,4 @@
+# K-Means 
+
+Given k and k manually selected initial centroids, runs K-Means and displays the cluster assignments after each iteration.
+
diff --git a/watermark_removal/images/removed3.png b/watermark_removal/images/removed3.png
diff --git a/watermark_removal/light_watermark_remover.ipynb b/watermark_removal/light_watermark_remover.ipynb
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 85,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,19 +40,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 86,
    "metadata": {},
    "outputs": [],
    "source": [
     "# image = \"./images/test1.jpg\"\n",
-    "image = \"./images/test2-nored.jpg\"\n",
-    "highlighted_image = \"./images/test2.jpg\"\n",
-    "output = \"./images/removed2.png\""
+    "image = \"./images/test3-nored.jpg\"\n",
+    "highlighted_image = \"./images/test3.jpg\"\n",
+    "output = \"./images/removed3.png\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 87,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 89,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -91,7 +91,7 @@
     "# test1  \n",
     "# x, y, w, h = 600, 1400, 250, 35 \n",
     "# test2 \n",
-    "x, y, w, h = 580, 1480, 470, 70 \n",
+    "x, y, w, h = 50, 1480, 470, 70 \n",
     "# test3 \n",
     "# x, y, w, h = 600, 1400, 250, 35 \n",
     "\n",
@@ -129,7 +129,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 90,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -144,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": 91,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -173,7 +173,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
+   "execution_count": 92,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,7 +201,7 @@
     "\n",
     "    display(\"Output2\", fI)\n",
     "\n",
-    "        # predict the color from the left and top neighbor \n",
+    "    # predict the color from the left and top neighbor \n",
     "    for row in range(y, y+h):\n",
     "        for col in range(x, x+w):\n",
     "            if fI[row, col][0] == 0 and fI[row, col][1] == 0 and fI[row, col][2] == 0:\n",
@@ -211,6 +211,7 @@
     "\n",
     "    display(\"Output test\", fI)\n",
     "\n",
+    "    # blur again \n",
     "    fI[y:y+h, x:x+w, :] = cv.medianBlur(fI[y:y+h, x:x+w, :], 7) \n",
     "    fI[fI<0] = 0\n",
     "    fI[fI>255] = 255\n",
@@ -222,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": 93,
    "metadata": {
     "tags": []
    },
diff --git a/watermark_removal/watermark_remover.ipynb b/watermark_removal/watermark_remover.ipynb
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -80,7 +80,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -135,7 +135,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -164,7 +164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,7 +193,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -226,7 +226,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# Hunt's algorithm to build Decision Trees`
	`2`	`+`
	`3`	`+Build a Decision Tree by splitting based on GINI index. Also outputs the GINI values to explain why the algo chose some attribute.`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +# K-Means
++
 +Given k and k manually selected initial clusters, run K-Means and display each iteration.
++