UBC-MDS · salva-u · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024
diff --git a/data/processed/data_cleaned.csv b/data/processed/data_cleaned.csv
diff --git a/data/processed/data_cleaned.parquet b/data/processed/data_cleaned.parquet
diff --git a/data/processed/pulseofthenation.csv b/data/processed/pulseofthenation.csv
diff --git a/environment.yml b/environment.yml
@@ -9,6 +9,7 @@ dependencies:
   - libffi=3.4.2
   - libsqlite=3.45.2
   - libzlib=1.2.13
+  - pyarrow=15.0.*
   #- ncurses=6.4.20240210
   - openssl=3.2.1
   - pip=24.0

diff --git a/notebooks/Data_preprocessing_engineering.ipynb b/notebooks/Data_preprocessing_engineering.ipynb
@@ -1,18 +1,16 @@
 {
  "cells": [
   {
-   "cell_type": "code",
-   "execution_count": 21,
+   "cell_type": "markdown",
    "id": "286c7a43-dc59-427a-9a9c-3d5a657e5ef9",
    "metadata": {},
-   "outputs": [],
    "source": [
     "# Data preprocessing and Data Engineering"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 43,
    "id": "78a42bc9-b102-4952-b5b4-07a669000854",
    "metadata": {},
    "outputs": [],
@@ -22,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 44,
    "id": "985299f9-9cca-4828-a6d9-9c698e7dd9b1",
    "metadata": {},
    "outputs": [
@@ -326,7 +324,7 @@
        "[5 rows x 24 columns]"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -349,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 45,
    "id": "4a4cbf3d-9a1c-4364-9994-34b61761fe4c",
    "metadata": {},
    "outputs": [
@@ -393,7 +391,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 46,
    "id": "c5a31d2c-d9eb-4fbb-8404-1fdbbbb82048",
    "metadata": {},
    "outputs": [
@@ -416,7 +414,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 47,
    "id": "53a13879-e3fd-46a5-87c6-0d2ea976e62b",
    "metadata": {},
    "outputs": [
@@ -446,7 +444,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 48,
    "id": "14ff977c-203e-40b7-9bcf-4e455e7b1caa",
    "metadata": {},
    "outputs": [
@@ -736,7 +734,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 49,
    "id": "0f8f37c8-0e4d-4c3c-8261-a9203853adcc",
    "metadata": {},
    "outputs": [
@@ -1031,7 +1029,7 @@
        "4                                             DK/REF                                                             "
       ]
      },
-     "execution_count": 28,
+     "execution_count": 49,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1053,7 +1051,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 50,
    "id": "0e23b81c-9fab-4c2e-95e7-40eb0b3d4008",
    "metadata": {},
    "outputs": [
@@ -1257,7 +1255,7 @@
        "4    Somewhat Likely                  Yes                  No        DK/REF  "
       ]
      },
-     "execution_count": 29,
+     "execution_count": 50,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1281,7 +1279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 51,
    "id": "31db4f65-c953-4e06-afd2-e30d82e8a7b7",
    "metadata": {},
    "outputs": [
@@ -1354,7 +1352,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 52,
    "id": "7c613616-412d-40b6-9c2a-da5a19d751d8",
    "metadata": {},
    "outputs": [
@@ -1567,7 +1565,7 @@
        "[5 rows x 22 columns]"
       ]
      },
-     "execution_count": 31,
+     "execution_count": 52,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1591,7 +1589,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 53,
    "id": "4a80cfa9-44dd-4380-97c7-727aff91e822",
    "metadata": {},
    "outputs": [
@@ -1991,7 +1989,7 @@
        "[800 rows x 23 columns]"
       ]
      },
-     "execution_count": 32,
+     "execution_count": 53,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2009,7 +2007,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 54,
    "id": "a6ca55ea-fb5e-4397-b1f3-10a3c61c548e",
    "metadata": {},
    "outputs": [
@@ -2039,7 +2037,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 55,
    "id": "2ef70159-3f09-4112-950f-a1f09d83b5f9",
    "metadata": {},
    "outputs": [
@@ -2067,7 +2065,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 56,
    "metadata": {},
    "outputs": [
     {
@@ -2279,7 +2277,7 @@
        "[5 rows x 23 columns]"
       ]
      },
-     "execution_count": 35,
+     "execution_count": 56,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2290,7 +2288,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2299,7 +2297,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [
     {
@@ -2518,7 +2516,7 @@
        "[5 rows x 23 columns]"
       ]
      },
-     "execution_count": 37,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2529,24 +2527,74 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 59,
+   "id": "c36f7e10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "2c70e83a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\salva\\AppData\\Local\\Temp\\ipykernel_11076\\3014061960.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n",
+      "  data_cleaned['political_spectrum_score'] = data_cleaned['political_spectrum_score'].replace(\"Don't Know/ Refuse to Answer\", np.nan)\n"
+     ]
+    }
+   ],
+   "source": [
+    "data_cleaned['political_spectrum_score'] = data_cleaned['political_spectrum_score'].replace(\"Don't Know/ Refuse to Answer\", np.nan)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "c11e58b8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([ 1.,  0., -1., nan])"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_cleaned['political_spectrum_score'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
    "id": "2f683129-d2ec-4d65-b6de-59045c4b1ed3",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Data saved to: ../data/processed/data_cleaned.csv\n"
+      "Data saved to: ../data/processed/data_cleaned.parquet\n"
      ]
     }
    ],
    "source": [
     "# output cleaned data\n",
-    "output_file_path = '../data/processed/data_cleaned.csv' \n",
+    "output_file_path = '../data/processed/data_cleaned.parquet'\n",
     "\n",
     "# Save the dataframe to a Parquet file\n",
-    "data_dropped.to_csv(output_file_path)\n",
+    "data_dropped.to_parquet(output_file_path)\n",
     "\n",
     "print(\"Data saved to:\", output_file_path)\n"
    ]
@@ -2576,7 +2624,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.2"
+   "version": "3.10.5"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/processing.ipynb b/notebooks/processing.ipynb
diff --git a/requirements.txt b/requirements.txt
@@ -119,3 +119,4 @@ websocket-client==1.7.*
 Werkzeug==3.0.*
 wheel==0.43.*
 zipp==3.18.*
+pyarrow==15.0.*