In [None]:
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "from sklearn.metrics import recall_score, precision_score, f1_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_JSON_FILE = 'SpamData/01_Processing/email-text-data.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_json(DATA_JSON_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CATEGORY</th>\n",
       "      <th>MESSAGE</th>\n",
       "      <th>FILE_NAME</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>995</th>\n",
       "      <td>1</td>\n",
       "      <td>PUBLIC ANNOUNCEMENT:\\n\\n\\n\\nThe new .NAME doma...</td>\n",
       "      <td>00496.acf53035be6cb4c667fd342551c5d467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>996</th>\n",
       "      <td>1</td>\n",
       "      <td>This is a multi-part message in MIME format.\\n...</td>\n",
       "      <td>00497.353a61b265f11dd0bae116c0149abbe1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>997</th>\n",
       "      <td>1</td>\n",
       "      <td>PROMOTE YOUR PRODUCT OR SERVICE TO MILLIONS TO...</td>\n",
       "      <td>00498.7f293b818e2e46d3a8bad44eda672947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>998</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;html&gt;\\n\\n&lt;head&gt;\\n\\n&lt;/head&gt;\\n\\n&lt;body&gt;\\n\\n\\n\\n&lt;...</td>\n",
       "      <td>00499.257302b8f6056eb85e0daa37bfcd2c68</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>999</th>\n",
       "      <td>1</td>\n",
       "      <td>As to\\n\\n\\n\\n\\n\\n\\n\\nWant to refinance?\\n\\n\\n\\...</td>\n",
       "      <td>00500.87320162ab5b79f67978406cf909c3d1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     CATEGORY                                            MESSAGE  \\\n",
       "995         1  PUBLIC ANNOUNCEMENT:\\n\\n\\n\\nThe new .NAME doma...   \n",
       "996         1  This is a multi-part message in MIME format.\\n...   \n",
       "997         1  PROMOTE YOUR PRODUCT OR SERVICE TO MILLIONS TO...   \n",
       "998         1  <html>\\n\\n<head>\\n\\n</head>\\n\\n<body>\\n\\n\\n\\n<...   \n",
       "999         1  As to\\n\\n\\n\\n\\n\\n\\n\\nWant to refinance?\\n\\n\\n\\...   \n",
       "\n",
       "                                  FILE_NAME  \n",
       "995  00496.acf53035be6cb4c667fd342551c5d467  \n",
       "996  00497.353a61b265f11dd0bae116c0149abbe1  \n",
       "997  00498.7f293b818e2e46d3a8bad44eda672947  \n",
       "998  00499.257302b8f6056eb85e0daa37bfcd2c68  \n",
       "999  00500.87320162ab5b79f67978406cf909c3d1  "
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5799, 3)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebind rather than sort in place: the cell stays idempotent on re-run and\n",
    "# the frame shown by earlier cells is not silently mutated underneath them.\n",
    "data = data.sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CATEGORY</th>\n",
       "      <th>MESSAGE</th>\n",
       "      <th>FILE_NAME</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5794</th>\n",
       "      <td>0</td>\n",
       "      <td>http://news.bbc.co.uk/1/hi/england/2515127.stm...</td>\n",
       "      <td>01396.61983fbe6ec43f55fd44e30fce24ffa6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5795</th>\n",
       "      <td>0</td>\n",
       "      <td>&gt; &gt;-- be careful when using this one.) Also, t...</td>\n",
       "      <td>01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5796</th>\n",
       "      <td>0</td>\n",
       "      <td>&gt;&gt;&gt;&gt;&gt; \"SM\" == Skip Montanaro &lt;skip@pobox.com&gt; ...</td>\n",
       "      <td>01398.169b51731fe569f42169ae8f948ec676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5797</th>\n",
       "      <td>0</td>\n",
       "      <td>So then, \"Mark Hammond\" &lt;mhammond@skippinet.co...</td>\n",
       "      <td>01399.ca6b00b7b341bbde9a9ea3dd6a7bf896</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5798</th>\n",
       "      <td>0</td>\n",
       "      <td>Hi there,\\n\\n\\n\\nNow this is probably of no us...</td>\n",
       "      <td>01400.f897f0931e461e7b2e964d28e927c35e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CATEGORY                                            MESSAGE  \\\n",
       "5794         0  http://news.bbc.co.uk/1/hi/england/2515127.stm...   \n",
       "5795         0  > >-- be careful when using this one.) Also, t...   \n",
       "5796         0  >>>>> \"SM\" == Skip Montanaro <skip@pobox.com> ...   \n",
       "5797         0  So then, \"Mark Hammond\" <mhammond@skippinet.co...   \n",
       "5798         0  Hi there,\\n\\n\\n\\nNow this is probably of no us...   \n",
       "\n",
       "                                   FILE_NAME  \n",
       "5794  01396.61983fbe6ec43f55fd44e30fce24ffa6  \n",
       "5795  01397.9f9ef4c2a8dc012d80f2ce2d3473d3b7  \n",
       "5796  01398.169b51731fe569f42169ae8f948ec676  \n",
       "5797  01399.ca6b00b7b341bbde9a9ea3dd6a7bf896  \n",
       "5798  01400.f897f0931e461e7b2e964d28e927c35e  "
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = CountVectorizer(stop_words='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Learn the vocabulary from every message and encode each message as token counts.\n",
    "all_features = vectorizer.fit_transform(data['MESSAGE'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(5799, 102694)"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_features.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'doctype': 34865,\n",
       " 'html': 48472,\n",
       " 'public': 74013,\n",
       " 'w3c': 93790,\n",
       " 'dtd': 36354,\n",
       " 'transitional': 88580,\n",
       " 'en': 38432,\n",
       " 'head': 47011,\n",
       " 'meta': 61701,\n",
       " 'content': 30249,\n",
       " '3d': 6385,\n",
       " 'text': 86991,\n",
       " 'charset': 27796,\n",
       " '3dwindows': 7297,\n",
       " '1252': 2025,\n",
       " 'http': 48497,\n",
       " 'equiv': 38991,\n",
       " '3dcontent': 6908,\n",
       " 'ype': 99054,\n",
       " 'mshtml': 63412,\n",
       " '00': 0,\n",
       " '2314': 4235,\n",
       " '1000': 1497,\n",
       " '3dgenerator': 6987,\n",
       " 'body': 24390,\n",
       " 'inserted': 52119,\n",
       " 'calypso': 26557,\n",
       " 'table': 86120,\n",
       " 'border': 24581,\n",
       " '3d0': 6386,\n",
       " 'cellpadding': 27375,\n",
       " 'cellspacing': 27383,\n",
       " '3d2': 6525,\n",
       " 'id': 49828,\n",
       " '3d_calyprintheader_': 6758,\n",
       " 'ules': 90247,\n",
       " '3dnone': 7130,\n",
       " 'style': 84723,\n",
       " 'color': 29367,\n",
       " 'black': 23893,\n",
       " 'display': 34406,\n",
       " 'width': 95488,\n",
       " '100': 1496,\n",
       " 'tbody': 86462,\n",
       " 'tr': 88442,\n",
       " 'td': 86548,\n",
       " 'colspan': 29390,\n",
       " '3d3': 6570,\n",
       " 'hr': 48373,\n",
       " '3dblack': 6863,\n",
       " 'noshade': 65849,\n",
       " 'size': 82347,\n",
       " '3d1': 6417,\n",
       " 'end': 38496,\n",
       " 'font': 42257,\n",
       " '000000': 4,\n",
       " 'face': 40498,\n",
       " '3dverdana': 7283,\n",
       " 'arial': 20116,\n",
       " 'helvetica': 47220,\n",
       " 'br': 24813,\n",
       " 'ff0000': 41235,\n",
       " 'copperplate': 30451,\n",
       " 'gothic': 45303,\n",
       " 'bold': 24438,\n",
       " '3d5': 6641,\n",
       " 'ptsize': 73976,\n",
       " '10': 1495,\n",
       " 'center': 27405,\n",
       " 'save': 80370,\n",
       " '70': 11063,\n",
       " 'life': 58442,\n",
       " 'insurance': 52249,\n",
       " 'ff': 41231,\n",
       " '0000': 2,\n",
       " 'spend': 83583,\n",
       " 'pt': 73869,\n",
       " 'quote': 75613,\n",
       " 'savings': 80383,\n",
       " 'align': 18634,\n",
       " '3dleft': 7063,\n",
       " 'bordercolor': 24584,\n",
       " '111111': 1817,\n",
       " 'wi': 95456,\n",
       " 'dth': 36360,\n",
       " '3d650': 6690,\n",
       " '35': 5987,\n",
       " '3d4': 6607,\n",
       " 'ensurin': 38692,\n",
       " 'family': 40651,\n",
       " 'financial': 41598,\n",
       " 'security': 81026,\n",
       " 'important': 51356,\n",
       " 'ma': 60143,\n",
       " 'kes': 55853,\n",
       " 'buying': 25619,\n",
       " 'simple': 82172,\n",
       " 'affordable': 17601,\n",
       " 'provide': 73660,\n",
       " 'free': 42773,\n",
       " 'access': 16655,\n",
       " 'best': 23129,\n",
       " 'companies': 29576,\n",
       " 'lowest': 59365,\n",
       " 'rates': 76350,\n",
       " '3dmiddle': 7086,\n",
       " 'valign': 92046,\n",
       " '3dtop': 7260,\n",
       " '18': 2693,\n",
       " 'padding': 69143,\n",
       " 'left': 57950,\n",
       " '5px': 9823,\n",
       " 'right': 78422,\n",
       " 'fast': 40734,\n",
       " 'eas': 37267,\n",
       " 'saves': 80378,\n",
       " 'money': 62840,\n",
       " 'let': 58083,\n",
       " 'help': 47200,\n",
       " 'started': 84137,\n",
       " 'val': 92021,\n",
       " 'ues': 89900,\n",
       " 'country': 30655,\n",
       " 'new': 64988,\n",
       " 'coverage': 30705,\n",
       " 'hundreds': 48607,\n",
       " 'tho': 87389,\n",
       " 'usands': 91289,\n",
       " 'dollars': 34945,\n",
       " 'requesting': 77710,\n",
       " 'lifequote': 58457,\n",
       " 'service': 81359,\n",
       " 'minutes': 62236,\n",
       " 'complete': 29666,\n",
       " 'shop': 81840,\n",
       " 'compare': 29592,\n",
       " 'types': 89378,\n",
       " 'height': 47157,\n",
       " '3d50': 6642,\n",
       " '3dcenter': 6892,\n",
       " 'href': 48390,\n",
       " 'website': 95049,\n",
       " 'e365': 36945,\n",
       " 'cc': 27105,\n",
       " 'savequote': 80375,\n",
       " 'click': 28719,\n",
       " 'strong': 84630,\n",
       " 'protecting': 73614,\n",
       " 'investment': 52605,\n",
       " 'll': 58888,\n",
       " 'eve': 39482,\n",
       " 'make': 60515,\n",
       " 'sans': 80256,\n",
       " 'serif': 81304,\n",
       " 'receipt': 76778,\n",
       " 'email': 38216,\n",
       " 'error': 39109,\n",
       " 'wish': 95691,\n",
       " 'removed': 77482,\n",
       " 'list': 58680,\n",
       " 'mailto': 60412,\n",
       " 'coins': 29267,\n",
       " 'btamail': 25250,\n",
       " 'net': 64868,\n",
       " 'cn': 29033,\n",
       " 'type': 89371,\n",
       " 'remove': 77467,\n",
       " 'reside': 77791,\n",
       " 'state': 84171,\n",
       " 'prohibits': 73445,\n",
       " 'mail': 60312,\n",
       " 'solicitations': 83135,\n",
       " 'insuran': 52248,\n",
       " 'ce': 27294,\n",
       " 'disregard': 34433,\n",
       " 'fight': 41505,\n",
       " 'risk': 78497,\n",
       " 'cancer': 26620,\n",
       " 'www': 96531,\n",
       " 'adclick': 17058,\n",
       " 'ws': 96300,\n",
       " 'cfm': 27528,\n",
       " '315': 5723,\n",
       " 'pk007': 71825,\n",
       " 'slim': 82620,\n",
       " 'guaranteed': 45873,\n",
       " 'lose': 59292,\n",
       " '12': 1955,\n",
       " 'lbs': 57733,\n",
       " '30': 5593,\n",
       " 'days': 32422,\n",
       " '249': 4396,\n",
       " 'child': 27974,\n",
       " 'support': 85179,\n",
       " 'deserve': 33354,\n",
       " 'legal': 57964,\n",
       " 'advice': 17358,\n",
       " '245': 4378,\n",
       " 'pk002': 71824,\n",
       " 'join': 54666,\n",
       " 'web': 94964,\n",
       " 'fastest': 40741,\n",
       " 'growing': 45726,\n",
       " 'singles': 82236,\n",
       " 'community': 29561,\n",
       " '259': 4511,\n",
       " 'start': 84135,\n",
       " 'private': 73228,\n",
       " 'photo': 71334,\n",
       " 'album': 18562,\n",
       " 'online': 67665,\n",
       " '283': 4724,\n",
       " 'wonderful': 95974,\n",
       " 'day': 32411,\n",
       " 'offer': 67158,\n",
       " 'manager': 60604,\n",
       " 'prizemama': 73248,\n",
       " 'leave': 57913,\n",
       " 'use': 91315,\n",
       " 'link': 58601,\n",
       " 'qves': 75647,\n",
       " 'com': 29405,\n",
       " 'trim': 88771,\n",
       " 'ilug': 51126,\n",
       " 'linux': 58631,\n",
       " '7c17': 11619,\n",
       " '7c114258': 11617,\n",
       " 'irish': 52919,\n",
       " 'users': 91367,\n",
       " 'group': 45712,\n",
       " 'mailman': 60393,\n",
       " 'listinfo': 58724,\n",
       " 'subscription': 84844,\n",
       " 'information': 51875,\n",
       " 'maintainer': 60489,\n",
       " 'listmaster': 58729,\n",
       " 'zzzz': 101425,\n",
       " 'spamassassin': 83404,\n",
       " 'taint': 86206,\n",
       " 'org': 68066,\n",
       " '7c308417': 11623,\n",
       " 'adult': 17302,\n",
       " 'club': 28856,\n",
       " 'offers': 67168,\n",
       " 'membership': 61539,\n",
       " 'instant': 52197,\n",
       " 'sites': 82305,\n",
       " 'user': 91337,\n",
       " 'password': 69573,\n",
       " '760382': 11376,\n",
       " 'internet': 52404,\n",
       " 'news': 65033,\n",
       " '08': 770,\n",
       " '02': 396,\n",
       " 'just': 55049,\n",
       " 'million': 62115,\n",
       " 'members': 61538,\n",
       " 'signed': 82087,\n",
       " 'month': 62908,\n",
       " '721': 11185,\n",
       " '184': 2732,\n",
       " 'faq': 40686,\n",
       " 'offering': 67166,\n",
       " 'advertisers': 17347,\n",
       " 'pay': 69685,\n",
       " 'ad': 16973,\n",
       " 'space': 83363,\n",
       " 'don': 34992,\n",
       " 'true': 88876,\n",
       " 'absolutely': 16493,\n",
       " 'cent': 27401,\n",
       " 'account': 16712,\n",
       " 'friends': 42933,\n",
       " 'yes': 98554,\n",
       " 'long': 59207,\n",
       " 'age': 17749,\n",
       " 'sign': 82070,\n",
       " 'following': 42233,\n",
       " 'links': 58616,\n",
       " 'member': 61536,\n",
       " 'multi': 63629,\n",
       " 'dollar': 34944,\n",
       " 'operations': 67834,\n",
       " 'policies': 72341,\n",
       " 'rules': 79420,\n",
       " 'required': 77713,\n",
       " 'info': 51836,\n",
       " 'won': 95971,\n",
       " 'charge': 27776,\n",
       " 'pass': 69540,\n",
       " 'believe': 22983,\n",
       " 'read': 76610,\n",
       " 'terms': 86896,\n",
       " 'conditions': 29848,\n",
       " 'adults': 17306,\n",
       " 'farm': 40704,\n",
       " '80': 12020,\n",
       " '71': 11128,\n",
       " '66': 10344,\n",
       " 'aid': 18121,\n",
       " 'girls': 44716,\n",
       " 'animals': 19287,\n",
       " 'getting': 44434,\n",
       " 'freaky': 42763,\n",
       " 'lifetime': 58463,\n",
       " 'sexy': 81461,\n",
       " 'celebes': 27343,\n",
       " 'celebst': 27357,\n",
       " 'thousands': 87420,\n",
       " 'xxx': 97882,\n",
       " 'doing': 34927,\n",
       " 'play': 71975,\n",
       " 'house': 48262,\n",
       " 'porn': 72495,\n",
       " 'mega': 61469,\n",
       " 'live': 58771,\n",
       " 'feeds': 41099,\n",
       " '60': 10038,\n",
       " 'cams': 26601,\n",
       " 'asian': 20388,\n",
       " 'sex': 81443,\n",
       " 'fantasies': 40674,\n",
       " 'japanese': 53854,\n",
       " 'schoolgirls': 80609,\n",
       " 'shows': 81920,\n",
       " 'lesbian': 58064,\n",
       " 'lace': 57390,\n",
       " 'jennifer': 54112,\n",
       " 'simpson': 82185,\n",
       " 'miami': 61933,\n",
       " 'fl': 41836,\n",
       " 'entertained': 38714,\n",
       " 'boyffriend': 24717,\n",
       " 'years': 98506,\n",
       " 'joe': 54621,\n",
       " 'morgan': 62982,\n",
       " 'manhattan': 60644,\n",
       " 'ny': 66430,\n",
       " 'unbelievable': 90434,\n",
       " 'removal': 77461,\n",
       " 'instructions': 52224,\n",
       " 'received': 76784,\n",
       " 'advertisement': 17344,\n",
       " 'opted': 67899,\n",
       " 'receive': 76783,\n",
       " 'specials': 83517,\n",
       " 'affiliated': 17581,\n",
       " 'websites': 95051,\n",
       " 'emails': 38248,\n",
       " 'opt': 67895,\n",
       " 'database': 32322,\n",
       " 'optout': 67950,\n",
       " 'allow': 18722,\n",
       " '24': 4349,\n",
       " 'hours': 48261,\n",
       " 'vonolmosatkirekpups': 93231,\n",
       " 'thought': 87414,\n",
       " 'like': 58501,\n",
       " 'freeyankee': 42863,\n",
       " 'cgi': 27613,\n",
       " 'fy2': 43474,\n",
       " '822slim1': 12197,\n",
       " '822nic1': 12195,\n",
       " '822ppl1': 12196,\n",
       " 'daily': 32175,\n",
       " 'deals': 32711,\n",
       " 'social': 83033,\n",
       " '7c29': 11620,\n",
       " '7c134077': 11618,\n",
       " 'events': 39492,\n",
       " 'powerhouse': 72703,\n",
       " 'gifting': 44655,\n",
       " 'program': 73408,\n",
       " 'want': 94159,\n",
       " 'miss': 62318,\n",
       " 'founders': 42543,\n",
       " 'major': 60507,\n",
       " 'players': 71981,\n",
       " 'invitation': 52616,\n",
       " 'experts': 39930,\n",
       " 'calling': 26532,\n",
       " 'way': 94335,\n",
       " 'huge': 48542,\n",
       " 'cash': 26894,\n",
       " 'flow': 42029,\n",
       " 'conceived': 29784,\n",
       " 'leverage': 58128,\n",
       " '000': 1,\n",
       " '50': 8926,\n",
       " 'question': 75527,\n",
       " 'wealthy': 94952,\n",
       " 'tossing': 88256,\n",
       " 'lifeline': 58454,\n",
       " 'sake': 80117,\n",
       " 'hope': 48135,\n",
       " 'grab': 45437,\n",
       " 'hold': 47968,\n",
       " 'tight': 87584,\n",
       " 'ride': 78393,\n",
       " 'testimonials': 86959,\n",
       " 'hear': 47077,\n",
       " 'average': 21070,\n",
       " 'people': 70441,\n",
       " 've': 92282,\n",
       " 'al': 18497,\n",
       " 'single': 82233,\n",
       " 'mother': 63060,\n",
       " 'sure': 85211,\n",
       " 'sent': 81223,\n",
       " 'pledge': 72020,\n",
       " 'got': 45295,\n",
       " 'ky': 57016,\n",
       " 'didn': 33961,\n",
       " 'partner': 69523,\n",
       " 'work': 96033,\n",
       " 'think': 87327,\n",
       " 'decision': 32814,\n",
       " 'pick': 71493,\n",
       " 'gave': 44000,\n",
       " 'leads': 57878,\n",
       " 'training': 88523,\n",
       " 'ca': 26367,\n",
       " 'announcing': 19358,\n",
       " 'close': 28818,\n",
       " 'sales': 80134,\n",
       " 'fax': 40806,\n",
       " 'blast': 23942,\n",
       " 'immediately': 51253,\n",
       " 'entry': 38768,\n",
       " 'wait': 94058,\n",
       " '800': 12021,\n",
       " '421': 8001,\n",
       " '6318': 10184,\n",
       " '896': 12868,\n",
       " '6568': 10317,\n",
       " 'name__________________________________phone___________________________________________': 64363,\n",
       " 'fax_____________________________________email____________________________________________': 40808,\n",
       " 'time': 87625,\n",
       " 'call_________________________time': 26520,\n",
       " 'zone________________________________________': 100693,\n",
       " 'message': 61680,\n",
       " 'compliance': 29680,\n",
       " 'section': 80998,\n",
       " '301': 5607,\n",
       " 'paragraph': 69376,\n",
       " '1618': 2477,\n",
       " 'transmissions': 88603,\n",
       " 'sender': 81185,\n",
       " 'stopped': 84453,\n",
       " 'cost': 30578,\n",
       " 'sending': 81193,\n",
       " 'reply': 77619,\n",
       " 'address': 17095,\n",
       " 'word': 96017,\n",
       " 'subject': 84789,\n",
       " 'line': 58568,\n",
       " 'errors': 39110,\n",
       " 'omissions': 67573,\n",
       " 'exceptions': 39688,\n",
       " 'excluded': 39721,\n",
       " 'spam': 83390,\n",
       " 'compiled': 29644,\n",
       " 'replicate': 77615,\n",
       " 'relative': 77310,\n",
       " 'seattle': 80953,\n",
       " 'marketing': 60798,\n",
       " 'gigt': 44671,\n",
       " 'turbo': 89114,\n",
       " 'team': 86598,\n",
       " 'sole': 83126,\n",
       " 'purpose': 74147,\n",
       " 'communications': 29551,\n",
       " 'continued': 30279,\n",
       " 'inclusion': 51534,\n",
       " 'gracious': 45446,\n",
       " 'permission': 70552,\n",
       " 'send': 81178,\n",
       " 'tesrewinter': 86937,\n",
       " 'yahoo': 98304,\n",
       " 'deleted': 33060,\n",
       " 'wanted': 94160,\n",
       " '14': 2224,\n",
       " 'year': 98499,\n",
       " 'old': 67460,\n",
       " 'fortune': 42500,\n",
       " '500': 8927,\n",
       " 'company': 29579,\n",
       " 'tremendous': 88713,\n",
       " 'rate': 76347,\n",
       " 'looking': 59228,\n",
       " 'individuals': 51723,\n",
       " 'home': 48006,\n",
       " 'opportunity': 67871,\n",
       " 'excellent': 39680,\n",
       " 'income': 51540,\n",
       " 'experience': 39912,\n",
       " 'train': 88519,\n",
       " 'employed': 38374,\n",
       " 'career': 26784,\n",
       " 'vast': 92136,\n",
       " 'opportunities': 67870,\n",
       " 'basetel': 22498,\n",
       " 'wealthnow': 94949,\n",
       " 'energetic': 38549,\n",
       " 'self': 81125,\n",
       " 'motivated': 63072,\n",
       " 'form': 42425,\n",
       " 'employement': 38378,\n",
       " 'specialist': 83507,\n",
       " 'contact': 30198,\n",
       " '4139volw7': 7958,\n",
       " '758dody1425frhm1': 11352,\n",
       " '764smfc8513fcsll40': 11387,\n",
       " 'title': 87725,\n",
       " 'reliaquote': 77357,\n",
       " 'iso': 53094,\n",
       " '8859': 12828,\n",
       " 'leftmargin': 57955,\n",
       " 'topmargin': 88205,\n",
       " 'ffcc99': 41285,\n",
       " 'vlink': 93067,\n",
       " 'alink': 18642,\n",
       " 'ffcc00': 41282,\n",
       " '468': 8229,\n",
       " 'bgcolor': 23297,\n",
       " '993366': 13988,\n",
       " '43': 8047,\n",
       " '56': 9286,\n",
       " 'theadmanager': 87189,\n",
       " 'server': 81344,\n",
       " 'asp': 20429,\n",
       " 'ad_key': 17010,\n",
       " 'yuesbhwakmlk': 99230,\n",
       " 'ext': 40021,\n",
       " 'target': 86349,\n",
       " '_blank': 14794,\n",
       " 'img': 51212,\n",
       " 'src': 83872,\n",
       " 'banner': 22378,\n",
       " 'bannerads': 22382,\n",
       " 'images': 51166,\n",
       " 'logo6': 59167,\n",
       " 'gif': 44644,\n",
       " '120': 1956,\n",
       " '32': 5755,\n",
       " '44': 8096,\n",
       " 'nbsp': 64561,\n",
       " '377': 6126,\n",
       " 'ffccff': 41287,\n",
       " 'headline6': 47044,\n",
       " '220': 3998,\n",
       " '72': 11179,\n",
       " '336699': 5883,\n",
       " '34': 5921,\n",
       " '45': 8150,\n",
       " 'tahoma': 86190,\n",
       " 'verdana': 92415,\n",
       " 'change': 27718,\n",
       " 'protect': 73612,\n",
       " 'future': 43330,\n",
       " 'sufficient': 84976,\n",
       " 'middle': 62022,\n",
       " '27': 4621,\n",
       " 'freequotes6': 42829,\n",
       " '177': 2653,\n",
       " '3366cc': 5884,\n",
       " '294': 4811,\n",
       " 'freequoteform': 42828,\n",
       " 'action': 16898,\n",
       " 'termlife': 86892,\n",
       " 'bin': 23627,\n",
       " 'banner_quote_resp': 22381,\n",
       " 'method': 61737,\n",
       " 'input': 52071,\n",
       " 'hidden': 47506,\n",
       " 'value': 92060,\n",
       " '51100004022000000321': 9025,\n",
       " 'sourceid': 83310,\n",
       " 'class': 28606,\n",
       " 'hptbcolor': 48335,\n",
       " 'ffffff': 41323,\n",
       " 'residence': 77792,\n",
       " '65': 10286,\n",
       " 'select': 81106,\n",
       " 'formscombobox12': 42456,\n",
       " 'option': 67936,\n",
       " 'selected': 81113,\n",
       " 'alabama': 18518,\n",
       " 'alaska': 18536,\n",
       " 'arizona': 20133,\n",
       " 'arkansas': 20138,\n",
       " 'california': 26515,\n",
       " 'colorado': 29369,\n",
       " 'connecticut': 30020,\n",
       " 'delaware': 33047,\n",
       " 'dc': 32493,\n",
       " 'dist': 34461,\n",
       " 'columbia': 29394,\n",
       " 'florida': 42018,\n",
       " 'georgia': 44348,\n",
       " 'hawaii': 46838,\n",
       " 'idaho': 49961,\n",
       " 'illinois': 51096,\n",
       " 'indiana': 51675,\n",
       " 'iowa': 52701,\n",
       " 'kansas': 55546,\n",
       " 'kentucky': 55819,\n",
       " 'louisiana': 59329,\n",
       " 'maine': 60475,\n",
       " 'maryland': 60875,\n",
       " 'massachusetts': 60898,\n",
       " 'michigan': 61955,\n",
       " 'minnesota': 62221,\n",
       " 'mississippi': 62333,\n",
       " 'missouri': 62337,\n",
       " 'montana': 62901,\n",
       " 'nebraska': 64714,\n",
       " 'nevada': 64982,\n",
       " 'hampshire': 46564,\n",
       " 'jersey': 54132,\n",
       " 'mexico': 61765,\n",
       " 'york': 98983,\n",
       " 'north': 65816,\n",
       " 'carolina': 26833,\n",
       " 'dakota': 32190,\n",
       " 'ohio': 67295,\n",
       " 'oklahoma': 67431,\n",
       " 'oregon': 68057,\n",
       " 'pennsylvania': 70420,\n",
       " 'rhode': 78314,\n",
       " 'island': 53074,\n",
       " 'south': 83318,\n",
       " 'tennessee': 86845,\n",
       " 'texas': 86988,\n",
       " 'utah': 91446,\n",
       " 'vermont': 92448,\n",
       " 'virginia': 92855,\n",
       " 'washington': 94264,\n",
       " 'west': 95186,\n",
       " 'wisconsin': 95685,\n",
       " 'wyoming': 96618,\n",
       " 'date': 32345,\n",
       " 'birth': 23701,\n",
       " 'mm': 62558,\n",
       " 'dd': 32615,\n",
       " 'yy': 99538,\n",
       " 'nowrap': 65943,\n",
       " '01': 309,\n",
       " '03': 490,\n",
       " '04': 552,\n",
       " '05': 601,\n",
       " '06': 657,\n",
       " '07': 708,\n",
       " '09': 844,\n",
       " '11': 1798,\n",
       " '13': 2107,\n",
       " '15': 2339,\n",
       " '16': 2462,\n",
       " '17': 2583,\n",
       " '19': 2800,\n",
       " '20': 3486,\n",
       " '21': 3894,\n",
       " '22': 3997,\n",
       " '23': 4214,\n",
       " '25': 4433,\n",
       " '26': 4538,\n",
       " '28': 4710,\n",
       " '29': 4793,\n",
       " '31': 5688,\n",
       " '33': 5831,\n",
       " '36': 6037,\n",
       " '37': 6094,\n",
       " '38': 6143,\n",
       " '39': 6183,\n",
       " '40': 7824,\n",
       " '41': 7939,\n",
       " '42': 7997,\n",
       " '46': 8205,\n",
       " '47': 8247,\n",
       " '48': 8295,\n",
       " '49': 8344,\n",
       " '51': 9018,\n",
       " '52': 9072,\n",
       " '53': 9118,\n",
       " '54': 9185,\n",
       " '55': 9236,\n",
       " '57': 9340,\n",
       " '58': 9372,\n",
       " '59': 9412,\n",
       " '61': 10092,\n",
       " '62': 10140,\n",
       " '63': 10179,\n",
       " '64': 10230,\n",
       " '67': 10432,\n",
       " '68': 10469,\n",
       " '69': 10505,\n",
       " '73': 11215,\n",
       " '74': 11261,\n",
       " '75': 11317,\n",
       " '76': 11374,\n",
       " '77': 11411,\n",
       " '78': 11460,\n",
       " '79': 11500,\n",
       " '81': 12114,\n",
       " '82': 12178,\n",
       " '83': 12259,\n",
       " '84': 12352,\n",
       " 'formsradiobutton1': 42457,\n",
       " 'radio': 76125,\n",
       " 'male': 60546,\n",
       " 'formsradiobutton2': 42458,\n",
       " 'female': 41148,\n",
       " 'used': 91322,\n",
       " 'tobacco': 88025,\n",
       " 'products': 73349,\n",
       " 'months': 62916,\n",
       " 'formsradiobutton4': 42460,\n",
       " 'formsradiobutton3': 42459,\n",
       " 'formscombobox11': 42455,\n",
       " '100000': 1499,\n",
       " '125000': 2023,\n",
       " '125': 2021,\n",
       " '150000': 2343,\n",
       " '150': 2340,\n",
       " '175000': 2635,\n",
       " '175': 2633,\n",
       " '200000': 3490,\n",
       " '200': 3487,\n",
       " '225000': 4085,\n",
       " '225': 4082,\n",
       " '250000': 4436,\n",
       " '250': 4434,\n",
       " '275000': 4664,\n",
       " '275': 4662,\n",
       " '300000': 5596,\n",
       " '300': 5594,\n",
       " '325000': 5789,\n",
       " '325': 5787,\n",
       " '350000': 5990,\n",
       " '350': 5988,\n",
       " '375000': 6121,\n",
       " '375': 6120,\n",
       " '400000': 7827,\n",
       " '400': 7825,\n",
       " '425000': 8019,\n",
       " '425': 8018,\n",
       " '450000': 8153,\n",
       " '450': 8151,\n",
       " '475000': 8266,\n",
       " '475': 8265,\n",
       " '500000': 8930,\n",
       " '550000': 9239,\n",
       " '550': 9237,\n",
       " '600000': 10042,\n",
       " '600': 10039,\n",
       " '650000': 10289,\n",
       " '650': 10287,\n",
       " '700000': 11066,\n",
       " '700': 11064,\n",
       " '750000': 11320,\n",
       " '750': 11318,\n",
       " '800000': 12024,\n",
       " '850000': 12459,\n",
       " '850': 12457,\n",
       " '900000': 13582,\n",
       " '900': 13580,\n",
       " '950000': 13790,\n",
       " '950': 13788,\n",
       " '1000000': 1500,\n",
       " '1250000': 2024,\n",
       " '1500000': 2344,\n",
       " '1750000': 2636,\n",
       " '2000000': 3491,\n",
       " '2250000': 4086,\n",
       " '2500000': 4437,\n",
       " '3000000': 5597,\n",
       " '3500000': 5991,\n",
       " '4000000': 7828,\n",
       " '4500000': 8154,\n",
       " '5000000': 8931,\n",
       " '6000000': 10043,\n",
       " '7000000': 11067,\n",
       " '8000000': 12025,\n",
       " '9000000': 13583,\n",
       " '10000000': 1501,\n",
       " '11000000': 1801,\n",
       " '12000000': 1959,\n",
       " '13000000': 2111,\n",
       " '14000000': 2228,\n",
       " '15000000': 2345,\n",
       " 'need': 64740,\n",
       " 'formscombobox10': 42454,\n",
       " 'plan': 71920,\n",
       " 'formsbutton1': 42453,\n",
       " 'submit': 84812,\n",
       " 'submitbutton': 84814,\n",
       " 'photo6': 71343,\n",
       " 'jpg': 54782,\n",
       " '210': 3895,\n",
       " '173': 2617,\n",
       " '217': 3962,\n",
       " 'div': 34529,\n",
       " 'makes': 60530,\n",
       " 'easy': 37289,\n",
       " 'instantly': 52201,\n",
       " 'quotes': 75617,\n",
       " 'highly': 47536,\n",
       " 'rated': 76348,\n",
       " '106': 1718,\n",
       " '70percent6': 11124,\n",
       " '131': 2120,\n",
       " 'ffcc33': 41283,\n",
       " 'today': 88039,\n",
       " '0000ff': 53,\n",
       " 'copyright': 30464,\n",
       " '2001': 3501,\n",
       " 'rights': 78433,\n",
       " 'reserved': 77781,\n",
       " 'admanmail': 17188,\n",
       " 'hspace': 48437,\n",
       " 'vspace': 93433,\n",
       " 'alt': 18803,\n",
       " 'receiving': 76794,\n",
       " 'mailing': 60381,\n",
       " 'sendgreatoffers': 81190,\n",
       " 'subscribed': 84838,\n",
       " 'jm': 54453,\n",
       " 'netnoteinc': 64921,\n",
       " 'unsubscribe': 90955,\n",
       " 'em': 38188,\n",
       " 'sgo': 81543,\n",
       " 'include': 51527,\n",
       " 'unsubscribed': 90959,\n",
       " 'correspondence': 30542,\n",
       " 'services': 81361,\n",
       " 'directed': 34157,\n",
       " 'tired': 87709,\n",
       " 'bull': 25428,\n",
       " 'stop': 84428,\n",
       " 'losing': 59299,\n",
       " 'real': 76643,\n",
       " 'maker': 60527,\n",
       " 'big': 23545,\n",
       " 'boys': 24724,\n",
       " 'drive': 36212,\n",
       " 'doorstep': 35045,\n",
       " 'short': 81854,\n",
       " 'period': 70505,\n",
       " 'print': 73183,\n",
       " 'successleads': 84921,\n",
       " 'firemail': 41673,\n",
       " 'telephone': 86754,\n",
       " 'number': 66246,\n",
       " 'responding': 77868,\n",
       " '499': 8372,\n",
       " '99': 13970,\n",
       " 'name___________________________________': 64356,\n",
       " 'phone___________________________________': 71315,\n",
       " 'fax_____________________________________': 40807,\n",
       " 'email___________________________________': 38225,\n",
       " 'dear': 32719,\n",
       " 'ricardo1': 78361,\n",
       " 'red': 76938,\n",
       " 'effective': 37699,\n",
       " 'direct': 34154,\n",
       " 'advertising': 17349,\n",
       " 'blue': 24106,\n",
       " 'promote': 73486,\n",
       " 'business': 25553,\n",
       " 'low': 59355,\n",
       " 'addresses': 17113,\n",
       " '44c300': 8138,\n",
       " 'maximize': 61024,\n",
       " '309': 5644,\n",
       " '407': 7888,\n",
       " '7378': 11243,\n",
       " 'consultant': 30169,\n",
       " 'discuss': 34313,\n",
       " 'needs': 64750,\n",
       " '___________________________________________________________________': 14716,\n",
       " '_______________________________________________________________': 14712,\n",
       " '________________________________________________________________': 14713,\n",
       " 'city': 28382,\n",
       " '_____________________________________________________________________': 14718,\n",
       " 'phone': 71311,\n",
       " '__________________________________________________________________': 14715,\n",
       " '_______________________________________________________': 14704,\n",
       " '___________________________________________________________________________': 14722,\n",
       " 'comments': 29493,\n",
       " 'details': 33442,\n",
       " 'pricing': 73147,\n",
       " 'market': 60791,\n",
       " '247': 4387,\n",
       " 'po1': 72252,\n",
       " 'kj': 56221,\n",
       " '_8j7bjk9': 14652,\n",
       " 'tg0bk5nkiys5': 87089,\n",
       " 'cellular': 27384,\n",
       " 'accessories': 16666,\n",
       " 'wholesale': 95421,\n",
       " 'prices': 73143,\n",
       " '202': 3687,\n",
       " '101': 1549,\n",
       " '163': 2494,\n",
       " 'merchant': 61627,\n",
       " 'hands': 46602,\n",
       " 'ear': 37240,\n",
       " 'buds': 25350,\n",
       " 'holsters': 47999,\n",
       " '98': 13914,\n",
       " 'booster': 24545,\n",
       " 'antennas': 19434,\n",
       " 'cases': 26892,\n",
       " 'car': 26739,\n",
       " 'chargers': 27779,\n",
       " 'plates': 71959,\n",
       " 'lithium': 58752,\n",
       " 'ion': 52677,\n",
       " 'batteries': 22559,\n",
       " '94': 13749,\n",
       " 'nokia': 65693,\n",
       " 'motorola': 63090,\n",
       " 'lg': 58206,\n",
       " 'nextel': 65100,\n",
       " 'samsung': 80202,\n",
       " 'qualcomm': 75445,\n",
       " 'ericsson': 39059,\n",
       " 'audiovox': 20836,\n",
       " 'phones': 71323,\n",
       " 'assistance': 20504,\n",
       " '732': 11222,\n",
       " '751': 11330,\n",
       " '1457': 2281,\n",
       " 'mailings': 60385,\n",
       " 'request': 77707,\n",
       " 'removemenow68994': 77491,\n",
       " 'thank': 87162,\n",
       " 'super': 85106,\n",
       " 'free4pornlovers': 42778,\n",
       " 'simply': 82183,\n",
       " 'amateur': 18895,\n",
       " '990000': 13973,\n",
       " '660066': 10347,\n",
       " '9933ff': 13989,\n",
       " 'girl': 44713,\n",
       " 'door': 35041,\n",
       " 'tour': 88299,\n",
       " 'ff0099': 41240,\n",
       " 'photos': 71361,\n",
       " 'sneeky': 82928,\n",
       " 'nude': 66215,\n",
       " 'exibitionists': 39815,\n",
       " 'cheating': 27848,\n",
       " 'wives': 95729,\n",
       " 'girlfriends': 44715,\n",
       " 'pl': 71889,\n",
       " 'cell': 27364,\n",
       " 'spacer': 83368,\n",
       " '185': 2738,\n",
       " 'rowspan': 79053,\n",
       " 'qbfuiexorxkl': 74718,\n",
       " '168': 2537,\n",
       " '143': 2261,\n",
       " '181': 2709,\n",
       " 'htmlemails': 48482,\n",
       " 't193no_option_01': 85834,\n",
       " 't193no_option_02': 85835,\n",
       " 't193no_option_03': 85836,\n",
       " 't193no_option_04': 85837,\n",
       " 't193no_option_05': 85838,\n",
       " 't193no_option_06': 85839,\n",
       " '115': 1857,\n",
       " 't193no_option_07': 85840,\n",
       " '154': 2395,\n",
       " 't193no_option_08': 85841,\n",
       " 't193no_option_09': 85842,\n",
       " 't193no_option_10': 85843,\n",
       " 't193no_option_11': 85844,\n",
       " 't193no_option_12': 85845,\n",
       " 't193no_option_13': 85846,\n",
       " 't193no_option_14': 85847,\n",
       " '265': 4565,\n",
       " 'simplywireless': 82184,\n",
       " 'swdotcomlogo1': 85420,\n",
       " '273': 4648,\n",
       " 'voicestream': 93191,\n",
       " 'wireless': 95671,\n",
       " 'credit': 30975,\n",
       " 'approval': 19831,\n",
       " 'activate': 16908,\n",
       " 'activation': 16911,\n",
       " 'fee': 41093,\n",
       " 'applies': 19782,\n",
       " 'activations': 16912,\n",
       " 'available': 21047,\n",
       " 'areas': 20075,\n",
       " 'fulfilled': 43230,\n",
       " 'authorized': 20930,\n",
       " 'dealer': 32706,\n",
       " 'site': 82284,\n",
       " ...}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vectorizer.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 30% of the documents for testing; the fixed seed keeps the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    all_features,\n",
    "    data.CATEGORY,\n",
    "    test_size=0.3,\n",
    "    random_state=88,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4059, 102694)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1740, 102694)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "classifier = MultinomialNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Challenge:** Calculate the following for the test dataset: <br>\n",
    "The number of documents classified correctly. <br>\n",
    "The number of documents classified incorrectly. <br>\n",
    "The accuracy of the model. <br>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare true labels with predictions element-wise; summing the booleans counts the matches\n",
    "nr_correct = y_test.eq(classifier.predict(X_test)).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1638 documents classfied correctly\n"
     ]
    }
   ],
   "source": [
    "print(f'{nr_correct} documents classfied correctly')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every test document that was not classified correctly counts as a miss\n",
    "nr_incorrect = len(y_test) - nr_correct"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of documents incorrectly classified is 102\n"
     ]
    }
   ],
   "source": [
    "print(f'Number of documents incorrectly classified is {nr_incorrect}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The (testing) accuracy of the model is 94.14%\n"
     ]
    }
   ],
   "source": [
    "fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)\n",
    "print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9413793103448276"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Challenge:** For the testing dataset calculate the recall, precision and F1 score. Google for the scikit-learn documentation on this topic to work it out."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8288288288288288"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recall_score(y_true=y_test, y_pred=classifier.predict(X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9850107066381156"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "precision_score(y_true=y_test, y_pred=classifier.predict(X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9001956947162426"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1_score(y_true=y_test, y_pred=classifier.predict(X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "example = ['get viagra for free now!', \n",
    "          'need a mortgage? Reply to arrange a call with a specialist and get a quote', \n",
    "          'Could you please help me with the project for tomorrow?', \n",
    "          'Hello Jonathan, how about a game of golf tomorrow?', \n",
    "          'Ski jumping is a winter sport in which competitors aim to achieve the longest jump after descending from a specially designed ramp on their skis. Along with jump length, competitor\\'s style and other factors affect the final score. Ski jumping was first contested in Norway in the late 19th century, and later spread through Europe and North America in the early 20th century. Along with cross-country skiing, it constitutes the traditional group of Nordic skiing disciplines.'\n",
    "          ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_term_matrix = vectorizer.transform(example)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 1, 0, 0, 0], dtype=int64)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "classifier.predict(doc_term_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
