In [1]:
import re

from IPython.core.display import display, HTML

import pickle
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from PIL import Image
import PIL.ImageOps

In [2]:
# Load the two pickled inputs used by the rest of the notebook.
# pickle.load can execute arbitrary code while deserialising, so these paths
# should only ever point at files produced by this project.
raw_train_path = '../../data/clean/misconduct_train.pkl'
indicator_train_path = '../../data/clean/misconduct_indicators_train.pkl'

# Supplies the human-readable label columns for the export
# (presumably the table before indicator columns were added; confirm upstream).
with open(raw_train_path, 'rb') as raw_handle:
    unindicated_misconduct = pickle.load(raw_handle)

# Supplies the indicator columns that are fed to t-SNE.
with open(indicator_train_path, 'rb') as indicator_handle:
    misconduct = pickle.load(indicator_handle)

In [3]:
# Source columns whose one-hot expansions make up the t-SNE input.
indicated_cols = [
    'misconduct_type',
    'court_type',
    'enforcement_agency',
    'contracting_party'
]

# One-hot columns are named "<source column>_...", so a single pattern anchored
# at the start of the name picks out every expansion of the columns above.
one_hot_prefix = re.compile(f'^({"|".join(indicated_cols)})_')
indicator_features = [
    name for name in misconduct.columns
    if one_hot_prefix.search(name)
]

design_matrix = misconduct[indicator_features]

# Disabled experiment: also append one-hot columns for disposition_type.
# design_matrix = pd.concat(
#     [
#         design_matrix,
#         pd.get_dummies(misconduct['disposition_type'], prefix='disposition_type')
#     ],
#     axis=1
# )

In [4]:
# Project the indicator features down to two dimensions with t-SNE.
# t-SNE is stochastic: without a fixed random_state every kernel restart yields
# a different layout, so the exported CSV could not be regenerated. The seed
# value itself is arbitrary; it only needs to stay constant.
reduced = TSNE(n_components=2, random_state=0).fit_transform(design_matrix)

In [5]:
# Build the label columns that travel with each t-SNE point in the export.
MAX_LABEL_CHARS = 50  # longest text kept per exported field

# Work on a copy so the loaded frame is left untouched.
misconduct_for_csv = unindicated_misconduct.copy()
misconduct_for_csv['year'] = misconduct_for_csv['date'].apply(lambda d: str(d.year))
export_cols = [
    'instance',
    'year',
    'contractor_s',
    'misconduct_type',
    'court_type',
    'enforcement_agency',
    'contracting_party',
    'disposition_type'
]

# The CSV is written with plain comma-joined fields and no quoting, so a comma
# inside a value would shift every later column: strip commas, then truncate.
# regex=False makes the literal replacement explicit instead of relying on the
# pandas default, which differs between pandas versions.
for col in export_cols:
    misconduct_for_csv[col] = (
        misconduct_for_csv[col]
        .str.replace(',', '', regex=False)
        .str.slice(0, MAX_LABEL_CHARS)
    )

# Embedding rows and label rows are paired purely by position. Fail with a
# clear message if the two inputs do not even have the same number of rows.
# NOTE(review): this also assumes both pickles share the same row order;
# that cannot be verified from this notebook.
if len(reduced) != len(misconduct_for_csv):
    raise ValueError(
        f'row count mismatch: {len(reduced)} embedded rows vs '
        f'{len(misconduct_for_csv)} label rows'
    )

# Columns of the result: the t-SNE coordinates first, then export_cols in order.
labeled_reduced = np.concatenate(
    (reduced, misconduct_for_csv[export_cols]),
    axis=1
)

In [6]:
# export to Tableau
# hosted at https://public.tableau.com/profile/luke.persola#!/vizhome/misconduct_3/MisconductTSNE

# One full-precision float format per t-SNE coordinate column, then one plain
# string format per label column. Deriving the counts from the arrays keeps the
# format in step with n_components and export_cols instead of hard-coding them.
n_coord_cols = reduced.shape[1]
n_label_cols = labeled_reduced.shape[1] - n_coord_cols
column_formats = ['%.18e'] * n_coord_cols + ['%s'] * n_label_cols

# The per-column formats are joined with the delimiter, giving the same row
# layout as a single comma-separated format string. The encoding is set
# explicitly so text outside the default encoding does not fail or vary by
# environment.
np.savetxt(
    'TSNE.csv',
    labeled_reduced,
    delimiter=',',
    fmt=column_formats,
    encoding='utf-8'
)