# Project 3 Raw to clean

## First, Imports and Reading Data

In [None]:
import itertools
import numpy as np
import pandas as pd 
from numbers import Number
from scipy import stats
from matplotlib.colors import ListedColormap
import warnings
warnings.filterwarnings('ignore')

import pickle

In [None]:
# Load the raw CSV into the working frame; the bare `df` renders it as the cell output.
raw_csv_path = 'NPDB2401.csv'
df = pd.read_csv(raw_csv_path)
df

### Get only Modern Day Malpractice Payment 

In [None]:
# Keep only the rows whose RECTYPE is 'P'.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells modify this frame unambiguously instead of a
# selection taken from the raw data (the pattern pandas reports as
# SettingWithCopyWarning, which warnings.filterwarnings('ignore') would hide).
df = df.loc[df['RECTYPE'] == 'P'].copy()
df

### Get the number of years the malpractice record has gone on

In [None]:
# MALTIME is MALYEAR2 - MALYEAR1 when that difference is positive, otherwise 0.
# A missing MALYEAR2 is treated as 0, so those rows always end up with MALTIME 0.
end_year = df['MALYEAR2'].fillna(0)
elapsed = end_year - df['MALYEAR1']
# Non-positive and undefined (NaN) differences both collapse to 0.
df['MALTIME'] = elapsed.where(elapsed > 0, 0).astype('int64')
df['MALTIME']

### Let's drop the columns with too many nulls (10% is way too much)

In [None]:
# A column is flagged when its null count exceeds 10% of 509608.
# NOTE(review): 509608 is hard-coded, presumably the row count when this was
# written; confirm it still matches len(df) for the file being processed.
null_limit = 509608*.1
null_counts = df.isnull().sum()
null_col = [name for name, n_missing in null_counts.items() if n_missing > null_limit]
null_col

### SEQNO and RECTYPE are useless. I do not need IDs or columns that hold a single value

In [None]:
# Remove the mostly-null columns plus the ID, record-type and raw year columns,
# then discard every row that still contains a null.
unwanted = null_col + ['SEQNO', 'RECTYPE', 'MALYEAR1', 'MALYEAR2']
df = df.drop(columns=unwanted).dropna()
df

## Time to deal with the payments 

### Objectives
   1. Fill in TOTALPMT as PAYMENT
   2. Account for inflation in PAYMENT

### Objective 1 - Fill in TOTALPMT as PAYMENT

In [None]:
# PAYMENT takes the values of TOTALPMT, and TOTALPMT is then removed.
df = df.assign(PAYMENT=df['TOTALPMT']).drop(columns='TOTALPMT')
df

### Objective 2 - Account for inflation in PAYMENT

#### What years are in the data frame 

In [None]:
# Distinct ORIGYEAR values, in order of first appearance.
pd.unique(df['ORIGYEAR'])

### Inflation for each year and a function that will help with that

In [None]:
def inflation(years, amounts):
    """Scale each amount by the inflation multiplier of its year.

    Rows are paired by position (first year with first amount, and so on),
    not by index label.

    Args:
        years: pandas Series of calendar years; every value must be one of
            the years in the multiplier table (2004-2024).
        amounts: pandas Series of numeric amounts, same length as ``years``.

    Returns:
        A list with one entry per row: the amount times its year's multiplier.

    Raises:
        KeyError: if any year has no entry in the multiplier table.
        ValueError: if ``years`` and ``amounts`` differ in length.
    """
    # Multiplier per year; 2024 maps to 1.00, so this presumably converts to
    # 2024 dollars (the source of the figures is not recorded here).
    ref_year = {
        2004: 1.66, 2005: 1.61, 2006: 1.56,
        2007: 1.51, 2008: 1.46, 2009: 1.46,
        2010: 1.44, 2011: 1.40, 2012: 1.37,
        2013: 1.35, 2014: 1.33, 2015: 1.33,
        2016: 1.31, 2017: 1.28, 2018: 1.25,
        2019: 1.23, 2020: 1.21, 2021: 1.16,
        2022: 1.07, 2023: 1.03, 2024: 1.00,
    }
    if len(years) != len(amounts):
        raise ValueError(
            f"years and amounts must have the same length "
            f"({len(years)} != {len(amounts)})"
        )

    # One vectorized lookup instead of a Python-level .iloc call per row.
    factors = years.map(ref_year)

    # Series.map leaves NaN where the table has no entry; report those years
    # explicitly rather than letting NaN leak into the payments.
    unknown = factors.isna().to_numpy()
    if unknown.any():
        missing = years[unknown].drop_duplicates().tolist()
        raise KeyError(f"no inflation multiplier for year(s): {missing}")

    # Multiply the underlying arrays so pairing is positional, not by index.
    return (amounts.to_numpy() * factors.to_numpy()).tolist()


#### Parse PAYMENT into ints

In [None]:
# Drop the first character of every PAYMENT string and parse the rest as an integer.
digits = df['PAYMENT'].str[1:]
df['PAYMENT'] = digits.astype('int64')
df['PAYMENT']

In [None]:
# Replace PAYMENT with the inflation-adjusted amounts, keyed on each row's ORIGYEAR.
adjusted = inflation(df['ORIGYEAR'], df['PAYMENT'])
df = df.assign(PAYMENT=adjusted)
df['PAYMENT']

### Next, are there any other useless columns?

### Most of the columns are categorical, so how many categories does each column have?

In [None]:
# Number of distinct values per column (a null, if present, counts as a value).
col_ref = {name: df[name].nunique(dropna=False) for name in df.columns}
col_ref

### Useless Columns 

   - PRACTNUM is an id for practitioner
   - ACCRRPTS has only one unique value
   - ORIGYEAR is the year, which is irrelevant to new data.

In [None]:
# Remove the three columns listed as useless above.
redundant = ['PRACTNUM', 'ACCRRPTS', 'ORIGYEAR']
df = df.drop(columns=redundant)
df

### Let's rename and add columns

In [None]:
# Rename the two columns to the names used from here on.
new_names = {'REPTYPE' : 'ISINSURE', 'FUNDPYMT' : 'STATEFUND'}
df = df.rename(new_names, axis='columns')

### WKNLICEQ - work (WK) and (N) License (LIC) states are the same (EQ) 

In [None]:
# Recode ISINSURE: 101 becomes 1 and 102 becomes 0 (any other code becomes NaN).
insure_codes = {101 : 1, 102 : 0}
# 1 where WORKSTAT and LICNSTAT hold the same value, else 0.
same_state = df['WORKSTAT'] == df['LICNSTAT']
df['ISINSURE'] = df['ISINSURE'].map(insure_codes)
df.insert(3, 'WKNLICEQ', same_state.astype(int))
df

In [None]:
# Write the cleaned frame to disk without the index column.
clean_csv_path = 'NPDB2401_Modern_Malpractice_Clean.csv'
df.to_csv(clean_csv_path, index=False)