# Imports

In [None]:
import pandas as pd
import lexis_functions as lf
import datetime as dt

from scipy.sparse.csgraph import connected_components

# Read in / Clean Data

- 5,391 participants
- 4,995 (92%) report no jobs

In [None]:
# Load the LexisNexis employment extract, indexed by the participant key.
# The path is assembled from adjacent raw-string pieces so that no line break
# (or stray doubled backslash) ends up inside the string literal.
df = pd.read_csv(
    r"C:\Users\jc4673\Documents\CHS_Lexis\LexisNexis\LN_NOID_DATA\original"
    r"\LN_Output_Employment_LN_InputLexisNexisCHSParticipantsNS.Dataset.csv",
    index_col='ssn_altkey',
)

In [None]:
# Align the column naming with the other files: for every 'pawk' column, move
# the two characters at positions 4-5 (the chronological number) to the end
# of the name. All other column names are kept unchanged.
cols = [
    name[:4] + name[6:] + name[4:6] if 'pawk' in name else name
    for name in df.columns.tolist()
]

# Work on a copy so the originally loaded frame keeps its column names for now
df_time = df.copy()
df_time.columns = cols

In [None]:
# Number of rows holding at least one non-missing value from the fifth column onward
int(df_time.iloc[:, 4:].notna().any(axis=1).sum())

In [None]:
# Total number of participants (one row each in df_time)
df_time.shape[0]

In [None]:
# Number of participants with no job: rows where every value from the fifth
# column onward is missing
int(df_time.iloc[:, 4:].isna().all(axis=1).sum())

# Intervals of Employment
lex_employmentstatus

In [None]:
# Preview the first rows of the renamed frame before it is reshaped
df_time.head()

In [None]:
# Keep only the death year plus every first/last "seen" date column
seen_columns = [name for name in df_time.columns if '_seen' in name]
my_cols = ['yrdeath'] + seen_columns

# Move ssn_altkey out of the index into a regular column so it can be used as
# the de-duplication key below
df_time = df_time[my_cols].reset_index(drop=False)

# Duplicates present for some reason: keep one row per ssn_altkey
df_time = df_time.drop_duplicates(subset='ssn_altkey')

In [None]:
# Reshape from one row per participant to one row per (ssn_altkey, num),
# where num is the numeric suffix of the 'pawk_*_seen_' columns
date_stubs = ['pawk_last_seen_', 'pawk_first_seen_']
df_long = pd.wide_to_long(df_time, date_stubs, i='ssn_altkey', j='num')
df_long = df_long.sort_index()

# Keep only the entries for which both dates are present
df_long = df_long.dropna(subset=date_stubs)

# Positional rename - assumes the columns come back as yrdeath, last seen,
# first seen (TODO confirm against the pandas version in use)
df_long.columns = ['death', 'last_seen_date', 'first_seen_date']

In [None]:
# Cleaning: convert dates (via lf.convert_all_dates), remove exact duplicate
# rows, then discard records whose last and first seen dates are identical
df_long = lf.convert_all_dates(df_long)
df_long = df_long.drop_duplicates()

employment_span = df_long['last_seen_date'] - df_long['first_seen_date']
df_long = df_long[employment_span != dt.timedelta(days=0)]

In [None]:
# Display the cleaned long-format date ranges for inspection
df_long

In [None]:
def reductionFunction(data):
    """Collapse overlapping date ranges in ``data`` into merged intervals.

    Two rows are linked when their [first_seen_date, last_seen_date] ranges
    overlap (ranges that merely touch at an endpoint count as overlapping).
    Rows that are linked directly or through a chain of other rows end up in
    the same group.

    Parameters
    ----------
    data : DataFrame with ``first_seen_date``, ``last_seen_date`` and ``num``
        columns.

    Returns
    -------
    DataFrame with one row per group of linked ranges, indexed by the group
    label, holding the earliest ``first_seen_date``, the latest
    ``last_seen_date`` and the first ``num`` of the group.
    """
    first_seen = data['first_seen_date'].values
    last_seen = data['last_seen_date'].values

    # overlaps[i, j] is True when range j starts no later than range i ends
    # and ends no earlier than range i starts
    starts_in_time = first_seen[None, :] <= last_seen[:, None]
    ends_in_time = last_seen[None, :] >= first_seen[:, None]
    overlaps = starts_in_time & ends_in_time

    # Label every row with the connected component it belongs to
    _, component_labels = connected_components(overlaps)

    # One output row per component: widest span plus the first job number
    how_to_merge = {
        'first_seen_date': 'min',
        'last_seen_date': 'max',
        'num': 'first',
    }
    return data.groupby(component_labels).agg(how_to_merge)

In [None]:
# Merge overlapping ranges separately for each participant; ssn_altkey and num
# are turned into regular columns first so they can be grouped on / aggregated
ranges_by_person = df_long.reset_index(drop=False).groupby('ssn_altkey')
df_long_reduced = ranges_by_person.apply(reductionFunction)

# The job number is no longer needed once the ranges have been merged
df_long_reduced = df_long_reduced.drop(columns='num')

In [None]:
# Number of participants that have dates connected to them: distinct values
# in the outer level of the result index
df_long_reduced.index.get_level_values(0).nunique()

### Date Consistency

93 of the 396 persons who report jobs (23%) report dates for those jobs. This corresponds to 1.7% of total participants.

### Write to a CSV

In [None]:
# Export the merged employment intervals to the derived-data folder
employment_range_path = r"C:\Users\jc4673\Documents\CHS_Lexis\LexisNexis\LN_NOID_DATA\derived\employment_range_derived.csv"
df_long_reduced.to_csv(employment_range_path)

# Total Number of Jobs
lex_numberofjobs_c

In [None]:
# Apply the renamed column labels to the originally loaded frame as well
df.columns = cols

# Build one sub-frame per job number 1-5; each holds every column whose name
# contains that digit
all_subs = [
    df[[name for name in df.columns.tolist() if digit in name]]
    for digit in '12345'
]
g1, g2, g3, g4, g5 = all_subs

In [None]:
def _job_present_flags(sub):
    """Return a one-column frame flagging rows of ``sub`` that hold any truthy value.

    The row-wise ``any()`` result is normalised in place: entries equal to
    False become 0 and every remaining non-zero entry becomes 1.
    """
    flags = sub.apply(lambda row: row.any(), axis=1)
    flags[flags == False] = 0
    flags[flags != 0] = 1
    return flags.to_frame()


# Find whether any values exist for the 1st-5th job: append one 0/1 column
# per job sub-frame, left to right
final = pd.DataFrame()
for sub in all_subs:
    final = pd.concat([final, _job_present_flags(sub)], axis=1)

In [None]:
# Add up the per-job flags across each row to get the number of jobs per
# participant, then label the result and turn it into a one-column frame
final_sum = (
    final.apply(sum, axis=1)
    .rename('lex_numberofjobs_c')
    .to_frame()
)

### Write to CSV

In [None]:
# Write the per-participant job counts (lex_numberofjobs_c) to disk.
# NOTE(review): this path has no "CHS_Lexis" folder, unlike the input path and
# the employment-range output path in this notebook - confirm the destination.
final_sum.to_csv(r"C:\Users\jc4673\Documents\LexisNexis\LN_NOID_DATA\derived\number_of_jobs.csv")