#### Loading Necessary Libraries
---

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

### Loading and Cleaning Data
---

In [None]:
# read the UCI adult file with header=None (no row is consumed as a header)
# and parse the " ?" placeholder as NaN
missing_values = [" ?"]
data = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    na_values=missing_values,
)

In [None]:
data.head()

# column names, listed in the UC Irvine ML dataset column order
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]
# replace the default integer column labels with the names above
data.columns = columns
data.head()

### Calculating Summary Statistics
---

#### Part I
Question 1.

In [None]:
# summary statistics of all continuous variables
for c in data.columns:
    # describe() is only shown for real-valued numeric columns
    if pd.api.types.is_any_real_numeric_dtype(data[c]):
        print(f"{c} summary:")
        display(data[c].describe())

In [None]:
# unique (education, education-num) pairs, ordered by education-num
(
    data.loc[:, ['education', 'education-num']]
    .sort_values('education-num')
    .drop_duplicates()
)

Question 2.

In [None]:
# calculating unique value frequencies for discrete features
for c in data.columns:
    # every column that is not real-valued numeric is treated as discrete
    if not pd.api.types.is_any_real_numeric_dtype(data[c]):
        print(f"{c} unique value counts:")
        display(data[c].value_counts())

### Visualizations
---

Question 3.

In [None]:
# income bracket frequencies drawn as a bar chart
income_counts = data['income'].value_counts()
income_counts.plot.bar(
    xlabel='Income Bracket',
    ylabel="Frequency",
    title='Income Bracket Distribution',
    grid=True,
)
plt.show()

Question 4.

In [None]:
# distributions of age, education level and hours worked per week
data['age'].plot.hist(xlabel='age', ylabel="frequency", title='Age Distribution', grid=True)
plt.show()

# one bar per education-num value, in ascending order of the code
edu_counts = data['education-num'].value_counts().sort_index()
edu_bar = edu_counts.plot.bar(
    xlabel='level',
    ylabel="frequency",
    title='Education Level Distribution',
    grid=True,
    width=0.8,
)
# tick labels: education names in order of first appearance after sorting by education-num
# (assumes each education-num corresponds to exactly one education label -- TODO confirm)
edu_labels = data[['education', 'education-num']].sort_values(by='education-num')['education'].unique()
edu_bar.set_xticklabels(edu_labels)
plt.show()

data['hours-per-week'].plot.hist(
    xlabel='hours',
    ylabel="frequency",
    title='Hours per Week Distribution',
    grid=True,
    bins=10,
)
plt.show()

Question 5.

In [None]:
# scatter of age against weekly hours worked
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(data['age'], data['hours-per-week'])
ax.set(xlabel="age", ylabel="hours worked", title="age vs hours worked")
plt.show()


In [None]:
# scatter of age against capital-gain
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(data['age'], data['capital-gain'])
ax.set(xlabel="age", ylabel='capital gained (USD)', title="age vs capital-gain")
plt.show()


In [None]:
# scatter of weekly hours worked against capital-gain
fig, ax = plt.subplots()
fig.set_size_inches(10,6)

ax.scatter(x=data['hours-per-week'], y=data['capital-gain'])
ax.set_xlabel("hours worked")
ax.set_ylabel('capital gained (USD)')
# title spelling corrected ("Captial" -> "Capital")
ax.set_title("Hours Worked Per Week vs Annual Capital Gained")
plt.show()

Question 6.

In [None]:
# put hours, education level and age on a common scale so the parallel diagram is more readable
def _zscore(series):
    """Subtract the series mean and divide by the series standard deviation."""
    return (series - series.mean()) / series.std()


stdzd_hrs = _zscore(data['hours-per-week'])
stdzd_education = _zscore(data['education-num'])
stdzd_age = _zscore(data['age'])

# standardized features plus the unscaled 'sex' column used to colour the lines
stdzd_data = pd.DataFrame({
    'age': stdzd_age,
    'education-num': stdzd_education,
    'hours_worked': stdzd_hrs,
    'sex': data['sex'],
})

In [None]:
# parallel-coordinates plot of the standardized features, coloured by the 'sex' column
fig, ax = plt.subplots(figsize=(10, 6))
ax = pd.plotting.parallel_coordinates(stdzd_data, 'sex', color=['blue', 'red'], ax=ax)
ax.set_title("Male vs Female Census Metrics")
# friendlier names for the three plotted feature axes
ax.set_xticklabels(['age', 'education level', 'hours worked'])
plt.show()

### Handling Missing Data
--- 

#### Part II

In [None]:
# columns that contain at least one NaN
has_nan = data.isna().any()
nan_attributes = data.columns[has_nan]
nan_attributes

#### Feature distribution before filling in missing values

In [None]:
# bar charts of the value counts of each feature with missing values, before any filling
for feature, heading, axis_label, size in [
    ('workclass', 'Workclass', 'Workclass', (5, 4)),
    ('occupation', 'Occupation', 'Occupation', (5, 4)),
    ('native-country', 'Native Country', 'Country', (10, 4)),
]:
    data[feature].value_counts().plot.bar(
        title=f'{heading} Distribution Before Filling Missing Values',
        xlabel=axis_label,
        ylabel='Frequency',
        figsize=size,
        width=0.8,
    )
    plt.show()

#### Replacing NaN values with mode of attribute

Question 1.

In [None]:
# the features with missing entries are categorical, so each NaN is replaced by the feature's mode
workclass_mode = data['workclass'].mode().iloc[0]
occupation_mode = data['occupation'].mode().iloc[0]
nat_country_mode = data['native-country'].mode().iloc[0]

# fillna returns new Series; `data` itself is left untouched
nafilled_workclass = data['workclass'].fillna(workclass_mode)
nafilled_occupation = data['occupation'].fillna(occupation_mode)
nafilled_nat_country = data['native-country'].fillna(nat_country_mode)

# bar charts of the value counts after filling
for filled, heading, axis_label, size in [
    (nafilled_workclass, 'Workclass', 'Workclass', (5, 4)),
    (nafilled_occupation, 'Occupation', 'Occupation', (5, 4)),
    (nafilled_nat_country, 'Native Country', 'Country', (10, 4)),
]:
    filled.value_counts().plot.bar(
        title=f'{heading} Distribution After Filling Missing Values with Feature Mode',
        xlabel=axis_label,
        ylabel='Frequency',
        figsize=size,
        width=0.8,
    )
    plt.show()

#### Replacing NaN values with mode/mean based on attribute dtype and class value of NaN instance

Question 2.

In [None]:
# Impute missing values class by class: each NaN is replaced by a statistic of its
# column computed only over the rows that share the same value of `class_attr`.
def filna_by_class(data, class_attr):
    """Return a copy of ``data`` with NaNs filled using per-class column statistics.

    For every column other than ``class_attr`` that contains NaN, each missing
    entry is replaced by the mean (real-valued numeric columns) or the mode (all
    other columns) of that column, computed over the observed values of the rows
    that share the entry's ``class_attr`` value. When the mode is tied, the first
    value returned by ``Series.mode`` is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    class_attr : str
        Name of the column whose values define the classes.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the imputed values. An entry stays NaN when
        its row's class label is missing or when its class has no observed
        value for that column.
    """
    frame = data.copy(deep=True)
    class_labels = frame[class_attr]

    # the class column defines the groups, so it is never imputed itself
    nan_attributes = [c for c in frame.columns[frame.isna().any()] if c != class_attr]

    for c in nan_attributes:
        column = frame[c]
        missing = column.isna()
        use_mean = pd.api.types.is_any_real_numeric_dtype(column)

        # only classes that actually have a missing entry in this column need a fill value
        for label in class_labels[missing].dropna().unique():
            in_class = class_labels == label
            observed = column[in_class].dropna()
            if observed.empty:
                # nothing to impute from; leave these entries as NaN
                continue
            fill_value = observed.mean() if use_mean else observed.mode().iloc[0]
            # one vectorized assignment per (column, class) instead of one per cell
            frame.loc[missing & in_class, c] = fill_value

    return frame

In [None]:
# impute each missing entry from the rows that share its 'income' value
nonan_data = filna_by_class(data, class_attr='income')

In [None]:
# bar charts of the value counts after the per-class fill stored in nonan_data
for feature, heading, axis_label, size in [
    ('workclass', 'Workclass', 'Workclass', (5, 4)),
    ('occupation', 'Occupation', 'Occupation', (5, 4)),
    ('native-country', 'Native Country', 'Country', (10, 4)),
]:
    nonan_data[feature].value_counts().plot.bar(
        title=f'{heading} Distribution After Filling Missing Values with Feature Mode\nfor relevant class',
        xlabel=axis_label,
        ylabel='Frequency',
        figsize=size,
        width=0.8,
    )
    plt.show()