## ACS Data for ML - 


### "Adult"
 One of the most widely used default datasets is a 1994 ACS dataset used to predict whether an adult earned more than $50k per year.

 [Adult ACS Dataset on UCI Repository](https://archive.ics.uci.edu/dataset/2/adult)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [None]:
try: from ucimlrepo import fetch_ucirepo 
except ImportError: 
    !pip install ucimlrepo
    from ucimlrepo import fetch_ucirepo


In [None]:
# Fetch the Adult dataset (UCI repository id 2).
adult = fetch_ucirepo(id=2)

# Keep the feature table and the target table under the usual X / y names.
X, y = adult.data.features, adult.data.targets

# Print the dataset metadata first, then the per-variable information.
for description in (adult.metadata, adult.variables):
    print(description)


In [None]:
# Preview the first five rows of the feature table.
adult.data.features.head(n=5)

In [None]:
# Alternatively, read the raw CSV file straight from the UCI archive.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Column names are supplied here and passed to read_csv via `names`.
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# Fields are separated by a comma followed by a space. skipinitialspace=True
# strips that space while keeping the default C parser; a multi-character
# `sep` is interpreted as a regular expression and forces the slower
# Python engine.
adultuci = pd.read_csv(url, names=column_names, skipinitialspace=True)
adultuci.head()

In [None]:

# Treat the '?' placeholder as a missing value.
adultuci = adultuci.replace('?', np.nan)

# Remove every row that contains a missing value, reporting the shape
# before and after so the number of dropped rows is visible.
print("Shape before drop ",  adultuci.shape)
adultuci = adultuci.dropna()
print("Shape after drop ",  adultuci.shape)


In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so the indicator columns are not redundant.
adultuci = pd.get_dummies(data=adultuci, drop_first=True)
adultuci.head(n=5)

In [None]:
# Split the encoded table into features (X) and the >50K income indicator (y).
X = adultuci.drop(columns='income_>50K')
y = adultuci['income_>50K']


## Aside on missing values
Instead of dropping rows with missing values, they can be filled in with `SimpleImputer` from `sklearn`.
Its `strategy` options are:

mean: Replaces missing values using the mean of the column. This strategy is only applicable to numerical data.

median: Replaces missing values using the median of the column. This strategy can be more robust than the mean, as it is less affected by outliers and is applicable to numerical data.

most_frequent: Replaces missing values using the mode (the most frequent value) of the column. This strategy can be used with both numerical and categorical (including string or object) data.

constant: Replaces missing values with a constant value that you specify through the fill_value parameter. This strategy can be used with both numerical and categorical data.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train

In [None]:
# Standardise the features, then fit a logistic regression on the training split.
model = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_train, y_train)

# Predicted labels for the held-out rows.
yhat = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels against the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=yhat)
cm

In [None]:
# Wrap the confusion matrix in a labelled DataFrame so it reads on its own.
actual_labels = ["Actual Negative", "Actual Positive"]
predicted_labels = ["Predicted Negative", "Predicted Positive"]
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)

print(cm_df)

In [None]:
# Draw the labelled confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm_df,
    annot=True,
    fmt="d",
    cmap="Blues",
    annot_kws={"size": 16},
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Report accuracy, precision, recall and F1 on the held-out predictions,
# one line per metric.
for metric_label, metric_fn in (
    ("Accuracy Score", accuracy_score),
    ("Precision Score", precision_score),
    ("Recall Score", recall_score),
    ("F1 Score", f1_score),
):
    print(metric_label, metric_fn(y_test, yhat))


In [None]:
# Tabulate the fitted logistic-regression coefficients per feature and rank
# them by magnitude, largest first.

# First row of coef_ from the logistic-regression step of the pipeline.
coefficients = model.named_steps['logisticregression'].coef_[0]

# NOTE(review): the 'Standard Deviation' column holds (per-column std of
# X_train) * coefficient, not a standard deviation of the coefficients, and
# the model is fitted after StandardScaler — confirm this is the intended quantity.
feature_spread = np.std(X_train, axis=0)

coefficients_df = pd.DataFrame(
    {
        'Variable': X.columns,
        'Coefficient': coefficients,
        'Standard Deviation': feature_spread * coefficients,
        "Absolute Coefficient": np.abs(coefficients),
    }
).sort_values(by='Absolute Coefficient', ascending=False)
coefficients_df

