In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import itertools
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Provide column names as they are not specified in the file
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'label'
]

# Read the file with specified column names and specify that the age-information is not(!) the index
df = pd.read_csv("adult.data", header=None, names=column_names, index_col = False)

# Take a first look at the data
display(df.head(15))
print(df.shape)
display(df.info())

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


(32561, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  label           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


None

In [3]:
# The summary doesn't show missing values (all Non-Null Counts are equal to the number of rows given in .shape. 
# But from .head() we can see there are entries featuring '?'. Let's encode them with NaN for further analysis.
df.replace(['?', ' ?', '? ', '  ?  '], np.nan, inplace=True)

# See if there are missing values now. 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  31978 non-null  object
 14  label           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
# now we see that in the columns 'workclass', 'occupation', 'native-country' there are missing values. 
# as we don't want to throw away approx. 2000 data points, we can replace the missing values with the most common value (mode) as all three are categorical data
for col in ['workclass', 'occupation', 'native_country']:
    df[col].fillna(df[col].mode()[0], inplace=True)

# lets check again for missing values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
label             0
dtype: int64

In [5]:
#Preprocessing




# split the df in the data and target label
df_target = df['label']
df_data = df.drop(columns = 'label')

# Preprocess the label - Using LabelEncoder
label_encoder = LabelEncoder()
df_target = label_encoder.fit_transform(df_target)
label_names = ['<=50k', ' >50k']
label_order=[0,1]

# classify the data features (data-type)
numeric_features = ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week', 'education_num']
# we leave out the feature education - reason will follow 
categorical_features = ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

''' Education would be a features we want to encode ordinally as the differences make a real sense
(bigger difference between Preschool and Masters as between Bachelor and Masters).
But there is a feature called 'education-num' that may already do that. Lets investigate further.'''

# verify if there is only one specific 'education-num'-value for each unique value of education
unique_combinations_count = df.groupby(['education', 'education_num']).size().reset_index(name='Count')
print(unique_combinations_count)

''' As we can see in the table below, there is just one combination for each of the two values.
Also the numeration is in the correct order (least education (1 - Preschool) to most education
(16 - Doctorate). So we don't need to encode it manually and drop the 'education' column due to 
redundant data.'''

df_data = df_data.drop(columns = 'education')

# now we can create our train and test split
data_train, data_test, target_train, target_test = train_test_split(
    df_data, df_target, test_size=0.2, random_state=42, stratify=df_target)





# now we can define our pipeline 

# first define a columnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features), 
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# now pipeline with ColumnTransformer, Balancer (RandomOverSampler), and estimator - just an example!
pipeline = Pipeline([
    ('preprocessing', preprocessor), 
    ('balancing', RandomOverSampler()), 
    ('estimator', None)
])

# now create Folds for Cross-Validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)



        education  education_num  Count
0            10th              6    933
1            11th              7   1175
2            12th              8    433
3         1st-4th              2    168
4         5th-6th              3    333
5         7th-8th              4    646
6             9th              5    514
7      Assoc-acdm             12   1067
8       Assoc-voc             11   1382
9       Bachelors             13   5355
10      Doctorate             16    413
11        HS-grad              9  10501
12        Masters             14   1723
13      Preschool              1     51
14    Prof-school             15    576
15   Some-college             10   7291


In [8]:
# after preprocessing and prep, get a first impression on the performance of the classifiers

# as scorer we define accuracy
scoring_metric = 'accuracy'

# we want to test the models from the exercises with their basic parameters

# Create classifiers and save them in a list

cnb_classifier = ComplementNB()
cnb_classifier.fit(data_train, target_train)

# Predict on the test set
y_pred = cnb_classifier.predict(data_test)

# Evaluate the classifier
accuracy = accuracy_score(target_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print(classification_report(target_test, y_pred))

ValueError: could not convert string to float: ' Private'