In [27]:
from ucimlrepo import fetch_ucirepo 
  
adult = fetch_ucirepo(id=2) 
  
X = adult.data.features 
y = adult.data.targets 
  
print(adult.metadata) 
  
print(adult.variables) 

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
X.isnull().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    274
dtype: int64

In [30]:
X.duplicated().sum()

np.int64(53)

In [31]:
y

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
48837,<=50K.
48838,<=50K.
48839,<=50K.
48840,<=50K.


In [32]:
X.describe().info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, count to max
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             8 non-null      float64
 1   fnlwgt          8 non-null      float64
 2   education-num   8 non-null      float64
 3   capital-gain    8 non-null      float64
 4   capital-loss    8 non-null      float64
 5   hours-per-week  8 non-null      float64
dtypes: float64(6)
memory usage: 448.0+ bytes


In [33]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [34]:
X.drop('education-num', axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop('education-num', axis=1, inplace=True)


In [35]:
categorical_columns = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']

In [36]:
dummies = pd.get_dummies(X[categorical_columns])
data = X.drop(categorical_columns, axis=1)
data = pd.concat([data, dummies], axis=1)

In [37]:
data.head()

Unnamed: 0,age,fnlwgt,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,2174,0,40,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,50,83311,0,0,13,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,38,215646,0,0,40,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
3,53,234721,0,0,40,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
4,28,338409,0,0,40,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [38]:
normalize_columns = ['age', 'fnlwgt', 'capital-gain','capital-loss','hours-per-week']

In [39]:
for column in normalize_columns:
    max_val = data[column].max()
    min_val = data[column].min()
    mean_val = data[column].mean()
    var_val = data[column].var()
    print(column + ': values=[' + str(min_val) + ',' + str(max_val) + '] , mean=' + str(mean_val) + ' , var=' + str(var_val))

age: values=[17,90] , mean=38.64358543876172 , var=187.97808266247543
fnlwgt: values=[12285,1490400] , mean=189664.13459727284 , var=11152210185.574848
capital-gain: values=[0,99999] , mean=1079.0676262233324 , var=55532588.035659194
capital-loss: values=[0,4356] , mean=87.50231358257237 , var=162412.66903295522
hours-per-week: values=[1,99] , mean=40.422382375824085 , var=153.5478850061782


In [40]:
from sklearn import preprocessing


scaler = preprocessing.StandardScaler()
data[normalize_columns] = scaler.fit_transform(data[normalize_columns])

In [41]:
y.value_counts()

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64

In [42]:
y['income'] = y['income'].str.replace('.', '', regex=False)

df_cleaned = y.groupby('income', as_index=False).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income'] = y['income'].str.replace('.', '', regex=False)


In [43]:
for column in y:
    unique_values = y[column].unique().tolist()
    dic = {}
    for indx, val in enumerate(unique_values):
      dic[val]=indx
    y[column] = y[column].map(dic).astype(int)
    print(column + " done!")

income done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[column] = y[column].map(dic).astype(int)


In [44]:
y.value_counts()

income
0         37155
1         11687
Name: count, dtype: int64

In [45]:

from sklearn.model_selection import train_test_split

x_data = data
y_labels = y

X_train, X_test, y_train, y_test = train_test_split(x_data,y_labels,test_size=0.2,shuffle=True)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(39073, 107) (39073, 1)
(9769, 107) (9769, 1)


In [46]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report,accuracy_score

gb = GradientBoostingClassifier()

gb.fit(X_train, y_train)
pred1 = gb.predict(X_test)
print(classification_report(y_test, pred1))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.88      0.95      0.91      7438
           1       0.80      0.58      0.67      2331

    accuracy                           0.86      9769
   macro avg       0.84      0.77      0.79      9769
weighted avg       0.86      0.86      0.86      9769



In [47]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
pred2 = rf.predict(X_test)

  return fit_method(estimator, *args, **kwargs)


In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_test,pred2))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7438
           1       0.73      0.62      0.67      2331

    accuracy                           0.86      9769
   macro avg       0.81      0.78      0.79      9769
weighted avg       0.85      0.86      0.85      9769



In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
pred3 = knn.predict(X_test)


  return self._fit(X, y)


In [50]:
print(f"gradient boosting: {accuracy_score(y_test,pred1)}")
print(f"Random forest: {accuracy_score(y_test,pred2)}")
print(f"k neighbours: {accuracy_score(y_test,pred3)}")

gradient boosting: 0.8638550516941345
Random forest: 0.8557682464940116
k neighbours: 0.8331456648582249


##ensembling


In [51]:
from sklearn.ensemble import VotingClassifier

hard_voting = VotingClassifier(estimators=[('gb', gb),
                                            ('rf', rf),
                                             ('knn', knn)], voting='hard')
hard_voting.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [52]:
y_pred_meta = hard_voting.predict(X_test)

In [54]:
print(f'Hard: {accuracy_score(y_test, y_pred_meta)}')

Hard: 0.8594533729143208


In [55]:
soft_voting = VotingClassifier(estimators=[('gb', gb),
                                            ('rf', rf),
                                             ('knn', knn)],
                               voting='soft',
                               weights=[0.2, 2, 1])
soft_voting.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [56]:
y_pred_meta = soft_voting.predict(X_test)

In [57]:
print(f'Soft: {accuracy_score(y_test, y_pred_meta)}')

Soft: 0.8553587880028662


In [65]:
modellar=[('gb', gb),('rf', rf),('knn', knn)]

In [66]:
# Stacking
from sklearn.ensemble import StackingClassifier

stacking_cl = StackingClassifier(estimators=modellar, final_estimator=rf, cv=3)

In [67]:
print(stacking_cl.named_estimators)

{'gb': GradientBoostingClassifier(), 'rf': RandomForestClassifier(random_state=42), 'knn': KNeighborsClassifier()}


In [68]:
stacking_cl.fit(X_train, y_train)

y_pred_meta = stacking_cl.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [69]:
print(classification_report(y_test, y_pred_meta))

              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7438
           1       0.65      0.61      0.63      2331

    accuracy                           0.83      9769
   macro avg       0.76      0.75      0.76      9769
weighted avg       0.83      0.83      0.83      9769

