<a href="https://colab.research.google.com/github/Soul2018/DeepLearning/blob/main/DeepLearning_with_keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [216]:
!pip install --upgrade scikit-learn
!pip install --upgrade scikeras



In [217]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import RocCurveDisplay
from keras.layers import Input
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from sklearn.pipeline import Pipeline

# Download the dataset and load it into a pandas DataFrame

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')

# Display the first few rows of the DataFrame
print(df.head(5))

   39          State-gov   77516   Bachelors   13        Never-married  \
0  50   Self-emp-not-inc   83311   Bachelors   13   Married-civ-spouse   
1  38            Private  215646     HS-grad    9             Divorced   
2  53            Private  234721        11th    7   Married-civ-spouse   
3  28            Private  338409   Bachelors   13   Married-civ-spouse   
4  37            Private  284582     Masters   14   Married-civ-spouse   

         Adm-clerical   Not-in-family   White     Male   2174   0   40  \
0     Exec-managerial         Husband   White     Male      0   0   13   
1   Handlers-cleaners   Not-in-family   White     Male      0   0   40   
2   Handlers-cleaners         Husband   Black     Male      0   0   40   
3      Prof-specialty            Wife   Black   Female      0   0   40   
4     Exec-managerial            Wife   White   Female      0   0   40   

    United-States   <=50K  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K 

In [218]:
# Do some exploratory analysis. How many rows/columns are there? How are NULL
# values represented? What's the percentrage of positive cases in the dataset?

print(df.shape)

print(df.isnull().sum())

#let's name the column target as income

df.rename(columns = {df.columns[-1]: 'income'}, inplace = True)

positive_cases = df[df['income'] == '<=50K'].shape[0]
total_cases = df.shape[0]
percentage_positive = (positive_cases/total_cases) * 100
print(f"Percentage of positive cases: {percentage_positive}%")

(32560, 15)
39                0
 State-gov        0
 77516            0
 Bachelors        0
 13               0
 Never-married    0
 Adm-clerical     0
 Not-in-family    0
 White            0
 Male             0
 2174             0
 0                0
 40               0
 United-States    0
 <=50K            0
dtype: int64
Percentage of positive cases: 0.0%


In [219]:
# Find all NULL values and drop them
df.dropna(inplace=True)


In [220]:
# Use Scikit-Learn's LabelEncoder to convert the income column with a data type
# string to a binary variable.

label_encoder = LabelEncoder()
df['income'] = label_encoder.fit_transform(df['income'])

print(df.head(20))

    39          State-gov   77516      Bachelors   13           Never-married  \
0   50   Self-emp-not-inc   83311      Bachelors   13      Married-civ-spouse   
1   38            Private  215646        HS-grad    9                Divorced   
2   53            Private  234721           11th    7      Married-civ-spouse   
3   28            Private  338409      Bachelors   13      Married-civ-spouse   
4   37            Private  284582        Masters   14      Married-civ-spouse   
5   49            Private  160187            9th    5   Married-spouse-absent   
6   52   Self-emp-not-inc  209642        HS-grad    9      Married-civ-spouse   
7   31            Private   45781        Masters   14           Never-married   
8   42            Private  159449      Bachelors   13      Married-civ-spouse   
9   37            Private  280464   Some-college   10      Married-civ-spouse   
10  30          State-gov  141297      Bachelors   13      Married-civ-spouse   
11  23            Private  1

In [221]:
# Split dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1), df['income'], test_size=0.2, random_state=42)

In [222]:
# Use Scikit-Learn's roc_auc_score to calculate the AUC score for a method that
# always predicts the majority class.

majority_class = y_train.value_counts().idxmax()
y_pred_majority = [majority_class] * len(y_test)
auc_score = roc_auc_score(y_test, y_pred_majority)

print(f"AUC score for always predicting the majority class: {auc_score}")

AUC score for always predicting the majority class: 0.5


In [223]:
# Use Scikit-Learn's ColumnTransformer to apply One Hot Encoding to the
# categorical variables in workclass, education, marital-status, occupation,
# relationship, 'race', sex, and native-country.
#Also, apply MinMaxScaler to the remaining continuous features.

#let's name non-contiguous columns first
new_columns_name = {0: 'age', 1: 'workclass', 3: 'education', 5: 'marital-status', 6: 'occupation', 7: 'relationship', 8: 'race', 10: 'sex',
                    12: 'hours-per-week', 13: 'native-country'}

df.columns = [new_columns_name.get(i, col) for i, col in enumerate(df.columns)]

categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

continous_features = ['age', 'hours-per-week']

#let's make sure that the data set is re-trained
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

# Ensure X_train has the expected column names
# It's possible column names have extra spaces - let's strip those off
X_train.columns = X_train.columns.str.strip()

# let's create the ColumnTransformer
preprocessorValue = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('cont', MinMaxScaler(), continous_features)
    ])

transformed_data = preprocessorValue.fit_transform(X_train)

# let's get feature names
feature_names = preprocessorValue.get_feature_names_out()

transformed_df = pd.DataFrame(transformed_data, columns=feature_names, index=X_train.index)

print(transformed_df.head(5))


       cat__workclass_ ?  cat__workclass_ Federal-gov  \
24639                0.0                          0.0   
5514                 0.0                          0.0   
19777                0.0                          0.0   
10781                0.0                          0.0   
32239                0.0                          0.0   

       cat__workclass_ Local-gov  cat__workclass_ Never-worked  \
24639                        0.0                           0.0   
5514                         0.0                           0.0   
19777                        0.0                           0.0   
10781                        0.0                           0.0   
32239                        0.0                           0.0   

       cat__workclass_ Private  cat__workclass_ Self-emp-inc  \
24639                      1.0                           0.0   
5514                       0.0                           0.0   
19777                      1.0                           0.0   
1078

In [224]:
# How many columns will the dataframe have after these columns transformations are applied?

column_counts = transformed_df.shape.__getitem__(1)
print(column_counts)


217


In [225]:
# Define the Keras model

current_model = Sequential()
current_model.add(Input(shape=(column_counts,)))
current_model.add(Dense(units=64, activation='relu'))
current_model.add(Dense(units=1, activation='sigmoid'))

# Compile the model
current_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [226]:
# Create a Keras classifier with Keras classifier wrapper

keras_classifier_wraaper = KerasClassifier(current_model, epochs=10, batch_size=32)

In [227]:
# Create the scikit-learn pipeline

pipeline_keras = Pipeline([
    ('preprocessor', preprocessorValue),
    ('classifier', keras_classifier_wraaper)
])


In [228]:
# Fit the pipeline on the training data

pipeline_keras.fit(X_train, y_train)

Epoch 1/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8118 - loss: 0.4039
Epoch 2/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8603 - loss: 0.3089
Epoch 3/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8598 - loss: 0.3007
Epoch 4/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8628 - loss: 0.2972
Epoch 5/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8662 - loss: 0.2932
Epoch 6/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8656 - loss: 0.2929
Epoch 7/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8677 - loss: 0.2879
Epoch 8/10
[1m814/814[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8666 - loss: 0.2856
Epoch 9/10
[1m814/814[0m [32m━━━━━━━━

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['workclass', 'education',
                                                   'marital-status',
                                                   'occupation', 'relationship',
                                                   'race', 'sex',
                                                   'native-country']),
                                                 ('cont', MinMaxScaler(),
                                                  ['age', 'hours-per-week'])])),
                ('classifier',
                 KerasClassifier(batch_size=32, epochs=10, model=<Sequential name=sequential_27, built=True>))])