In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header = None)

In [None]:
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',skiprows =1 ,header=None)

In [None]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                'marital_status', 'occupation','relationship', 'race', 'sex', 'capital_gain',
                'capital_loss', 'hours_per_week', 'native_country', 'wage_class']

In [None]:
train_set.columns = col_labels
test_set.columns = col_labels

In [None]:
train_set.head()

In [None]:
train_set.info()

In [None]:
train_set.isnull().sum()

In [None]:
plt.figure(figsize=(8,6))
sns.heatmap(train_set.isnull())

In [None]:
# Correlate the numeric columns only. DataFrame.corr() raises on string
# columns in pandas >= 2.0 (older versions silently dropped them), so the
# selection is made explicit and works on every version.
plt.figure(figsize=(8,6))
sns.heatmap(train_set.select_dtypes(include='number').corr(), annot=True)

In [None]:
# (rows, columns) of the training frame.
train_set.shape

In [None]:
# (rows, columns) of the test frame.
test_set.shape

In [None]:
# NaN count per column in the test frame.
test_set.isnull().sum()

In [None]:
data = pd.concat([train_set,test_set])

In [None]:
data.isnull().sum()

In [None]:
unique_values = [data[feature].unique() for feature in data if data[feature].dtype == 'object']

In [None]:
for value in unique_values:
    print(value)

In [None]:
data = data.replace(' ?', np.nan)

In [None]:
data.isnull().sum()

__Feature Engineering__

### Work class

In [None]:
data['workclass'].unique()

In [None]:
data['workclass'].value_counts()

In [None]:
data.replace(' Without-pay', ' Never-worked',inplace=True)

In [None]:
data['workclass'].value_counts()

In [None]:
data['workclass'].isnull().sum()
data['workclass'].fillna(0,inplace=True)

In [None]:
data['workclass'].isnull().sum()

In [None]:
data['workclass'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.countplot(x= data['workclass'])
plt.xticks(rotation = 90)

### Wage class

In [None]:
data['wage_class'].unique()

In [None]:
data.replace({' <=50K':0, ' >50K':1, ' <=50K.':0, ' >50K.':1},inplace=True)

In [None]:
plt.figure(figsize=(8,6))
sns.countplot(x= data['wage_class'])

### Fnlwgt

In [None]:
data['fnlwgt'].values

In [None]:
data['fnlwgt'].describe()

In [None]:
data['fnlwgt'] = data['fnlwgt'].apply(lambda x: np.log1p(x))

In [None]:
data['fnlwgt'].values

### Education

In [None]:
data['education'].unique()

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='education',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

In [None]:
def func_primary(x):
    """Collapse the school-grade labels ' 1st-4th' .. ' 12th' into 'Primary Edu.'.

    Any other value is returned unchanged. Labels are matched exactly,
    including their leading space.
    """
    grade_labels = (' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th',
                    ' 10th', ' 11th', ' 12th')
    return 'Primary Edu.' if x in grade_labels else x

In [None]:
data['education'] = data['education'].apply(func_primary)

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='education',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

### Marital Status

In [None]:
data['marital_status'].unique()

In [None]:
data['marital_status'].value_counts()

In [None]:
data['marital_status'].replace(' Married-civ-spouse', ' Married-AF-spouse',inplace=True)

In [None]:
data['marital_status'].value_counts()

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='marital_status',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

### Occupation

In [None]:
data['occupation'].unique()

In [None]:
data['occupation'].value_counts()

In [None]:
data['occupation'].isnull().sum()

In [None]:
data['occupation'].fillna(0,inplace=True)

In [None]:
data['occupation'].isnull().sum()

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='occupation',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

In [None]:
data['occupation'].replace(' Armed-Forces', 0, inplace=True)

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='occupation',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

### Relationship

In [None]:
# Distinct relationship labels.
data['relationship'].unique()

In [None]:
# Row count per relationship label.
data['relationship'].value_counts()

### Race

In [None]:
# Distinct race labels.
data['race'].unique()

In [None]:
# Row count per race label.
data['race'].value_counts()

### Sex

In [None]:
# Distinct sex labels.
data['sex'].unique()

In [None]:
# Row count per sex label.
data['sex'].value_counts()

### Country

In [None]:
# Distinct raw country labels, before they are grouped into regions.
data['native_country'].unique()

In [None]:
# Region buckets, keyed by the raw labels exactly as they occur in the data
# (every label keeps its leading space and original spelling).
_REGION_MEMBERS = {
    "North_America": [
        ' United-States', ' Canada',
        # Mexico is geographically North America; it was previously bucketed
        # as South_America.
        ' Mexico',
        # NOTE(review): US territories are grouped with the US here instead of
        # South_America - confirm this is the desired bucket.
        ' Outlying-US(Guam-USVI-etc)',
    ],
    "Central_America": [
        ' Puerto-Rico', ' El-Salvador', ' Cuba', ' Jamaica',
        ' Dominican-Republic', ' Guatemala', ' Haiti', ' Nicaragua',
        ' Trinadad&Tobago', ' Honduras',
    ],
    "South_America": [' Columbia', ' Peru', ' Ecuador'],
    "Europe": [
        ' Germany', ' England', ' Italy', ' Poland', ' Portugal', ' Greece',
        ' Yugoslavia', ' France', ' Ireland', ' Scotland', ' Hungary',
        ' Holand-Netherlands',
    ],
    "Asia": [
        ' India', ' Iran', ' China', ' Japan', ' Thailand', ' Hong',
        ' Cambodia', ' Laos', ' Philippines', ' Taiwan',
        # Vietnam was previously bucketed as South_America.
        ' Vietnam',
        # NOTE(review): ' South' is presumably South Korea - the label itself
        # does not say; confirm against the dataset description.
        ' South',
    ],
}

# Flat label -> region lookup, built once.
_REGION_BY_COUNTRY = {
    label: region
    for region, labels in _REGION_MEMBERS.items()
    for label in labels
}


def native_country(country):
    """Map a raw native-country label to a coarse region name.

    Parameters
    ----------
    country : object
        A raw label such as ' United-States' (leading space included), or a
        missing value.

    Returns
    -------
    object
        One of "North_America", "Central_America", "South_America", "Europe"
        or "Asia" for a known label; otherwise the input unchanged. Missing
        values and unrecognised labels therefore pass straight through.
    """
    # Non-string inputs (e.g. NaN for a missing country) are never in the table.
    if not isinstance(country, str):
        return country
    return _REGION_BY_COUNTRY.get(country, country)

In [None]:
data['native_country'] = data['native_country'].apply(native_country)

In [None]:
plt.figure(figsize=(8,6))
sns.catplot(x='native_country',y='wage_class',data=data,kind='bar')
plt.xticks(rotation=75)

__Train and Test data__

In [None]:
X = data.iloc[:,:-1]

In [None]:
X.head()

In [None]:
y = data.iloc[:,-1]

In [None]:
y.head()

In [None]:
X = pd.get_dummies(X)

In [None]:
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled_data = scaler.fit_transform(X)

In [None]:
X_scaled_data

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled_data, y, test_size=0.33, random_state=100)

In [None]:
print("Train:",X_train.shape)
print("Test:",X_test.shape)

In [None]:
pip install xgboost

__Training Model__

In [None]:
from xgboost import XGBClassifier
classifier = XGBClassifier()

In [None]:
classifier.fit(X_train, y_train)

In [None]:
y_predict = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
print('Accuracy Score: {}\n'.format(accuracy_score(y_test, y_predict)))
print('Confusion Matrix:\n {}\n'.format(confusion_matrix(y_test, y_predict)))
print('Classification Report: {}'.format(classification_report(y_test,y_predict)))

__Hyperparameter Tuning__

In [None]:
# Search space for the grid search.
# `eta` and `learning_rate` are two names for the same XGBoost setting.
# Gridding over both multiplied the number of candidates without adding any
# distinct model and left it unclear which of the two values took effect.
# Their candidate values are merged into a single `learning_rate` axis.
params = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1],
    'learning_rate': [0.3, 0.1, 0.05, 0.01, 0.005, 0.001],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8],
    'n_estimators': [10, 50, 100, 200],
}

In [None]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(classifier, params, scoring = 'accuracy', cv = 5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
classifier_hyper = XGBClassifier(colsample_bytree= 0.7, eta= 0.3,
                                 learning_rate= 0.01, max_depth=7,
                                    min_child_weight=1, n_estimators= 200,
                                     subsample= 0.8)

In [None]:
classifier_hyper.fit(X_train, y_train)

In [None]:
y_predict_hyper = classifier_hyper.predict(X_test)

In [None]:
print('Accuracy Score: {}\n'.format(accuracy_score(y_test, y_predict_hyper)))
print('Confusion Matrix:\n {}\n'.format(confusion_matrix(y_test, y_predict_hyper)))
print('Classification Report: {}'.format(classification_report(y_test,y_predict_hyper)))