# 1. <a id='Introduction'></a>Introduction

### This data was extracted from the 1994 Census Bureau database by Ronny Kohavi and Barry Becker (Data Mining and Visualization, Silicon Graphics). It has 15 columns and 32561 rows. Our goal is to predict whether a given adult has an income > 50K or <= 50K per year, based on the features in the other 14 columns. This is an example of a supervised machine learning problem.

# 2. <a id='importing'></a>Importing the necessary libraries

In [7]:
import pandas as pd
import numpy as np
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)
        
# Disable warnings
import warnings
warnings.filterwarnings("ignore")

# Import plotting modules
!pip install chart-studio
import seaborn as sns
sns.set()
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker
import plotly.express as px
from plotly.offline import iplot
from matplotlib import rcParams

import chart_studio.plotly as py
import plotly.graph_objs as go
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
%matplotlib inline

warnings.filterwarnings("ignore")
import plotly.figure_factory as ff
from colorama import Fore, Back, Style 

# Import encoder library
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder 

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

/kaggle/input/adult-census-income/adult.csv
Collecting chart-studio
  Using cached chart_studio-1.1.0-py3-none-any.whl (64 kB)
Installing collected packages: chart-studio
Successfully installed chart-studio-1.1.0
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


# 3. <a id='reading'></a>Reading the dataset.csv

In [8]:
# Load the Adult Census Income CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../input/adult-census-income/adult.csv')

In [9]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [10]:
# Blue banner, then reset the colour so the report below is printed normally.
print(Fore.BLUE + 'Data information ....................',Style.RESET_ALL)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
data.info()

[34mData information .................... [0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


# 4. <a id='basic'></a>Basic Data Exploration

### Starting by handling some data.

In [12]:
# Unknown entries are encoded as the string '?'; turn them into NaN so that
# pandas treats them as missing values.
datas = data.replace(to_replace='?', value=np.nan)
datas.head(n=4)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K


In [13]:
# Encode the target as a binary label: 0 for '<=50K', 1 for '>50K'.
income_codes = {'<=50K': 0, '>50K': 1}
datas['income'] = datas['income'].map(income_codes)

In [14]:
# Preview the first four rows of the transformed dataset.
datas.head(4)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0


In [16]:
# Show the raw values of the 'sex' column.
print(datas['sex'])

0    Female
1    Female
2    Female
3    Female
4    Female
Name: sex, dtype: object

# 5. <a id='details'></a>Data Exploration in Details

###  We can show the frequency of assignments for some columns.

In [None]:
# Frequency of each distinct value in the 'income' column.
datas['income'].value_counts()

In [None]:
# Frequency of each distinct value in the 'sex' column.
datas['sex'].value_counts()

In [None]:
# Frequency of each distinct value in the 'capital.gain' column.
datas['capital.gain'].value_counts()

In [None]:
# Frequency of each distinct value in the 'occupation' column.
datas['occupation'].value_counts()

In [None]:
# Frequency of each distinct value in the 'race' column.
datas['race'].value_counts()

In [None]:
# Count the missing (NaN) values in each column.
datas.isnull().sum()

### Plotting and analyzing some graphics 

In [None]:
# Bar chart of the number of individuals per value of the 'sex' column.
sex_counts = datas['sex'].value_counts()
sex_counts.iplot(
    kind='bar',
    title='Distribution of the genre column in the unity Set',
    yTitle='Counts',
    color='blue',
    linecolor='black',
    opacity=0.7,
    bargap=0.5,
    theme='pearl',
    gridcolor='white',
)

### There are more than twice as many men as women in the dataset.

In [None]:
# Bar chart of the number of individuals per value of the 'race' column.
race_counts = datas['race'].value_counts()
race_counts.iplot(
    kind='bar',
    title='Distribution of the race column in the unity Set',
    yTitle='Counts',
    color='blue',
    linecolor='black',
    opacity=0.7,
    bargap=0.5,
    theme='pearl',
    gridcolor='white',
)

### The vast majority of individuals in the dataset are white.

In [None]:
# Bar chart of the number of individuals per value of the 'education' column.
education_counts = datas['education'].value_counts()
education_counts.iplot(
    kind='bar',
    title='Distribution of the education column in the unity set',
    yTitle='Counts',
    color='blue',
    linecolor='black',
    opacity=0.7,
    bargap=0.5,
    theme='pearl',
    gridcolor='white',
)

### Most individuals fall into one of three education levels: high-school graduate, some college, and bachelor's degree.

In [None]:
# Scatter plot of age against weekly working hours, coloured by sex.
fig = px.scatter(
    data_frame=datas,
    x='hours.per.week',
    y='age',
    color='sex',
)
fig.show()

In [None]:
# Scatter plot of age against capital loss, coloured by sex.
fig = px.scatter(
    data_frame=datas,
    x='capital.loss',
    y='age',
    color='sex',
)
fig.show()

In [None]:
# Scatter plot of age against capital gain, coloured by sex.
fig = px.scatter(
    data_frame=datas,
    x='capital.gain',
    y='age',
    color='sex',
)
fig.show()

In [None]:
# Scatter plot of education level ('education.num') against weekly working
# hours, coloured by age.
fig = px.scatter(
    data_frame=datas,
    x='hours.per.week',
    y='education.num',
    color='age',
)
fig.show()

### According to the distribution of the data above, we see a concentration in the region that includes individuals who work 10 to 60 hours per week with an education level ranging from 4 to 12.

In [None]:
# Violin plot (with inner box plot) of education level per marital status,
# split by sex; every column is shown in the hover tooltip.
df = datas
fig = px.violin(
    data_frame=df,
    x='marital.status',
    y='education.num',
    color='sex',
    box=True,
    hover_data=datas.columns,
)
fig.show()

### We can see that the widowed and separated groups show the least dispersion of education level for both sexes and are therefore more homogeneous.

In [None]:
# Violin plot (with inner box plot) of education level per marital status,
# split by income class; every column is shown in the hover tooltip.
df = datas
fig = px.violin(
    data_frame=df,
    x='marital.status',
    y='education.num',
    color='income',
    box=True,
    hover_data=datas.columns,
)
fig.show()

### Individuals with an income greater than 50K show less dispersion in education level within every marital-status group.

In [None]:
# Plotting the pairwise correlation between the numeric features.

# Restrict to numeric columns explicitly: DataFrame.corr() on a frame that
# still holds string columns raises on pandas >= 2.0, whereas older releases
# silently dropped them. select_dtypes gives the same result on both.
corrmat = datas.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize =(12, 10))
sns.heatmap(corrmat, ax = ax, cmap = 'RdYlBu_r', linewidths = 0.5)

In [None]:
# 'fnlwgt' is reported as uncorrelated with the other columns, so it is
# removed from the frame in place.
datas.drop(columns=['fnlwgt'], inplace=True)


# 6. <a id='details'></a> Using machine learning to predict income

### Data pre-processing for machine learning algorithm

In [None]:
# Since only a very small amount of data is missing, the null values can be
# replaced with the mode of each column.
# describe() on this column reports its most frequent value ("top").
datas['occupation'].describe()

In [None]:
# The mode of 'occupation' is Prof-specialty: write it into every missing entry.
occupation_missing = datas['occupation'].isna()
datas.loc[occupation_missing, 'occupation'] = 'Prof-specialty'

In [None]:
# Summary of 'workclass'; "top" is the most frequent value.
datas['workclass'].describe()

In [None]:
# The mode of 'workclass' is Private: write it into every missing entry.
workclass_missing = datas['workclass'].isna()
datas.loc[workclass_missing, 'workclass'] = 'Private'

In [None]:
# Summary of 'native.country'; "top" is the most frequent value.
datas['native.country'].describe()

In [None]:
# The mode of 'native.country' is United-States: write it into every missing entry.
country_missing = datas['native.country'].isna()
datas.loc[country_missing, 'native.country'] = 'United-States'

In [None]:
# Separate the target column ('income') from the feature columns.
Y = datas['income']
X = datas.drop(columns=['income'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, Y, random_state=0, test_size=0.3)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
from sklearn import preprocessing

# String-valued columns that must be converted to integer codes before
# scaling and model fitting.
categorical = ['workclass','education', 'marital.status', 'occupation', 'relationship','race', 'sex','native.country']

# Work on explicit copies so the column assignments below write to
# independent frames rather than to objects derived from X.
X_train = X_train.copy()
X_test = X_test.copy()

for feature in categorical:
    le = preprocessing.LabelEncoder()
    # Learn the category -> code mapping from the full feature column. Fitting
    # on the training split only makes transform() raise a ValueError when a
    # rare category ends up exclusively in the test split.
    le.fit(X[feature])
    X_train[feature] = le.transform(X_train[feature])
    X_test[feature] = le.transform(X_test[feature])


In [None]:
# Standardise every feature with StandardScaler.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# The scaler is fitted on the training split only and then applied to both
# splits. The scaler returns plain arrays, so the frames are rebuilt with the
# original column names AND the original row index; without the index they
# would no longer line up with Y_train / Y_test in index-aligned operations.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns, index = X_train.index)

X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

### We will use five machine learning algorithms to test their efficiency as predictive models

In [None]:
# Collects the test accuracy (in percent) of each classifier, in the order
# the models are evaluated.
accuracy_list = []

In [None]:
# Decision Tree Classifier algorithm

# Entropy-based tree limited to 10 leaves; the seed is fixed for reproducibility.
dt_clf = DecisionTreeClassifier(max_leaf_nodes=10, random_state=30, criterion='entropy')
dt_clf.fit(X_train, Y_train)
dt_pred = dt_clf.predict(X_test)
dt_acc = dt_clf.score(X_test,Y_test)  # mean accuracy on the test split
accuracy_list.append(100*dt_acc)

# Style.RESET_ALL restores the default colour so the blue does not carry over
# into later output (same pattern as the "Data information" banner).
print(Fore.BLUE + "Accuracy of Decision Tree Classifier is : ", "{:.2f}%".format(100* dt_acc), Style.RESET_ALL)

In [None]:
from sklearn.metrics import confusion_matrix
plt.figure(figsize = (8, 8))
# confusion_matrix puts true labels on the rows; the transpose moves them to
# the x-axis, matching the axis labels set below.
mat = confusion_matrix(Y_test, dt_pred)
sns.heatmap(mat.T, square=True, annot=True,fmt="d", cbar = False)
plt.title("Decision Tree Classifier - Confusion Matrix")
# Heatmap cells are centred on 0.5, 1.5, ...; ticks placed at 0 and 1 would
# sit on the cell edges instead of under/beside the cells they label.
plt.xticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.yticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.xlabel("true label")
plt.ylabel("predicted label");

In [None]:
# K Neighbors Classifier algorithm

# Each prediction is a vote among the 6 nearest training samples.
kn_clf = KNeighborsClassifier(n_neighbors=6)
kn_clf.fit(X_train, Y_train)
kn_pred = kn_clf.predict(X_test)
kn_acc = kn_clf.score(X_test,Y_test)  # mean accuracy on the test split
accuracy_list.append(100*kn_acc)

# Style.RESET_ALL restores the default colour so the blue does not carry over
# into later output (same pattern as the "Data information" banner).
print(Fore.BLUE + "Accuracy of K Neighbors Classifier is : ", "{:.2f}%".format(100* kn_acc), Style.RESET_ALL)

In [None]:
from sklearn.metrics import confusion_matrix
plt.figure(figsize = (8, 8))
# confusion_matrix puts true labels on the rows; the transpose moves them to
# the x-axis, matching the axis labels set below.
mat = confusion_matrix(Y_test, kn_pred)
sns.heatmap(mat.T, square=True, annot=True,fmt="d", cbar = False)
plt.xlabel("true label")
plt.ylabel("predicted label")
plt.title("K Neighbors Classifier - Confusion Matrix")
# Heatmap cells are centred on 0.5, 1.5, ...; ticks placed at 0 and 1 would
# sit on the cell edges instead of under/beside the cells they label.
plt.xticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.yticks([0.5, 1.5], ["0","1"], fontsize=16);

In [None]:
# RandomForestClassifier algorithm
# Trees are at most 15 levels deep and consider half of the features at each
# split; the seed is fixed for reproducibility.
r_clf = RandomForestClassifier(max_features=0.5, max_depth=15, random_state=1)
r_clf.fit(X_train, Y_train)
r_pred = r_clf.predict(X_test)
r_acc = r_clf.score(X_test,Y_test)  # mean accuracy on the test split
accuracy_list.append(100*r_acc)

# Style.RESET_ALL restores the default colour so the blue does not carry over
# into later output (same pattern as the "Data information" banner).
print(Fore.BLUE + "Accuracy of Random Forest Classifier is : ", "{:.2f}%".format(100* r_acc), Style.RESET_ALL)

In [None]:
from sklearn.metrics import confusion_matrix
plt.figure(figsize = (8, 8))
# confusion_matrix puts true labels on the rows; the transpose moves them to
# the x-axis, matching the axis labels set below.
mat = confusion_matrix(Y_test, r_pred)
sns.heatmap(mat.T, square=True, annot=True,fmt="d", cbar = False)
plt.xlabel("true label")
plt.ylabel("predicted label")
plt.title("Random Forest Classifier - Confusion Matrix")
# Heatmap cells are centred on 0.5, 1.5, ...; ticks placed at 0 and 1 would
# sit on the cell edges instead of under/beside the cells they label.
plt.xticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.yticks([0.5, 1.5], ["0","1"], fontsize=16);

In [None]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Boosted ensemble of depth-2 trees; the seed is fixed for reproducibility.
gradientboost_clf = GradientBoostingClassifier(max_depth=2, random_state=4)
gradientboost_clf.fit(X_train,Y_train)
gradientboost_pred = gradientboost_clf.predict(X_test)
gradientboost_acc = gradientboost_clf.score(X_test,Y_test)  # mean accuracy on the test split
accuracy_list.append(100*gradientboost_acc)

# Style.RESET_ALL restores the default colour so the blue does not carry over
# into later output (same pattern as the "Data information" banner).
print(Fore.BLUE + "Accuracy of Gradient Boosting is : ", "{:.2f}%".format(100* gradientboost_acc), Style.RESET_ALL)

In [None]:
from sklearn.metrics import confusion_matrix
plt.figure(figsize = (8, 8))
# confusion_matrix puts true labels on the rows; the transpose moves them to
# the x-axis, matching the axis labels set below.
mat = confusion_matrix(Y_test, gradientboost_pred)
sns.heatmap(mat.T, square=True, annot=True,fmt="d", cbar = False)
plt.xlabel("true label")
plt.ylabel("predicted label")
# This figure shows the gradient boosting predictions, so the title names
# that model (it previously read "Random Forest Classifier").
plt.title("Gradient Boosting Classifier - Confusion Matrix")
# Heatmap cells are centred on 0.5, 1.5, ...; ticks placed at 0 and 1 would
# sit on the cell edges instead of under/beside the cells they label.
plt.xticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.yticks([0.5, 1.5], ["0","1"], fontsize=16);

In [None]:
# Logistic Regression algorithm

# Default logistic regression; the seed is fixed for reproducibility.
logreg = LogisticRegression(random_state = 4)
logreg.fit(X_train, Y_train)
logreg_pred = logreg.predict(X_test)
logreg_acc = logreg.score(X_test,Y_test)  # mean accuracy on the test split
accuracy_list.append(100*logreg_acc)

# Style.RESET_ALL restores the default colour so the blue does not carry over
# into later output (same pattern as the "Data information" banner).
print(Fore.BLUE + "Accuracy of  Logistic Regression algorithm is : ", "{:.2f}%".format(100* logreg_acc), Style.RESET_ALL)

In [None]:
from sklearn.metrics import confusion_matrix
plt.figure(figsize = (8, 8))
# confusion_matrix puts true labels on the rows; the transpose moves them to
# the x-axis, matching the axis labels set below.
mat = confusion_matrix(Y_test, logreg_pred)
sns.heatmap(mat.T, square=True, annot=True,fmt="d", cbar = False)
plt.xlabel("true label")
plt.ylabel("predicted label")
plt.title("Logistic Regression algorithm - Confusion Matrix")
# Heatmap cells are centred on 0.5, 1.5, ...; ticks placed at 0 and 1 would
# sit on the cell edges instead of under/beside the cells they label.
plt.xticks([0.5, 1.5], ["0","1"], fontsize=16)
plt.yticks([0.5, 1.5], ["0","1"], fontsize=16);

In [None]:
# Display names for the accuracy bar chart, listed in the same order in which
# the accuracies were appended to accuracy_list.
model_list = ['DecisionTreeClassifier', 'KNearestNeighbours', 'RandomForest', 'GradientBooster','Logistic Regression']

In [None]:
# Compare the test accuracy of all evaluated classifiers in one bar chart.
plt.rcParams['figure.figsize'] = (20, 8)
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=accuracy_list, palette="vlag", saturation=2.0)

plt.title('Accuracy of different Classifier Models', fontsize=20)
plt.xlabel('Classifier Models', fontsize=20)
plt.ylabel('% of Accuracy', fontsize=20)
plt.xticks(fontsize=12, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=12)

# Write each bar's height (the accuracy in percent) slightly above its top,
# centred horizontally on the bar.
for bar in ax.patches:
    left, bottom = bar.get_xy()
    bar_width = bar.get_width()
    bar_height = bar.get_height()
    label_position = (left + bar_width/2, bottom + bar_height*1.02)
    ax.annotate(f'{round(bar_height,2)}%', label_position, ha='center', fontsize='x-large')
plt.show()

### We can see that all the machine learning algorithms tested achieve more than 82% accuracy; the best algorithm was RandomForest, with 86% accuracy.