In [None]:
#  - Importing all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the census income data set from a CSV file in the working directory
df = pd.read_csv('census_income.csv')

In [None]:
# Preview the freshly loaded data
df

We have to predict whether an individual's annual income is more than 50K or at most 50K (`annual_income` is the target column).

In [None]:
# Give the target column a shorter name: annual_income -> income
df = df.rename(columns={"annual_income": "income"})

In [None]:
# Check that the target column now appears as "income"
df

In [None]:
# EDA: start exploring the data with the class balance of the target column
df['income'].value_counts()

Most of the individuals earn 50K or less.

In [None]:
# Column dtypes and non-null counts of the raw data
df.info()

In [None]:
# Frequency of each category in the workclass column
df['workclass'].value_counts()

In [None]:
# Distinct values present in the native-country column
df['native-country'].unique()

In [None]:
# Step 4: data cleaning - start by counting the missing values in each column
df.isna().sum()

In [None]:
# The data set marks missing values with "?".
# Replace that marker with NaN so pandas treats it as a null value.
# The regex also matches a marker padded with whitespace (e.g. " ?"), which an
# exact-string replace would silently leave in place.
df = df.replace(r"^\s*\?\s*$", np.nan, regex=True)

In [None]:
# Count the nulls again now that the "?" markers have been converted to NaN
df.isnull().sum()

In [None]:
# Rule of thumb: with a large data set (more than ~15k-20k rows), check whether about 30% of the data is null; if so, the null values can simply be dropped.
# With a smaller data set (~5k-10k rows), fill the missing values with a measure of central tendency instead.

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

In [None]:
# Verify that no missing values remain after dropping the null rows
df.isnull().sum()

In [None]:
# Row count and dtypes after the null rows were removed
df.info()

In [None]:
# Duplicate records: count the fully duplicated rows (they are dropped in the next step)
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row
df = df.drop_duplicates()

In [None]:
# Verify that no duplicated rows remain
df.duplicated().sum()

In [None]:
# Check for outliers in the age column with a box plot
plt.boxplot(df['age'])

In [None]:
# Draw one box plot per non-object column to look for outliers
numeric_cols = [name for name in df.columns if df[name].dtype != "object"]
for name in numeric_cols:
  plt.boxplot(df[name])
  plt.xlabel(name)
  plt.show()

In [None]:
# Ignore the outliers of capital gain and capital loss (amounts of money earned or lost in the stock market)

In [None]:
# Columns whose outliers will be removed
out_col = [
  'hours-per-week',
  'age',
  'education-num',
  'fnlwgt',
]

In [None]:
# Remove the rows that fall outside the 1.5 * IQR fences, one column at a time.
# The filters are applied sequentially, so each column's quartiles are computed
# on the rows that survived the filters of the columns before it.
for col in out_col:
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3 - Q1
  UF = Q3 + 1.5 * IQR  # upper fence
  LF = Q1 - 1.5 * IQR  # lower fence
  # .copy() yields an independent frame, so later column assignments on df
  # do not raise pandas' SettingWithCopyWarning.
  df = df[(df[col] >= LF) & (df[col] <= UF)].copy()

In [None]:
# Re-draw the box plot of every non-object column after the outlier removal
for name, kind in df.dtypes.items():
  if kind == "object":
    continue
  plt.boxplot(df[name])
  plt.xlabel(name)
  plt.show()

In [None]:
# Row count and dtypes after the outlier rows were removed
df.info()

In [None]:
# Step 6 -> Label encoding: converts the categorical columns into numerical columns (because the model only understands numerical values)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder object used to turn category labels into integer codes
le = LabelEncoder()


In [None]:
# Encode every object-typed (categorical) column as integer codes.
# fit           -> learns the distinct labels of a column
# transform     -> replaces each label with its integer code
# fit_transform -> does both in one call
# A separate encoder is fitted per column and stored in `encoders`, so the codes
# can be mapped back to the original labels later
# (encoders[col].inverse_transform). Refitting one shared encoder would keep
# only the mapping of the last encoded column.
encoders = {}
for col in df.columns:
  if df[col].dtypes=='object':
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Inspect the data after encoding
df

We have converted all the categorical columns into numerical columns.

In [None]:
# Check the column dtypes after encoding
df.info()

# Ordinal encoding -> Assign values according to the natural order of the categories
* short - 0
* medium - 1
* high - 2
* tallest - 3


# Label encoding -> the numbers are only identifiers; they carry no order
* short - 3
* medium - 1


step -7 : Model building

In [None]:
# Independent variables: every column except the target.
# Dropping the target by name keeps x correct wherever "income" sits in the
# frame, and matches how y is selected below.
x = df.drop(columns='income')
# Dependent variable: the income class to predict
y = df['income']


In [None]:
# Inspect the feature matrix
x

In [None]:
# Inspect the target column
y

In [None]:
# Split the data -> Xtrain, ytrain, xtest,ytest
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
  x,
  y,
  test_size=0.2,
  random_state=0,
)

# Classification models that we have
* Logistic regression
* Decision tree
* Random forest

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere above; watch for a
# ConvergenceWarning when this model is fitted - TODO confirm.
my_model = LogisticRegression()

In [None]:
# Train the logistic regression model on the training split
my_model.fit(x_train,y_train)

In [None]:
# Logistic regression predictions for the held-out test rows
y_pred = my_model.predict(x_test)

In [None]:
# Inspect the predicted class labels
y_pred

0 -> income <=50k/year

1 -> income > 50k/year

In [None]:
from sklearn.metrics import *

In [None]:
# Share of test rows the logistic regression classified correctly
lr_acc = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Show the logistic regression accuracy
lr_acc

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with no depth limit.
# A fixed random_state makes the fitted tree (and its accuracy) reproducible
# across runs; 12 is the seed the max_depth sweep in this notebook uses.
dtc = DecisionTreeClassifier(random_state=12)

In [None]:
# Train the decision tree on the training split
dtc.fit(x_train,y_train)

In [None]:
# Decision tree predictions for the held-out test rows
dtc_pred = dtc.predict(x_test)

In [None]:
# Inspect the decision tree's predicted class labels
dtc_pred

In [None]:
# Share of test rows the decision tree classified correctly
dtc_acc = accuracy_score(y_true=y_test, y_pred=dtc_pred)

In [None]:
# Show the decision tree accuracy
dtc_acc

In [None]:
# Sweep max_depth for the decision tree and record the test accuracy of each setting.
# NOTE(review): the best depth is chosen on the test split, so max(list1) is an
# optimistic estimate - consider a validation split or cross-validation.
depth = [1,2,3,4,5,6,7,8,9,10,100]
list1 = []  # test accuracy for each entry of `depth`, in the same order
for max_depth in depth:  # dedicated name: `x` already holds the feature matrix
  dt_model = DecisionTreeClassifier(max_depth=max_depth, random_state=12)
  dt_model.fit(x_train, y_train)
  # Separate name so the logistic regression predictions in y_pred are not overwritten
  dt_pred = dt_model.predict(x_test)
  acc = accuracy_score(y_test, dt_pred)
  list1.append(acc)
  print("maxDepth = " ,max_depth," acc = ", acc)
print(max(list1))

In [None]:
# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters.
# A fixed random_state makes the bootstrap samples and feature subsets - and
# therefore the reported accuracy - reproducible across runs.
rf = RandomForestClassifier(random_state=12)

In [None]:
# Train the random forest on the training split
rf.fit(x_train,y_train)


In [None]:
# Random forest predictions for the held-out test rows
rf_pred = rf.predict(x_test)

In [None]:
# Inspect the random forest's predicted class labels
rf_pred

In [None]:
# Score the random forest on its own predictions (rf_pred).
# `y_pred` holds another model's output, so using it here would report the
# wrong accuracy for the forest.
rf_acc = accuracy_score(y_test, rf_pred)

In [None]:
# Show the random forest accuracy
rf_acc

In [None]:
# Side-by-side accuracy of the three models (best depth-sweep score for the tree)
summary_rows = (
  ("ACC of LR: ", lr_acc),
  ("acc of DT: ", max(list1)),
  ("acc of RF: ", rf_acc),
)
for label, score in summary_rows:
  print(label, score)

# Decision Tree is the model I am going to use for my census income data