In [1]:
#importing necessary modules and libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Loading train and test features

In [2]:
#reading trainFeature and trainLabel csv files having feature and target
df1 = pd.read_csv('trainFeatures.csv', sep=',\s+', delimiter=',', encoding="utf-8", skipinitialspace=True)
df2 = pd.read_csv('trainLabels.csv', sep=',\s,', delimiter=',', encoding="utf-8", skipinitialspace=True)
#joing two dataframes
df_train = df1.join(df2,how="inner")
#renaming the last column to salary
df_train=df_train.rename(columns = {'0':'income'})
#reading the testFeature csv
df_test = pd.read_csv('testFeatures.csv', sep=',\s+', delimiter=',', encoding="utf-8", skipinitialspace=True)

## Data Preprocessing

In [3]:
#dealing with missing value
#finding those columns which have missing values
def Identifying_miising_val(df):
    for col in df.columns:
        if df[col].dtype == object:
            x= print(col, (df[col].str.contains('\?') == True).sum())
    return x
print("Missing values in training data\n ")
Identifying_miising_val(df_train)
print('\n')
print("Missing values in test data\n ")
Identifying_miising_val(df_test) 

Missing values in training data
 
workclass 1950
education 0
Marital-status 0
occupation 1960
relationship 0
race 0
sex 0
native-country 589


Missing values in test data
 
workclass 849
education 0
Marital-status 0
occupation 849
relationship 0
race 0
sex 0
native-country 268


In [4]:
#selecting those columns with '?'
columns_with_na_vals = ['workclass', 'occupation', 'native-country']

In [5]:
#replacing '?' with NaN
def replace_miss_val(df):
    df.replace('?', np.NaN, inplace=True)
    
replace_miss_val(df_train)
replace_miss_val(df_test)


In [6]:
#replacing NaN with values and performing feature engineering
def replace_NaN(df):
    # replacing NaN values with most occuring values for workclass, occupation , native-country
    df['workclass'] = df['workclass'].fillna(df['workclass'].mode()[0])
    df['native-country'] = df['native-country'].fillna(df['native-country'].mode()[0])
    df['occupation'] = df['occupation'].fillna(df['occupation'].mode()[0])
    df["workclass"].fillna("Private", inplace=True)
    # replacing other countries in native-country other than United-States as Non-USA
    df.loc[df['native-country']!= "United-States","native-country"]= "Non-USA"
    # replacing races other than white as Non-White
    df.loc[df['race']!= "White","race"]= "Non-Whtite"
    # replacing values in workclass with age < 18 as 'Never-worked'
    df.loc[df['age'] < 18,'workclass'] = 'Never-worked'
    # replacing values of Marital-status feature as single and married accordingly
    #df['Marital-status'] = df["Marital-status"].replace({'Never-married': 'Single', 'Divorced': 'Single', 
                                                 'Separated' : 'Single','Married-civ-spouse' : "Married"
                                                 ,'Married-spouse-absent': 'Married',
                                                 'Widowed' :'Single','Married-AF-spouse': 'Married'})
    
replace_NaN(df_train)
replace_NaN(df_test)


In [7]:
#Dropping unwanted features and removing duplicates
def feature_selection_duplicate_remove(df):
    df.drop(columns=['education'], inplace = True)
    df.loc[df.duplicated(keep = "first")]
    df.drop_duplicates(keep = "first", inplace= True)
feature_selection_duplicate_remove(df_train)  
#removing the unwanted features from the test set
df_test.drop(columns = ['education'], inplace = True)

In [8]:
#one hot encoding the train and test vales
df_train = pd.get_dummies(df_train, drop_first= True)
df_test = pd.get_dummies(df_test, drop_first= True)

In [9]:
#splitting of feature and label of train data
train_feature = df_train.drop("income", axis= 1)
train_label = df_train["income"]

In [10]:
X = train_feature.values
Y = train_label.values

In [11]:
#test data
test_feature = df_test.iloc[:,:]

## Implementing ensemble learning

In [12]:
accuracy_mean = []
for train, test in KFold(n_splits=10).split(X):
    x, x_cv = X[train], X[test]
    y, y_cv = Y[train], Y[test]
    RF = RandomForestClassifier(random_state =1, criterion='entropy')
    RF.fit(x, y)
    labels_predict = RF.predict(x_cv)
    accuracy = accuracy_score(y_cv, labels_predict.round())
    accuracy_percentage= accuracy*100
    accuracy_mean.append(accuracy_percentage)



In [13]:
print(sum(accuracy_mean)/len(accuracy_mean))


73.00300484764496


In [14]:
#predicting the target value for test data
y = RF.predict(test_feature.values)
print(len(y))
for i in y:
     print(i)

14653
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0


0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
0
0
0
0
0
0
0
0
1
0
0
0
1
1
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
1
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [15]:
predictions = pd.DataFrame(y).to_csv('A2_rpv_20544313_prediction.csv',index= False, header= False)