In [1]:
%%capture
import numpy as np
import pandas as pd
# Load the raw income data from the CSV that sits next to the notebook.
df = pd.read_csv('income_evaluation.csv')

# Preview both ends of the frame: the first three and the last three rows.
# (The %%capture magic at the top of this cell captures the rendered output.)
pd.concat([df.iloc[:3], df.iloc[-3:]])

In [None]:
# Remove exact duplicate rows, then strip leading whitespace from the column names.
df.drop_duplicates(inplace=True)
df.rename(columns=lambda x: x.lstrip(), inplace=True)

# Column groups by dtype, computed after the rename so they hold the cleaned names.
numeric_columns = df.select_dtypes(include="number").columns.tolist()
categorical_columns = df.select_dtypes(include="object").columns.tolist()

# Strip leading whitespace from every text value. This runs column by column
# through the vectorised .str accessor; the previous row-wise apply (axis=1)
# built one Series per row and produced the same values far more slowly.
df[categorical_columns] = df[categorical_columns].apply(
    lambda column: column.str.lstrip()
)


def del_zero(frame=None, threshold=10000):
    """Drop, in place, every column that contains more than ``threshold`` zeros.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df`` is used, so a
        bare ``del_zero()`` call behaves as before.
    threshold : int, default 10000
        A column is dropped when its count of values equal to 0 is strictly
        greater than this number.

    Returns
    -------
    None
        The frame is modified in place.
    """
    target = df if frame is None else frame

    # Count the zeros of all columns in one pass and drop once, instead of
    # dropping column by column while looping over the frame's own columns.
    zero_counts = (target == 0).sum()
    mostly_zero = zero_counts[zero_counts > threshold].index.tolist()
    if mostly_zero:
        target.drop(columns=mostly_zero, inplace=True)
# First remove the columns that are almost entirely zeros ...
del_zero()
# ... then the columns that are deliberately left out of the feature set.
df.drop(columns=['education', 'fnlwgt', 'occupation'], inplace=True)


# Collapse fine-grained category levels into coarser groups, one mapping per
# column. Values that are not listed in a column's mapping are left unchanged.
category_recodes = {
    #marital status
    'marital-status': {
        'Never-married': 'Single',
        'Divorced': 'Single',
        'Married-spouse-absent': 'Single',
        'Separated': 'Single',
        'Widowed': 'Single',
        'Married-civ-spouse': 'Married',
        'Married-AF-spouse': 'Married',
    },
    #race ('Other' already carries the target label, so it needs no entry)
    'race': {
        'Asian-Pac-Islander': 'Other',
        'Amer-Indian-Eskimo': 'Other',
    },
    #relationship
    'relationship': {
        'Not-in-family': 'Single',
        'Own-child': 'Single',
        'Unmarried': 'Single',
        'Other-relative': 'Single',
    },
    #workclass
    'workclass': {
        'State-gov': 'Gov',
        'Federal-gov': 'Gov',
        'Local-gov': 'Gov',
        'Self-emp-not-inc': 'Self-employed',
        'Self-emp-inc': 'Self-employed',
        # NOTE(review): 'Private' is grouped with the self-employed levels;
        # kept as written, but confirm this is the intended grouping.
        'Private': 'Self-employed',
        'Without-pay': 'Jobless',
        'Never-worked': 'Jobless',
        # '?' marks a missing value. np.nan is used because the np.NaN alias
        # no longer exists in NumPy 2.0.
        '?': np.nan,
    },
}

for column_name, replace_values in category_recodes.items():
    df = df.replace({column_name: replace_values})

#country
# Region membership, keyed by region label. The country spellings are kept
# exactly as written (presumably they mirror the raw dataset labels - verify
# before "correcting" any of them).
_REGION_MEMBERS = {
    'North America': ["Canada", "Cuba", "Dominican-Republic", "El-Salvador", "Guatemala", "Haiti", "Honduras",
                      "Jamaica", "Mexico", "Nicaragua", "Outlying-US(Guam-USVI-etc)", "Puerto-Rico",
                      "Trinadad&Tobago", "United-States"],
    'Asia': ["Cambodia", "China", "Hong", "India", "Iran", "Japan", "Laos", "Philippines", "Taiwan", "Thailand",
             "Vietnam"],
    'South America': ["Columbia", "Ecuador", "Peru"],
    'Europe': ["England", "France", "Germany", "Greece", "Holand-Netherlands", "Hungary", "Ireland", "Italy",
               "Poland", "Portugal", "Scotland", "Yugoslavia"],
    'Other': ["South"],
}

# Flat country -> region lookup, built once instead of rebuilding five lists
# on every row.
_COUNTRY_TO_REGION = {
    member: region
    for region, members in _REGION_MEMBERS.items()
    for member in members
}


def chang_country(cols):
    """Map a native-country value to its region label.

    Parameters
    ----------
    cols : pandas.Series, sequence or str
        Either a one-element row (as passed by ``DataFrame.apply(axis=1)`` on
        a single-column frame), whose first element is the country, or the
        country string itself.

    Returns
    -------
    str or float
        'North America', 'Asia', 'South America', 'Europe' or 'Other';
        ``np.nan`` for any value not in the lookup (for example '?').
    """
    if isinstance(cols, str):
        country = cols
    elif hasattr(cols, 'iloc'):
        # Explicit positional access: cols[0] on a label-indexed Series relies
        # on a deprecated positional fallback in pandas.
        country = cols.iloc[0]
    else:
        country = cols[0]

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    return _COUNTRY_TO_REGION.get(country, np.nan)
# Recode each country to its region. The single-column frame is handed to
# chang_country row by row, so every call receives a one-element row.
country_frame = df[['native-country']]
df['native-country'] = country_frame.apply(chang_country, axis=1)

df

In [3]:
from warnings import filterwarnings
filterwarnings("ignore")
%config IPCompleter.use_jedi=False

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
import pipename as pn
from sklearn.metrics import mean_squared_error,classification_report, confusion_matrix

In [21]:
# Features: every column except the target.
X = df.drop(columns='income')

# Target: dummy-encode the income column as a one-column frame; drop_first
# keeps a single indicator column.
y = pd.get_dummies(df[['income']], drop_first=True)

# 70/30 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Feature column names grouped by dtype: numeric first, then text (object).
num, cat = (
    X.select_dtypes(include=kind).columns for kind in ("number", "object")
)
print(num,'\n',cat)

In [16]:
# Numeric features: standardise only.
numerical_Pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode. np.nan replaces np.NaN, an alias removed in NumPy 2.0.
categorical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Route each column group (selected by column name) through its own pipeline.
preprocessor = ColumnTransformer([
    ('categorical', categorical_Pipeline, cat),
    ('numerical', numerical_Pipeline, num),
])

pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training part only, then apply the fitted transform to the test part.
# NOTE: X_train / X_test are overwritten with their transformed versions here,
# so this cell cannot be re-run without first re-running the split cell.
X_train = pipe.fit_transform(X_train)
X_test = pipe.transform(X_test)

In [None]:
# Output column names of the fitted preprocessor, via the pipename helper.
columns = pn.get_feature_names(preprocessor)

# Wrap the transformed train/test matrices in labelled frames.
X_train_final, X_test_final = (
    pd.DataFrame(matrix, columns=columns) for matrix in (X_train, X_test)
)
X_train_final.head()

In [24]:
# Baseline: k-nearest-neighbours regressor with default settings, scored by
# RMSE on the held-out test part.
knn = KNeighborsRegressor().fit(X_train_final, y_train)
y_pred = knn.predict(X_test_final)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

0.3653110475229669

In [35]:
# Small grid over the SVC regularisation strength and kernel, 5-fold CV.
# NOTE(review): SVC is a classifier, so the squared-error score below is
# computed on its predicted class labels.
param_grid = dict(C=[1, 10], kernel=('linear', 'rbf'))

grid = GridSearchCV(
    SVC(random_state=0),
    param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
)
grid.fit(X_train_final, y_train)

# best_score_ is a negated MSE; flip the sign and take the root to report RMSE.
best_rmse = np.sqrt(-grid.best_score_)
print(best_rmse)
print(grid.best_estimator_)

0.41965957663735826
SVC(C=10, random_state=0)


In [37]:
# Full pipeline: preprocessing + model. The 'regressor' step is a placeholder
# that the grid below swaps for each candidate estimator.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ("regressor", RandomForestRegressor()),
    ]
)

# NOTE(review): not used anywhere in this cell; kept only in case another cell reads it.
with_mean=False

# Candidate models compared under the same preprocessing and CV folds.
# NOTE(review): LogisticRegression is a classifier scored here with the same
# squared-error metric as the regressors.
param_grid = {
    "regressor": [
        KNeighborsRegressor(),
        LinearRegression(),
        LogisticRegression(random_state=42),
        RandomForestRegressor(random_state=42),
        DecisionTreeRegressor(random_state=42),
        XGBRegressor(random_state=42),
    ],
}

grid = GridSearchCV(
    pipe,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

# The preprocessor selects columns by name, so it must be fitted on the raw
# DataFrame. X_train has been overwritten with the transformed array by the
# preprocessing cell, so the untransformed split is re-created here with the
# same parameters (same rows, thanks to the fixed random_state). This keeps the
# cell working under Restart & Run All.
X_train_raw, _, y_train_raw, _ = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

grid.fit(X_train_raw, y_train_raw)

# best_score_ is a negated MSE; report it as RMSE.
print(np.sqrt(-grid.best_score_))
print(grid.best_estimator_)

result = pd.DataFrame(grid.cv_results_)
result

0.3481669543135608
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index(['workclass', 'marital-status', 'relationship', 'race', 'sex',
       'native-country'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('scaler'...
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                           

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.376774,0.046467,0.511467,0.183204,KNeighborsRegressor(),{'regressor': KNeighborsRegressor()},-0.139398,-0.143772,-0.136097,-0.124689,...,-0.136922,0.006592,3,-0.09721,-0.097196,-0.098812,-0.100395,-0.097289,-0.09818,0.001266
1,0.181475,0.02755,0.036328,0.004737,LinearRegression(),{'regressor': LinearRegression()},-0.128303,-0.130448,-0.122904,-0.120879,...,-0.126637,0.004012,2,-0.12601,-0.125458,-0.127357,-0.12786,-0.125438,-0.126425,0.001001
2,0.655455,0.14483,0.04831,0.015383,LogisticRegression(random_state=42),{'regressor': LogisticRegression(random_state=...,-0.180022,-0.186389,-0.174753,-0.169045,...,-0.179232,0.006645,5,-0.178595,-0.177003,-0.179638,-0.181504,-0.176839,-0.178716,0.001737
3,5.464114,1.008651,0.210532,0.048668,RandomForestRegressor(random_state=42),{'regressor': RandomForestRegressor(random_sta...,-0.143676,-0.14629,-0.138197,-0.131335,...,-0.139785,0.005128,4,-0.056965,-0.05666,-0.057908,-0.058886,-0.05844,-0.057772,0.000848
4,0.281199,0.03946,0.044295,0.010546,DecisionTreeRegressor(random_state=42),{'regressor': DecisionTreeRegressor(random_sta...,-0.192014,-0.190947,-0.185722,-0.174743,...,-0.185503,0.006162,6,-0.048391,-0.048092,-0.049198,-0.05011,-0.050038,-0.049166,0.000825
5,1.329128,0.07214,0.049177,0.004243,"XGBRegressor(base_score=None, booster=None, ca...","{'regressor': XGBRegressor(base_score=None, bo...",-0.122433,-0.125913,-0.119466,-0.113643,...,-0.12122,0.004375,1,-0.090546,-0.089793,-0.090914,-0.09112,-0.090731,-0.090621,0.000455
