In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import math
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import clear_output
from sklearn import model_selection,linear_model,metrics

# Model Evaluation Metrics
from sklearn.metrics import classification_report, accuracy_score, precision_score,recall_score,f1_score
from sklearn import preprocessing

#Model Selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV ,StratifiedKFold
from sklearn.linear_model  import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (AdaBoostClassifier , GradientBoostingClassifier, AdaBoostClassifier ,
                             RandomForestClassifier,RandomForestRegressor, BaggingClassifier)
import datetime
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [None]:
# Load the Adult Census Income CSV from the Kaggle input directory into `data`.
data = pd.read_csv("/kaggle/input/adult-census-income/adult.csv")

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Cells holding a literal '?' are treated as missing: turn them into NaN so
# that isnull()/fillna() can see them.
# Columns noted by the original author: workclass, occupation
data = data.replace('?', np.nan)

In [None]:
# Share of missing values per column, expressed as a percentage.
null_percent = data.isnull().mean() * 100
for each, percent in null_percent.items():
    print('Percent of null values', each, ':', percent)

In [None]:
# Impute missing categorical values with each column's most frequent value.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on data[col]: that form is chained in-place
# assignment, which is deprecated in pandas 2.x and does not modify `data`
# when Copy-on-Write is enabled.
for col in ['workclass', 'occupation', 'native.country']:
    data[col] = data[col].fillna(data[col].mode()[0])

In [None]:
# Index labels of the rows whose native.country equals "South"
# (removed from `data` in the next cell).
temp = data[data["native.country"] == "South"].index


In [None]:
# Remove the rows selected in `temp` (row labels, not positions).
data = data.drop(index=temp)



In [None]:
# Columns whose dtype is object or bool are treated as categorical.
categorical_variables = [
    name for name, column_dtype in data.dtypes.items()
    if column_dtype in ['O', 'bool_']
]
print(
    'Number of categorical variables =>', len(categorical_variables),
    '\nCategorical Variables=>', categorical_variables,
)

In [None]:
# Every numeric column, selected the same way the plotting cells below do
# (select_dtypes with "number"). This replaces a hand-written list of NumPy
# dtype aliases ('int_', 'float_', ...), which is easy to get out of date
# and does not match pandas' nullable numeric dtypes.
numerical_variables = data.select_dtypes(include="number").columns.tolist()
print('Number of numerical variables =>',len(numerical_variables),'\nNumerical Variables=>',numerical_variables)

In [None]:
# Number of distinct categories in each categorical variable, plus the total.
category_counts = data[categorical_variables].nunique()
for feature, n_categories in category_counts.items():
    print(feature, '=>', n_categories)
total = int(category_counts.sum())
print('Total category:', total)

In [None]:
# Pairwise relationships between columns, coloured by the income class.
sns.pairplot(data, hue = "income")

In [None]:
# Cubehelix palette (defined here; the bars below use a fixed hex colour).
mor_palette = sns.cubehelix_palette(start=2.8, rot=0.1, dark=0.3, light=0.8, reverse=True)

# One frequency bar chart per non-numeric column, laid out on a 3-wide grid.
fields = data.select_dtypes(exclude="number").columns

figuresize = (16, 14)
cols = 3
rows = math.ceil(len(fields) / cols)

_, axes = plt.subplots(rows, cols, figsize=figuresize, squeeze=False)
for ax, field in zip(axes.flat, fields):
    # Sort ascending so the most frequent category ends up on the right.
    frequencies = data[field].value_counts().sort_values()
    frequencies.plot.bar(ax=ax, color='#663399')
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(field)

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts of the number of unique values per column:
# non-numeric columns on the left, numeric columns on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Unique values per Categorical feature", "Unique values per Numerical feature"),
)

# (select_dtypes keyword arguments, bar colour) for panel 1 and panel 2.
panels = [
    ({"exclude": "number"}, '#663399'),
    ({"include": "number"}, '#D8BFD8'),
]

for panel_col, (dtype_filter, bar_color) in enumerate(panels, start=1):
    temp_data = data.select_dtypes(**dtype_filter).nunique().sort_values()
    fig.add_trace(
        go.Bar(x=temp_data.index, y=temp_data.values, marker=dict(color=bar_color)),
        row=1, col=panel_col,
    )

#fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Donut chart of the target (income) class shares, in percent.
temp=dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), 
                           height=500, width=1000))

# Use Series.value_counts (single brackets) so the index holds plain class
# labels. The DataFrame form (data[['income']]) returns a MultiIndex whose
# entries are 1-tuples, which then end up as the pie labels.
target=data['income'].value_counts(normalize=True).round(decimals=3)*100

#target.rename(index={1:'Default',0:'Paid'},inplace=True)

# Slice fill colours and slice border colours (same two shades).
pal, color=['#DA70D6','#9932CC'], ['#DA70D6','#9932CC']
fig=go.Figure()

fig.add_trace(go.Pie(labels=target.index, values=target, hole=.5, 
                     showlegend=True,sort=False, 
                     marker=dict(colors=color,line=dict(color=pal,width=2.5)),
                     hovertemplate = "%{label} Income: %{value:.2f}%<extra></extra>"))

fig.update_layout(template=temp, title='Target Distribution', 
                  legend=dict(traceorder='reversed',y=1.05,x=0),
                  uniformtext_minsize=15, uniformtext_mode='hide',width=700)
fig.show()

In [None]:
# Age distribution per work class; the figure height grows with the number
# of distinct workclass values (NaN, if present, counts as one).
n_workclasses = data['workclass'].nunique(dropna=False)
figsize = (12, 1.2 * n_workclasses)
plt.figure(figsize=figsize)
sns.violinplot(data, x='age', y='workclass', inner='box', palette='Dark2')
# Remove all four axis spines.
sns.despine(top=True, right=True, bottom=True, left=True)

In [None]:
# Scatter plot of capital.gain against education.num.
data.plot(kind='scatter', x='education.num', y='capital.gain', s=32, alpha=.8)
# Hide the top and right spines of the current axes.
plt.gca().spines[['top', 'right',]].set_visible(False)

In [None]:
# Row counts per workclass, with one bar per income class side by side.
px.histogram(
    data,
    x='workclass',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per education level, with one bar per income class side by side.
px.histogram(
    data,
    x='education',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Work on a sorted copy so `data` keeps its row order: education.num is cast
# to int and the rows are ordered by it before plotting.
df1 = (
    data
    .assign(**{'education.num': data['education.num'].astype(int)})
    .sort_values(by='education.num')
)

px.histogram(
    df1,
    x='education.num',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per marital status, with one bar per income class side by side.
px.histogram(
    data,
    x='marital.status',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per occupation, with one bar per income class side by side.
px.histogram(
    data,
    x='occupation',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per relationship value, with one bar per income class side by side.
px.histogram(
    data,
    x='relationship',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per race value, with one bar per income class side by side.
px.histogram(
    data,
    x='race',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Row counts per sex value, with one bar per income class side by side.
px.histogram(
    data,
    x='sex',
    color="income",
    barmode='group',
    color_discrete_sequence=['#9932CC', '#663399'],
)

In [None]:
# Collapse native.country into four coarse groups. Values that appear in none
# of the lists below are left unchanged.
region_members = {
    # America (United States only)
    'US': ['United-States'],
    # Europe
    'Europe': ['Greece', 'Holand-Netherlands', 'Poland', "England", "Yugoslavia",
               "Germany", "Italy", "Ireland", "Hungary", "France", "Scotland",
               "Portugal"],
    # Asia
    'Asia': ['Vietnam', 'China', 'Taiwan', "India", "Philippines", "Japan",
             "Hong", "Cambodia", "Laos", "Thailand"],
    # Other
    'Others': ['Mexico', 'Trinadad&Tobago', 'Canada', "Puerto-Rico",
               "Honduras", "Cuba", "Peru", "Nicaragua", "Dominican-Republic",
               "Haiti", "El-Salvador", "Columbia", "Guatemala", "Jamaica", "Ecuador",
               "Outlying-US(Guam-USVI-etc)", "Iran"],
}

# Invert to a flat {country: region} lookup and apply it in one pass.
country_to_region = {
    country: region
    for region, members in region_members.items()
    for country in members
}
data["native.country"] = data["native.country"].replace(country_to_region)



In [None]:
# Row counts per grouped native.country, with one bar per income class side by side.
px.histogram(
    data,
    x='native.country',
    color="income",
    barmode='group',
    color_discrete_sequence=['#DA70D6', '#D8BFD8'],
)

In [None]:
# Frequency of each marital.status value before regrouping.
sns.histplot(data=data, x="marital.status", color="#9932CC")
plt.xticks(rotation=90)  # long category names: rotate the tick labels
plt.show()

In [None]:
# Merge marital.status values into 'Single' / 'Married'; values not listed
# here (e.g. any never-married category) are left as they are.
marital_groups = {
    'Divorced': 'Single',
    'Separated': 'Single',
    'Widowed': 'Single',
    'Married-civ-spouse': 'Married',
    'Married-spouse-absent': 'Married',
    'Married-AF-spouse': 'Married',
}
data["marital.status"] = data["marital.status"].replace(marital_groups)

In [None]:
# Frequency of each relationship value before regrouping.
sns.histplot(data=data, x="relationship", color="#663399")
plt.xticks(rotation=90)  # rotate the tick labels so they do not overlap
plt.show()

In [None]:
# Merge the six relationship values into three groups.
relationship_groups = {
    'Not-in-family': 'Separated',
    'Other-relative': 'Separated',
    'Husband': 'Married',
    'Wife': 'Married',
    'Unmarried': 'Single',
    'Own-child': 'Single',
}
data["relationship"] = data["relationship"].replace(relationship_groups)




In [None]:
# Frequency of each race value before regrouping.
sns.histplot(data=data, x="race", color="#D8BFD8")
plt.xticks(rotation=90)  # rotate the tick labels so they do not overlap
plt.show()

In [None]:
# NOTE: this value_counts() result is not shown, because only a cell's last
# expression is displayed.
data["race"].value_counts()
# Fold the listed race values into a single 'Other' bucket.
merged_into_other = ['Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other']
data["race"] = data["race"].replace(merged_into_other, 'Other')


In [None]:
# Every listed workclass value is relabelled 'govermental'; unlisted values
# stay unchanged.
# NOTE(review): the list also contains self-employed, unpaid and never-worked
# categories, and the label is spelled 'govermental' -- confirm both are intended.
relabelled_workclasses = [
    'Self-emp-not-inc', 'Local-gov', "State-gov", "Self-emp-inc",
    "Federal-gov", "Without-pay", "Never-worked",
]
data["workclass"] = data["workclass"].replace(relabelled_workclasses, 'govermental')


In [None]:
# Frequency of each education value before regrouping.
sns.histplot(data=data, x="education", color="#800080")
plt.xticks(rotation=90)  # rotate the tick labels so they do not overlap
plt.show()

In [None]:
# Merge education values into 'high-school' / 'college' / 'pre-hs'; values
# not listed here are left unchanged.
education_groups = {
    'high-school': ['Prof-school', "Assoc-acdm", "Assoc-voc"],
    'college': ['Some-college', 'Doctorate', 'Bachelors', "Masters"],
    'pre-hs': ['7th-8th', '10th', '11th', "1st-4th", "5th-6th", "12th",
               "9th", "Preschool"],
}
education_lookup = {
    level: group
    for group, levels in education_groups.items()
    for level in levels
}
data["education"] = data["education"].replace(education_lookup)



In [None]:
# Row counts per grouped education level, with one bar per income class side by side.
px.histogram(
    data,
    x='education',
    color="income",
    barmode='group',
    color_discrete_sequence=['#016CC9', '#DEB078'],
)

In [None]:
# Frequency of each occupation value.
sns.histplot(data=data, x="occupation")
plt.xticks(rotation=90)  # rotate the tick labels so they do not overlap
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the education labels with integer codes. The fitted encoder stays
# available as `le`; le.classes_ lists the labels in code order.
le = LabelEncoder()
le.fit(data["education"])
data["education"] = le.transform(data["education"])


In [None]:
# Recompute the object/bool columns after the regrouping and encoding above
# (education now holds integer codes, so it no longer appears here).
categorical_variables2 = [
    name for name, column_dtype in data.dtypes.items()
    if column_dtype in ['O', 'bool_']
]


# Outlier Detection

In [None]:
# One horizontal box plot per numeric column, each in its own figure.
# The former orient="v" argument is dropped: it contradicts `x=` and seaborn
# ignores it when only `x` is given, so the plots were horizontal anyway.
for c in numerical_variables:
    plt.figure()
    sns.boxplot(x=c, data=data)


In [None]:
# Summary statistics including the 10th, 25th, 50th, 75th and 90th percentiles.
data.describe(percentiles=[.10,.25,.50,.75,.90])

In [None]:
# Cap extreme values in selected numeric columns and report how many rows
# lie outside the fences before and after capping.
#
# The fences are not the textbook Tukey rule: the spread is the 15th-85th
# percentile range, and 1.5x that spread is added to the 90th percentile and
# subtracted from the 10th percentile.
temp_columns = ["age","fnlwgt","education.num","hours.per.week"]

for each in temp_columns:
    temp = 1.5  # fence multiplier
    IQR = data[each].quantile(0.85) - data[each].quantile(0.15)
    upper = data[each].quantile(0.90) + (temp*IQR)
    lower = data[each].quantile(0.10) - (temp*IQR)
    
    outlier_value_low = data[data[each]<lower].shape
    outlier_value_up = data[data[each]>upper].shape

    print("Column: ", each , ": " , "Upper limit: " , upper," lower limit: " , lower)
    print("below the limit shape: ", outlier_value_low , " Over border: " , outlier_value_up,"\n")
    
    # Clamp values above `upper` down to `upper` and values below `lower` up
    # to `lower`. The previous code used `> lower` for the low side, which
    # overwrote every value above the lower fence (almost the whole column)
    # with `lower`.
    data[each] = data[each].clip(lower=lower, upper=upper)
    
    print("After pressing")
    outlier_value_low = data[data[each]<lower].shape
    outlier_value_up = data[data[each]>upper].shape
    print("Column: ", each , ": " , "Upper limit: " , upper +1 ," lower limit: " , lower -1)
    print("below the limit shape: ", outlier_value_low , " Over border: " , outlier_value_up,"\n")
    print("----------------------------------------------------------------------------------\n")


In [None]:
# One-hot encode the remaining categorical columns as 0/1 integers.
# drop_first=True drops the first level of every column, so each k-level
# column yields k-1 indicator columns.
data = pd.get_dummies(
    data,
    columns=categorical_variables2,
    dtype="int64",
    drop_first=True,
)


In [None]:
# (rows, columns) after one-hot encoding.
data.shape

In [None]:
from sklearn.model_selection import train_test_split

# The target is the indicator column produced by get_dummies for income;
# every other column is a feature.
target_column = 'income_>50K'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

# Standard Scaler

In [None]:
# Rank the features by random-forest importance, fitted on the training split only.
# random_state is fixed (same seed as the split and SMOTE cells) so that the
# ranking, and the feature subset derived from it later, is reproducible.
rf_fe_imp = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=42)
rf_fe_imp.fit(X_train,y_train)
# 'Importance Percentage' holds the raw feature_importances_ values
# (fractions, not values scaled to 0-100), sorted in descending order.
feature_importances = pd.DataFrame(zip(X.columns,rf_fe_imp.feature_importances_),
                                   columns=['Features','Importance Percentage']).sort_values(by='Importance Percentage',ascending=False)
feature_importances

In [None]:
# Rows whose importance value exceeds 0.01.
feature_importances.loc[feature_importances['Importance Percentage']>0.01]

In [None]:
# Names of the features whose importance value exceeds 0.01, as a NumPy array.
is_important = feature_importances['Importance Percentage'] > 0.01
important_features = feature_importances.loc[is_important, 'Features'].values
important_features

In [None]:
# (rows, columns) of the training features before scaling.
X_train.shape

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split.
# `preprocessing` is already imported in the notebook's import cell.
stdandard_scale = preprocessing.StandardScaler()

# Keep each split's original row index. Rebuilding the frames without
# `index=` would renumber the rows 0..n-1 while y_train / y_test keep their
# original labels, so any label-based alignment between X and y would pair
# up the wrong rows.
X_train = pd.DataFrame(stdandard_scale.fit_transform(X_train),
                       columns = X.columns, index = X_train.index)

X_test = pd.DataFrame(stdandard_scale.transform(X_test),
                      columns = X.columns, index = X_test.index)




In [None]:


from imblearn.over_sampling import SMOTE


# As an example, use a classification model and an imbalanced data set

# Apply SMOTE to add synthetic samples; only the training split is resampled
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


# Model

In [None]:
#pip install xgboost


In [None]:
#pip install lightgbm


In [None]:
#pip install catboost

In [None]:
#rf_clf = RandomForestClassifier()
#rf_param_grid = {
   # 'n_estimators':[50,100,150],
  #  'max_depth':[8,10,12],
 #   'max_features': ['sqrt',0.5,0.7],                
#    'min_samples_split': [3,5,7],       
#}

#rf_clf_cv = GridSearchCV(rf_clf,param_grid=rf_param_grid,cv=5)








In [None]:

# Candidate classifiers, all with library defaults except where noted.
models = {    
    'LogisticRegression': LogisticRegression(),  
    'NaiveBayes': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    # 500 small randomised trees, each fitted on 100 bootstrap samples and
    # 70% of the features (features also drawn with replacement).
    "BaggingClassifier": BaggingClassifier(DecisionTreeClassifier(splitter='random',max_leaf_nodes=16), n_estimators=500,
                                        max_samples=100, bootstrap=True, 
                                        max_features=0.7, bootstrap_features=True ,n_jobs=-1       

),
    'AdaBoostClassifier': AdaBoostClassifier(learning_rate=0.5, random_state=2),
    'LightGBMClassifier': LGBMClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoostClassifier': CatBoostClassifier(metric_period=100)
}

# Fit every model on the SMOTE-resampled training data (all features) and
# print its classification report for the test set and the training set.
for name, model in models.items():
    
    # Train the model and time the fit
    
    baslangic = datetime.datetime.now()
    model.fit(X_train_resampled, y_train_resampled)
    bitis = datetime.datetime.now()
    toplam_zaman = bitis - baslangic
    
    # Predict on the held-out test set and on the training data itself
    
    y_pred = model.predict(X_test)
    y_pred_tr = model.predict(X_train_resampled)
    
    # Print the classification reports
    print(f"\t\t$$$ {name} $$$\n")
    print("\t\tOğretilen sure: ", toplam_zaman)

    print("\t\t\t\tTest Data\n",classification_report(y_test, y_pred))
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    # The header ends with "\n" (as in the later, reduced-feature run) so the
    # report table starts on its own line instead of after the label.
    print("Train Data\n",classification_report(y_train_resampled, y_pred_tr))
    print("******************************************************************************\n\n")

In [None]:
# Keep only the features selected in `important_features`, in both the
# resampled training set and the test set. The names are rebound, so the
# full-width frames are no longer available after this cell.
X_train_resampled = X_train_resampled[important_features]
X_test = X_test[important_features]
# Show the shape of the frame that was actually reduced. X_train is not
# touched by this cell, so its shape would still show every column.
X_train_resampled.shape

In [None]:

# Same model line-up as the previous run, re-created so that every estimator
# starts unfitted; this time the inputs hold only the selected features.

# Base learner for bagging: a small randomised tree.
bagging_base_tree = DecisionTreeClassifier(splitter='random', max_leaf_nodes=16)

models = {
    'LogisticRegression': LogisticRegression(),
    'NaiveBayes': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    "BaggingClassifier": BaggingClassifier(
        bagging_base_tree,
        n_estimators=500,
        max_samples=100,
        bootstrap=True,
        max_features=0.7,
        bootstrap_features=True,
        n_jobs=-1,
    ),
    'AdaBoostClassifier': AdaBoostClassifier(learning_rate=0.5, random_state=2),
    'LightGBMClassifier': LGBMClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoostClassifier': CatBoostClassifier(metric_period=100),
}

# Fit each model, time the fit, and print its test and train reports.
for name, model in models.items():
    # Train the model
    baslangic = datetime.datetime.now()
    model.fit(X_train_resampled, y_train_resampled)
    bitis = datetime.datetime.now()
    toplam_zaman = bitis - baslangic

    # Predict on the test set and on the training data
    y_pred = model.predict(X_test)
    y_pred_tr = model.predict(X_train_resampled)

    # Build both reports first, then print them
    test_report = classification_report(y_test, y_pred)
    train_report = classification_report(y_train_resampled, y_pred_tr)

    print(f"\t\t$$$ {name} $$$\n")
    print("\t\tOğretilen sure: ", toplam_zaman)

    print("\t\t\t\tTest Data\n", test_report)
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
    print("Train Data\n", train_report)
    print("******************************************************************************\n\n")

In [None]:
#################