In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [144]:
df = pd.read_csv('train_data.csv')
df.head()


Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [145]:
df.shape

(1966, 9)

In [146]:
from sklearn.impute import SimpleImputer
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
cat_cols = ['RIAGENDR', 'PAQ605','DIQ010']


num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
df.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,2.0,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [147]:
df.dropna(axis=0)

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,73577.0,1.0,2.0,28.9,104.0,2.0,84.0,16.15,Adult
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult
...,...,...,...,...,...,...,...,...,...
1961,83711.0,2.0,2.0,33.5,100.0,2.0,73.0,6.53,Adult
1962,83712.0,1.0,2.0,30.0,93.0,2.0,208.0,13.02,Adult
1963,83713.0,1.0,2.0,23.7,103.0,2.0,124.0,21.41,Adult
1964,83718.0,2.0,2.0,27.4,90.0,2.0,108.0,4.99,Adult


In [148]:
map1 = {
    'Adult':0,
    'Senior':1,
}

In [149]:
df['age_group'] = df['age_group'].map(map1)
df.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,73564.0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,0.0
1,73568.0,2.0,2.0,20.3,89.0,2.0,80.0,3.85,0.0
2,73576.0,1.0,2.0,23.2,89.0,2.0,68.0,6.14,0.0
3,73577.0,1.0,2.0,28.9,104.0,2.0,84.0,16.15,0.0
4,73580.0,2.0,1.0,35.9,103.0,2.0,81.0,10.92,0.0


In [155]:
df2 = df.iloc[:,1:9]
df2.head()

Unnamed: 0,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,0.0
1,2.0,2.0,20.3,89.0,2.0,80.0,3.85,0.0
2,1.0,2.0,23.2,89.0,2.0,68.0,6.14,0.0
3,1.0,2.0,28.9,104.0,2.0,84.0,16.15,0.0
4,2.0,1.0,35.9,103.0,2.0,81.0,10.92,0.0


In [157]:
df2 = df2.dropna()
df2.isna().sum()

RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64

#### random forest classifier

In [158]:
from sklearn.ensemble import RandomForestClassifier

In [164]:
X = df2.iloc[:,0:-1]
y = df2.iloc[:,-1]

In [165]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

In [166]:
rfc = RandomForestClassifier(
    max_depth=7,               
    min_samples_leaf=20,       
    n_estimators=200,
    random_state=42)

In [167]:
rfc.fit(x_train,y_train)
y_pred = rfc.predict(x_test)

In [82]:
from sklearn.metrics import accuracy_score , classification_report

In [168]:
accuracy_score(y_test,y_pred)

0.8549488054607508

In [169]:
classification_report(y_test,y_pred)

'              precision    recall  f1-score   support\n\n         0.0       0.86      1.00      0.92       501\n         1.0       0.50      0.01      0.02        85\n\n    accuracy                           0.85       586\n   macro avg       0.68      0.50      0.47       586\nweighted avg       0.80      0.85      0.79       586\n'

### ADABOOST

In [86]:
from sklearn.ensemble import AdaBoostClassifier

In [224]:
abc = AdaBoostClassifier( n_estimators=200,  # Increased from default 50
    learning_rate=0.8,  # Higher than default 1.0
    random_state=42,
    )

In [225]:
abc.fit(x_train,y_train)

0,1,2
,estimator,
,n_estimators,200
,learning_rate,0.8
,algorithm,'deprecated'
,random_state,42


In [226]:
y_pred_abc = abc.predict(x_test)
accuracy_score(y_test,y_pred_abc)

0.8532423208191127

In [227]:
classification_report(y_test,y_pred_abc)

'              precision    recall  f1-score   support\n\n         0.0       0.87      0.98      0.92       501\n         1.0       0.48      0.13      0.20        85\n\n    accuracy                           0.85       586\n   macro avg       0.67      0.55      0.56       586\nweighted avg       0.81      0.85      0.82       586\n'

### adaboost classifier with logistic regression

In [92]:
from sklearn.linear_model import LogisticRegression

In [97]:

abc_log = AdaBoostClassifier(
    estimator=LogisticRegression(class_weight='balanced'),
    n_estimators=150,
    learning_rate=0.8, 
    random_state=42
)


In [173]:
abc_log.fit(x_train,y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,estimator,LogisticRegre...ht='balanced')
,n_estimators,150
,learning_rate,0.8
,algorithm,'deprecated'
,random_state,42

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,100


In [174]:
y_pred_log = abc_log.predict(x_test)
accuracy_score(y_test,y_pred_log)

0.7320819112627986

In [175]:
classification_report(y_test,y_pred_log)

'              precision    recall  f1-score   support\n\n         0.0       0.90      0.77      0.83       501\n         1.0       0.28      0.52      0.36        85\n\n    accuracy                           0.73       586\n   macro avg       0.59      0.64      0.59       586\nweighted avg       0.81      0.73      0.76       586\n'

### gradient boosting classifier

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

In [101]:
gbc = GradientBoostingClassifier(n_estimators= 130)

In [176]:
gbc.fit(x_train,y_train)

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,130
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [177]:
y_pred_gbc = gbc.predict(x_test)
accuracy_score(y_test,y_pred_gbc)

0.8532423208191127

In [178]:
classification_report(y_test,y_pred_gbc)

'              precision    recall  f1-score   support\n\n         0.0       0.88      0.96      0.92       501\n         1.0       0.49      0.22      0.31        85\n\n    accuracy                           0.85       586\n   macro avg       0.68      0.59      0.61       586\nweighted avg       0.82      0.85      0.83       586\n'

In [123]:
test = pd.read_csv('test_data.csv')
test.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [124]:
test.shape

(312, 8)

In [125]:
test.isna().sum()

SEQN        2
RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

In [126]:
test = test.iloc[:,1:8]
test.head()

Unnamed: 0,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [127]:
test.isna().sum()

RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

In [132]:
from sklearn.impute import SimpleImputer
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
cat_cols = ['RIAGENDR', 'PAQ605','DIQ010']


num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

test[num_cols] = num_imputer.fit_transform(test[num_cols])
test[cat_cols] = cat_imputer.fit_transform(test[cat_cols])


In [133]:
test.isna().sum()

RIAGENDR    0
PAQ605      0
BMXBMI      0
LBXGLU      0
DIQ010      0
LBXGLT      0
LBXIN       0
dtype: int64

In [205]:
output = abc.predict(test).astype(int)
doutput = pd.DataFrame(output,columns=['age_group'])
doutput.reindex(doutput['age_group'])
doutput.head()

Unnamed: 0,age_group
0,0
1,0
2,0
3,0
4,0


In [206]:
doutput.to_csv('submission_file.csv',index=False)

In [207]:
doutput.value_counts()

age_group
0            307
1              5
Name: count, dtype: int64