In [30]:
import pandas as pd
import warnings
# Suppress every warning category for the rest of the session. This also hides
# deprecation / FutureWarning messages emitted by libraries, so comment this
# line out when debugging unexpected behaviour.
warnings.filterwarnings('ignore')

In [31]:
# Read the training and test CSV files from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('Train_Data.csv', 'Test_Data.csv')
)

In [32]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       1954 non-null   float64
 1   RIAGENDR   1948 non-null   float64
 2   PAQ605     1953 non-null   float64
 3   BMXBMI     1948 non-null   float64
 4   LBXGLU     1953 non-null   float64
 5   DIQ010     1948 non-null   float64
 6   LBXGLT     1955 non-null   float64
 7   LBXIN      1957 non-null   float64
 8   age_group  1952 non-null   object 
dtypes: float64(8), object(1)
memory usage: 130.6+ KB
None


In [33]:
# Partition the training columns by dtype: 64-bit numeric columns on one side,
# object (string-like) columns on the other.
numeric_dtypes = ['int64', 'float64']
object_dtypes = ['object']
nums_col = train_df.select_dtypes(include=numeric_dtypes).columns
cat_col = train_df.select_dtypes(include=object_dtypes).columns

In [34]:
# Median-impute every numeric column, using statistics computed on the
# training data only so nothing from the test set leaks into the fill values.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series and
# is a silent no-op once pandas Copy-on-Write is active.
#
# NOTE(review): nums_col includes SEQN, which looks like a record identifier --
# confirm whether it should be imputed (or used as a feature) at all.
for nums in nums_col:
    train_median = train_df[nums].median()
    train_df[nums] = train_df[nums].fillna(train_median)
    # Fill the test column even when the training column has no gaps, and
    # skip columns the test file does not contain instead of raising KeyError.
    if nums in test_df.columns:
        test_df[nums] = test_df[nums].fillna(train_median)

In [36]:
# Mode-impute every object column, using the most frequent training value.
#
# Assigning the result back (rather than `df[col].fillna(..., inplace=True)`)
# keeps the fill effective under pandas Copy-on-Write, where the chained
# in-place form silently does nothing.
#
# NOTE(review): the only object column in train_df is `age_group`, which is
# later used as the prediction target. Filling missing labels with the mode
# invents labels -- consider dropping those rows instead; confirm intent.
for cat in cat_col:
    train_modes = train_df[cat].mode()
    if train_modes.empty:
        # Column is entirely null in training: there is no value to fill with.
        continue
    mod = train_modes.iloc[0]
    train_df[cat] = train_df[cat].fillna(mod)
    # The test file may not carry this column (e.g. the target); skip it
    # rather than raising KeyError.
    if cat in test_df.columns:
        test_df[cat] = test_df[cat].fillna(mod)

In [37]:
# Report the total count of remaining null cells in each frame.
for tag, frame in (("Train Missing: ", train_df), ("Test Missing: ", test_df)):
    print(tag, frame.isna().sum().sum())

Train Missing:  0
Test Missing:  0


In [38]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the training target, then replace the
# column with its integer codes. The fitted encoder stays available as `label`.
label = LabelEncoder()
label.fit(train_df['age_group'])
train_df['age_group'] = label.transform(train_df['age_group'])

In [39]:
# Separate the encoded target from the training features.
y = train_df['age_group']
x = train_df.drop(columns=['age_group'])

# Build the test feature frame as a fresh copy (drop() returns a new object).
# If the test file happens to carry an `age_group` column, remove it so the
# target never becomes a feature; errors='ignore' makes this a plain copy when
# the column is absent.
x_test = test_df.drop(columns=['age_group'], errors='ignore')

In [40]:
# One-hot encode train and test features together so both end up with the same
# set of columns, then slice the stacked frame back apart by position.
n_train = len(x)
combined = pd.get_dummies(pd.concat([x, x_test], axis=0))

x, x_test = combined.iloc[:n_train, :], combined.iloc[n_train:, :]

In [41]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Hold out 20% of the training rows for validation, stratified on the target
# so both parts keep the same class proportions; seeded for repeatability.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_val, y_train, y_val = split_parts

In [42]:
# Collect the LightGBM hyper-parameters in one place before building the model.
lgbm_params = {
    'objective': 'multiclass',
    'num_class': len(y.unique()),  # one output per distinct encoded label
    'random_state': 42,
    'n_estimators': 300,
}
model = LGBMClassifier(**lgbm_params)

In [43]:
# Train on the training split; the validation split is passed as eval_set so
# LightGBM evaluates on it during fitting.
validation_sets = [(x_val, y_val)]
model.fit(x_train, y_train, eval_set=validation_sets)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1002
[LightGBM] [Info] Number of data points in the train set: 1572, number of used features: 8
[LightGBM] [Info] Start training from score -0.173960
[LightGBM] [Info] Start training from score -1.834651


In [44]:
# Score the held-out split with the support-weighted F1 across classes.
y_pred = model.predict(x_val)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')
print(f"LightGBM Validation F1 Score: {f1:.4f}")

LightGBM Validation F1 Score: 0.7909


In [49]:
# Predict encoded class ids for the test features.
test_preds = model.predict(x_test)

# Original label values for the predictions; computed here but the file below
# is written with the integer codes, not these decoded labels.
test_labels = label.inverse_transform(test_preds)

# Two-column submission: the test frame's row index as ID plus the integer
# class code, written without the DataFrame index.
submission = pd.DataFrame({'ID': test_df.index}).assign(
    age_group=test_preds.astype(int)
)
submission.to_csv('submission_lgbm.csv', index=False)

In [50]:
# Status line for the reader; it does not itself verify that the file exists.
saved_message = "submission_lgbm.csv saved successfully!"
print(saved_message)

submission_lgbm.csv saved successfully!
