In [15]:
# Import the necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [16]:
##Data Understanding

In [6]:
# Load the dataset
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

# Check the first few rows of the train dataset
print(train_data.head())


          ID  age   gender               education                class  \
0  ID_TZ0000   79   Female    High school graduate                  NaN   
1  ID_TZ0001   65   Female    High school graduate                  NaN   
2  ID_TZ0002   21     Male   12th grade no diploma   Federal government   
3  ID_TZ0003    2   Female                Children                  NaN   
4  ID_TZ0004   70     Male    High school graduate                  NaN   

  education_institute                    marital_status  \
0                 NaN                           Widowed   
1                 NaN                           Widowed   
2                 NaN                     Never married   
3                 NaN                     Never married   
4                 NaN   Married-civilian spouse present   

                         race is_hispanic      employment_commitment  ...  \
0                       White   All other         Not in labor force  ...   
1                       White   All othe

In [7]:
# Display basic info about the training data
print(train_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209499 entries, 0 to 209498
Data columns (total 43 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   ID                              209499 non-null  object 
 1   age                             209499 non-null  int64  
 2   gender                          209499 non-null  object 
 3   education                       209499 non-null  object 
 4   class                           104254 non-null  object 
 5   education_institute             13302 non-null   object 
 6   marital_status                  209499 non-null  object 
 7   race                            209499 non-null  object 
 8   is_hispanic                     209499 non-null  object 
 9   employment_commitment           209499 non-null  object 
 10  unemployment_reason             6520 non-null    object 
 11  employment_stat                 209499 non-null  int64  
 12  wage_per_hour   

In [13]:
# Check for missing values
print(train_data.isnull().sum())

ID                                     0
age                                    0
gender                                 0
education                              0
class                             105245
education_institute               196197
marital_status                         0
race                                   0
is_hispanic                            0
employment_commitment                  0
unemployment_reason               202979
employment_stat                        0
wage_per_hour                          0
is_labor_union                    189420
working_week_per_year                  0
industry_code                          0
industry_code_main                     0
occupation_code                        0
occupation_code_main              105694
total_employed                         0
household_stat                         0
household_summary                      0
under_18_family                   151654
veterans_admin_questionnaire      207415
vet_benefit     

In [9]:
# Summary statistics of the numerical columns
print(train_data.describe())

                 age  employment_stat  wage_per_hour  working_week_per_year  \
count  209499.000000    209499.000000  209499.000000          209499.000000   
mean       34.518728         0.176760      55.433487              23.158850   
std        22.306738         0.555562     276.757327              24.397963   
min         0.000000         0.000000       0.000000               0.000000   
25%        15.000000         0.000000       0.000000               0.000000   
50%        33.000000         0.000000       0.000000               8.000000   
75%        50.000000         0.000000       0.000000              52.000000   
max        90.000000         2.000000    9999.000000              52.000000   

       industry_code  occupation_code  total_employed    vet_benefit  \
count  209499.000000    209499.000000   209499.000000  209499.000000   
mean       15.332398        11.321734        1.956067       1.515854   
std        18.049655        14.460839        2.365154       0.850853   


In [10]:
# Explore the target variable distribution
print(train_data['income_above_limit'].value_counts())

income_above_limit
Below limit    196501
Above limit     12998
Name: count, dtype: int64


In [24]:
##Data Preparation

In [26]:
# Handle missing values in both training and test data
train_data['class'].fillna(train_data['class'].mode()[0], inplace=True)
test_data['class'].fillna(test_data['class'].mode()[0], inplace=True)

train_data['education_institute'].fillna('No', inplace=True)
test_data['education_institute'].fillna('No', inplace=True)

In [32]:
# Encoding categorical variables
# Convert 'gender' column to string type and remove leading/trailing spaces
train_data['gender'] = train_data['gender'].astype(str).str.strip()
test_data['gender'] = test_data['gender'].astype(str).str.strip()

In [34]:
# Check if there are any categories in test_data that don't exist in train_data
train_categories = set(train_data['gender'].unique())
test_categories = set(test_data['gender'].unique())

In [37]:
# Find categories in the test data that are not in the training data
unknown_categories = test_categories - train_categories
print(f"Unknown categories in test set: {unknown_categories}")

Unknown categories in test set: {'Male', 'Female'}


In [38]:
# Replace unknown categories in test data with the most frequent category in train data
most_frequent_category = train_data['gender'].mode()[0]
test_data['gender'] = test_data['gender'].replace(list(unknown_categories), most_frequent_category)


In [42]:
# Initialize LabelEncoder
labelencoder = LabelEncoder()

# Fit and transform on the training data
train_data['gender'] = labelencoder.fit_transform(train_data['gender'])

# Transform the test data based on the training data's label encoding
test_data['gender'] = labelencoder.transform(test_data['gender'])

# Encode the target variable in the train dataset
train_data['income_above_limit'] = labelencoder.fit_transform(train_data['income_above_limit'])


In [43]:
# Select features (dropping unnecessary columns like 'ID')
X_train = train_data.drop(['income_above_limit', 'ID'], axis=1)
y_train = train_data['income_above_limit']

X_test = test_data.drop(['ID'], axis=1)

In [45]:
# Handle any other categorical columns in both train and test sets (e.g., 'education', 'marital_status')
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Ensure the train and test datasets have the same columns after encoding
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [46]:
# 3. Feature Scaling
# Initialize StandardScaler
scaler = StandardScaler()

# Scale the numerical features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
##Modeling & Evaluation

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

# Step 1: Split the training data for evaluation purposes (using a validation set)
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Step 2: Initialize the RandomForest classifier
model = RandomForestClassifier(random_state=42)

# Step 3: Train the model on the training data
model.fit(X_train_final, y_train_final)

# Step 4: Make predictions on the validation data (since we don't have y_test)
y_val_pred = model.predict(X_val)

# Step 5: Calculate the F1 score on the validation set
f1 = f1_score(y_val, y_val_pred)
print(f'F1 Score on validation set: {f1}')

# Step 6: Print classification report for the validation data
print(classification_report(y_val, y_val_pred))

# Step 7: Now, you can make predictions on the test dataset without calculating F1 (because you don't have y_test)
y_test_pred = model.predict(X_test_scaled)

# Create a submission file for test predictions
submission = pd.DataFrame({'ID': test_data['ID'], 'income_above_limit': y_test_pred})

# Save the submission file
submission.to_csv('submission.csv', index=False)


F1 Score on validation set: 0.9765710202700161
              precision    recall  f1-score   support

           0       0.77      0.41      0.54      2622
           1       0.96      0.99      0.98     39278

    accuracy                           0.96     41900
   macro avg       0.86      0.70      0.76     41900
weighted avg       0.95      0.96      0.95     41900



In [51]:
##Hyper parameter tuning

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

# Fit GridSearchCV on the training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
print(f'Best Hyperparameters: {grid_search.best_params_}')

# Train the model with the best parameters
best_rf = grid_search.best_estimator_

# Make predictions on the validation data
y_val_pred = best_rf.predict(X_val)

# Evaluate the F1 score with the best model
f1 = f1_score(y_val, y_val_pred)
print(f'Improved F1 Score on validation set: {f1}')


Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.7s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.8s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.7s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.8s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.9s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.7s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.8s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; t



[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 2.4min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 2.4min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.2min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 2.4min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 2.4min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.2min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.2min
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 1.2min
[CV] END max_depth=None,

405 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
210 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ekow/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ekow/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/ekow/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/ekow/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_con

Best Hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Improved F1 Score on validation set: 0.9919407811631551


##Deployment

In [54]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Train the best model on the entire training dataset (X_train_scaled, y_train)
best_rf.fit(X_train_scaled, y_train)

# Step 2: Make predictions on the test dataset (X_test_scaled)
y_test_pred = best_rf.predict(X_test_scaled)

# Step 3: Convert numerical predictions back to the original labels if necessary (optional)
# Assuming 'income_above_limit' was label-encoded as 'Below limit' -> 0 and 'Above limit' -> 1
labelencoder = LabelEncoder()  # Replace with actual encoder used earlier
labelencoder.fit(['Below limit', 'Above limit'])  # Fit on original labels

# Decode predictions from numeric (0, 1) to original labels
y_test_pred_labels = labelencoder.inverse_transform(y_test_pred)

# Step 4: Prepare the submission file
submission = pd.DataFrame({'ID': test_data['ID'], 'income_above_limit': y_test_pred_labels})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

# Output a message to confirm the file was saved
print('Submission file created successfully: submission.csv')


Submission file created successfully: submission.csv
