# Explore here

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [6]:
# --- Always reload the dataset to ensure a clean start for preprocessing ---
from numpy import shape


try:
    df = pd.read_csv('adult-census-income.csv')
except FileNotFoundError:
    df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/predicting-your-future-with-data/main/adult-census-income.csv')
df


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [7]:
# --- 2. Handle '?' as Missing Values ---
# As observed, '?' is used to denote missing values. Replace them with numpy's NaN.
df = df.replace('?', np.nan)
print("\nReplaced '?' with NaN. Checking for nulls after replacement:")
print(df.isnull().sum())


Replaced '?' with NaN. Checking for nulls after replacement:
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64


In [8]:
# 1. Identify and handle '?' as missing values
df = df.replace('?', np.nan)

# 2. Drop rows with missing values (a common approach for this dataset if missing data is not extensive)
df.dropna(inplace=True)

# Separate features (X) and target (y)
X = df.drop('income', axis=1)
y = df['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0) # Convert target to binary (1 for >50K, 0 for <=50K)

# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = StandardScaler() # or MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Example: Apply preprocessing to a sample of X to see the transformed data
# (This is just for demonstration; the full preprocessing will happen within the model pipeline)
# X_preprocessed = preprocessor.fit_transform(X)
# print("\nShape of preprocessed data:", X_preprocessed.shape)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nData splitting complete. Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


Data splitting complete. Shapes:
X_train: (24129, 14), y_train: (24129,)
X_test: (6033, 14), y_test: (6033,)


In [9]:
print("\nCleaned column names:")
print(df.columns)


Cleaned column names:
Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')


In [10]:
# --- 3. Handle Missing Values (Imputation/Dropping) ---
initial_rows = df.shape[0]
df.dropna(inplace=True)
rows_after_drop = df.shape[0]
print(f"\nDropped {initial_rows - rows_after_drop} rows with missing values.")
print(f"Dataset shape after dropping NaNs: {df.shape}")


# --- 4. Separate Features (X) and Target (y) ---
# Convert the 'income' target variable to a numerical format (0 and 1)
# 1 for >50K, 0 for <=50K
X = df.drop('income', axis=1)
y = df['income'].apply(lambda x: 1 if x == '>50K' else 0) # No need for .strip() here if handled globally above

print("\nTarget variable 'income' converted to binary (1 for >50K, 0 for <=50K).")
print("Value counts for y:")
print(y.value_counts())

# --- 5. Identify Numerical and Categorical Features ---
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

# --- 6. Create Preprocessing Pipelines ---
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

print("\nPreprocessing pipelines defined using StandardScaler and OneHotEncoder.")

# --- 7. Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nData splitting complete. Shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")


Dropped 0 rows with missing values.
Dataset shape after dropping NaNs: (30162, 15)

Target variable 'income' converted to binary (1 for >50K, 0 for <=50K).
Value counts for y:
income
0    22654
1     7508
Name: count, dtype: int64

Numerical Features: ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
Categorical Features: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

Preprocessing pipelines defined using StandardScaler and OneHotEncoder.

Data splitting complete. Shapes:
X_train: (24129, 14), y_train: (24129,)
X_test: (6033, 14), y_test: (6033,)


*Defining the Recommendation Problem

The core recommendation is strategies or actionable changes to an individual's socioeconomic profile that are predicted to increase their likelihood of earning more than $50,000 per year.

*Who is the "user" in this case?

The "user" is an adult individual for whom we want to predict income and offer personalized strategies.





In [11]:
# --- 2. Data Preprocessing (Re-applying the refined steps) ---
# Clean Column Names: Replace '.' with '_' and strip spaces
original_columns = df.columns.tolist()
df.columns = [col.strip().replace('.', '_') for col in original_columns]

# Strip spaces from all object columns and replace '?' with NaN
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].apply(lambda x: x.strip() if isinstance(x, str) else x)
df = df.replace('?', np.nan)

# Drop rows with any missing values
df.dropna(inplace=True)

# Separate features (X) and target (y)
X = df.drop('income', axis=1)
y = df['income'].apply(lambda x: 1 if x == '>50K' else 0) # 1 for >50K, 0 for <=50K

# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# Define preprocessing pipelines
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nData preprocessing complete for model training.")


Data preprocessing complete for model training.


In [14]:
# --- 3. Build and Train the Supervised Classification Model ---
# We'll use a RandomForestClassifier as it generally performs well and provides feature importances.
# class_weight='balanced' helps handle the imbalance in income classes if present.
model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))])

print("\nTraining the RandomForestClassifier model...")
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate the model
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\n--- Model Evaluation Results (RandomForestClassifier) ---")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Training the RandomForestClassifier model...
Model training complete.

--- Model Evaluation Results (RandomForestClassifier) ---
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4531
           1       0.77      0.63      0.69      1502

    accuracy                           0.86      6033
   macro avg       0.83      0.78      0.80      6033
weighted avg       0.86      0.86      0.86      6033

ROC AUC Score: 0.9111225494676265
Confusion Matrix:
 [[4249  282]
 [ 557  945]]


In [15]:
# --- 4. Implement the Content-Based Recommendation Logic ---

def get_recommendations(user_profile_df, model, top_n=3, min_prob_increase=0.05):
    """
    Generates recommendations for a user based on potential changes to their profile,
    aiming to increase the probability of earning >50K.

    Args:
        user_profile_df (pd.DataFrame): A DataFrame with a single row representing the user's current profile.
                                        It must have all original feature columns.
        model: The trained scikit-learn pipeline model (with preprocessor and classifier).
        top_n (int): Number of top recommendations to provide.
        min_prob_increase (float): Minimum probability increase required for a recommendation to be considered.

    Returns:
        list: A list of dictionaries, each representing a recommended change and its predicted impact.
    """
    # Ensure the user profile has the same columns as the training data
    # This is crucial because the preprocessor expects all original columns.
    # If a user profile is missing columns, fill them with a sensible default (e.g., mode from training data)
    # For simplicity, we assume the input user_profile_df already has all columns.
    # In a real application, you'd want robust handling for this.

    # Predict initial probability for the user
    initial_prediction_prob = model.predict_proba(user_profile_df)[:, 1][0]
    print(f"\nUser's initial predicted probability of earning >50K: {initial_prediction_prob:.2f}")

    recommendations = []
    base_profile = user_profile_df.iloc[0].copy()

    # Define mutable features and their possible 'improved' values.
    # These values are chosen based on common sense and potential for higher income.
    mutable_features_options = {
        'education': ['Bachelors', 'Masters', 'Doctorate', 'Prof-school', 'Assoc-voc', 'Assoc-acdm'],
        'occupation': ['Exec-managerial', 'Prof-specialty', 'Sales', 'Tech-support'],
        'hours_per_week': [40, 50, 60], # Standard full-time to overtime
        'workclass': ['Self-emp-inc', 'Federal-gov', 'State-gov']
    }

    # Iterate through each mutable feature
    for feature, potential_values in mutable_features_options.items():
        original_value = base_profile[feature]

        # For numerical features like 'hours_per_week', only consider values greater than current
        if feature == 'hours_per_week':
            current_hours = original_value
            values_to_try = [v for v in potential_values if v > current_hours]
        else: # For categorical features, try all potential values that are different from current
            values_to_try = [v for v in potential_values if v != original_value]

        for value in values_to_try:
            temp_profile = base_profile.copy()
            temp_profile[feature] = value
            temp_profile_df = pd.DataFrame([temp_profile])

            # Predict with the modified profile
            new_prediction_prob = model.predict_proba(temp_profile_df)[:, 1][0]

            prob_increase = new_prediction_prob - initial_prediction_prob

            # Only add as a recommendation if it significantly increases the probability
            if prob_increase > min_prob_increase:
                recommendations.append({
                    'feature': feature,
                    'original_value': original_value,
                    'recommended_value': value,
                    'predicted_prob_increase': prob_increase,
                    'new_predicted_prob': new_prediction_prob
                })

    # Sort recommendations by the increase in predicted probability (descending)
    recommendations.sort(key=lambda x: x['predicted_prob_increase'], reverse=True)

    return recommendations[:top_n]

In [16]:
# --- 5. Test with Simulated Cases (as per project instructions) ---

print("\n--- Testing Recommendation System with Simulated User Profiles ---")

# Example 1: A young, less educated user working part-time
user_profile_1_data = {
    'age': 25,
    'workclass': 'Private',
    'fnlwgt': 226802,
    'education': 'HS-grad',
    'education_num': 9,
    'marital_status': 'Never-married',
    'occupation': 'Sales',
    'relationship': 'Own-child',
    'race': 'White',
    'sex': 'Male',
    'capital_gain': 0,
    'capital_loss': 0,
    'hours_per_week': 25,
    'native_country': 'United-States'
}
user_profile_1_df = pd.DataFrame([user_profile_1_data])

print("\n--- Recommendations for User 1 (25-year-old, HS-grad, Sales, 25hrs/week) ---")
recommendations_1 = get_recommendations(user_profile_1_df, model)
if recommendations_1:
    for i, rec in enumerate(recommendations_1):
        print(f"{i+1}. To increase income likelihood, consider changing '{rec['feature']}' from '{rec['original_value']}' to '{rec['recommended_value']}'.")
        print(f"   (Predicted probability of >50K would increase by {rec['predicted_prob_increase']:.2f} to {rec['new_predicted_prob']:.2f})")
else:
    print("No significant recommendations found for this user profile.")


# Example 2: A mid-career professional, full-time, but with a mid-level education
user_profile_2_data = {
    'age': 40,
    'workclass': 'Private',
    'fnlwgt': 180000,
    'education': 'Some-college',
    'education_num': 10,
    'marital_status': 'Married-civ-spouse',
    'occupation': 'Craft-repair',
    'relationship': 'Husband',
    'race': 'White',
    'sex': 'Male',
    'capital_gain': 0,
    'capital_loss': 0,
    'hours_per_week': 40,
    'native_country': 'United-States'
}
user_profile_2_df = pd.DataFrame([user_profile_2_data])

print("\n--- Recommendations for User 2 (40-year-old, Some-college, Craft-repair, 40hrs/week) ---")
recommendations_2 = get_recommendations(user_profile_2_df, model)
if recommendations_2:
    for i, rec in enumerate(recommendations_2):
        print(f"{i+1}. To increase income likelihood, consider changing '{rec['feature']}' from '{rec['original_value']}' to '{rec['recommended_value']}'.")
        print(f"   (Predicted probability of >50K would increase by {rec['predicted_prob_increase']:.2f} to {rec['new_predicted_prob']:.2f})")
else:
    print("No significant recommendations found for this user profile.")


# Example 3: A user already with a high education, but maybe in a lower-paying occupation
user_profile_3_data = {
    'age': 35,
    'workclass': 'Private',
    'fnlwgt': 200000,
    'education': 'Bachelors',
    'education_num': 13,
    'marital_status': 'Never-married',
    'occupation': 'Adm-clerical',
    'relationship': 'Not-in-family',
    'race': 'Asian-Pac-Islander',
    'sex': 'Female',
    'capital_gain': 0,
    'capital_loss': 0,
    'hours_per_week': 40,
    'native_country': 'India'
}
user_profile_3_df = pd.DataFrame([user_profile_3_data])

print("\n--- Recommendations for User 3 (35-year-old, Bachelors, Adm-clerical, 40hrs/week) ---")
recommendations_3 = get_recommendations(user_profile_3_df, model)
if recommendations_3:
    for i, rec in enumerate(recommendations_3):
        print(f"{i+1}. To increase income likelihood, consider changing '{rec['feature']}' from '{rec['original_value']}' to '{rec['recommended_value']}'.")
        print(f"   (Predicted probability of >50K would increase by {rec['predicted_prob_increase']:.2f} to {rec['new_predicted_prob']:.2f})")
else:
    print("No significant recommendations found for this user profile. This might happen if the initial prediction is already high, or no simple changes yield improvement.")




--- Testing Recommendation System with Simulated User Profiles ---

--- Recommendations for User 1 (25-year-old, HS-grad, Sales, 25hrs/week) ---

User's initial predicted probability of earning >50K: 0.01
No significant recommendations found for this user profile.

--- Recommendations for User 2 (40-year-old, Some-college, Craft-repair, 40hrs/week) ---

User's initial predicted probability of earning >50K: 0.31
1. To increase income likelihood, consider changing 'occupation' from 'Craft-repair' to 'Tech-support'.
   (Predicted probability of >50K would increase by 0.52 to 0.83)
2. To increase income likelihood, consider changing 'occupation' from 'Craft-repair' to 'Exec-managerial'.
   (Predicted probability of >50K would increase by 0.35 to 0.66)
3. To increase income likelihood, consider changing 'occupation' from 'Craft-repair' to 'Sales'.
   (Predicted probability of >50K would increase by 0.34 to 0.65)

--- Recommendations for User 3 (35-year-old, Bachelors, Adm-clerical, 40hrs/w