In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the data from a CSV file (replace 'path_to_file' with the actual file path)
data = pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset1/master/census_income.csv')

# Preprocess the data
# Remove unnecessary columns (if any)
data = data.drop(['Fnlwgt'], axis=1)

# Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data)

# Separate the features and target variable
X = data.drop(['Income_ >50K'], axis=1)
y = data['Income_ >50K']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Example prediction for a new data point
new_data = pd.DataFrame({
    'age': [35],
    'education_num': [8],
    'capital_gain': [0],
    'capital_loss': [0],
    'hours_per_week': [40],
    'workclass_ Private': [1],
    'workclass_ Self-emp-not-inc': [0],
    'workclass_ Local-gov': [0],
    'workclass_ ?': [0],
    'marital_status_ Married-civ-spouse': [1],
    'marital_status_ Never-married': [0],
    'marital_status_ Divorced': [0],
    'occupation_ Exec-managerial': [0],
    'occupation_ Prof-specialty': [1],
    'occupation_ Other-service': [0],
    'occupation_ ?': [0],
    'relationship_ Husband': [1],
    'relationship_ Not-in-family': [0],
    'relationship_ Own-child': [0],
    'relationship_ Unmarried': [0],
    'race_ White': [1],
    'race_ Black': [0],
    'race_ Asian-Pac-Islander': [0],
    'race_ Amer-Indian-Eskimo': [0],
    'sex_ Male': [1],
    'sex_ Female': [0],
    'native_country_ United-States': [1],
    'native_country_ Mexico': [0],
    'native_country_ ?': [0]
})

# Ensure the new_data columns match the training data columns
new_data = new_data.reindex(columns=X.columns, fill_value=0)

prediction = model.predict(new_data)
print(f"Prediction: {'>50K' if prediction[0] else '<=50K'}")


Accuracy: 1.0
Prediction: >50K
