In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
url = "https://raw.githubusercontent.com/dsrscientist/dataset1/master/census_income.csv"
census_data = pd.read_csv(url)

In [40]:
# Display the first few rows of the dataset
print(census_data.head())

# Get information about the dataset
print(census_data.info())

# Summary statistics
print(census_data.describe())

# Check for missing values
print(census_data.isnull().sum())

   Age          Workclass  Fnlwgt   Education  Education_num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        Marital_status          Occupation    Relationship    Race      Sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White     Male   
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   Capital_gain  Capital_loss  Hours_per_week  Native_country  Income  
0             0             0              13   United-States   <=50

In [41]:
# Handle missing values if any
# Assuming there are no missing values based on the dataset description

# Encode categorical variables if any
# If there are categorical variables, we may need to encode them using one-hot encoding or label encoding

# Perform any necessary transformations
# This could include scaling numerical features, feature engineering, etc.

In [43]:
# Explore the distribution of the target variable
# Display the columns in the dataset
print(census_data.columns)

# Explore the relationship between features and the target variable
# For example, we can use visualizations such as histograms, box plots, or bar plots

Index(['Age', 'Workclass', 'Fnlwgt', 'Education', 'Education_num',
       'Marital_status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital_gain', 'Capital_loss', 'Hours_per_week', 'Native_country',
       'Income'],
      dtype='object')


In [50]:
# Split the data into features and target variable
X = census_data.drop('Income', axis=1)
y = census_data['Income']

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Perform one-hot encoding for categorical variables
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough')

X_encoded = preprocessor.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Choose a machine learning algorithm and train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7977579852579852


In [51]:
# Evaluate the model using other metrics (e.g., precision, recall, F1-score)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

# Fine-tune the model by adjusting hyperparameters or trying different algorithms
# Example: Grid search for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

              precision    recall  f1-score   support

       <=50K       0.80      0.97      0.88      4912
        >50K       0.74      0.27      0.40      1600

    accuracy                           0.80      6512
   macro avg       0.77      0.62      0.64      6512
weighted avg       0.79      0.80      0.76      6512



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best Parameters: {'C': 0.1}
