In [35]:
import pandas as pd
from sklearn.metrics import precision_score

In [26]:
df = pd.read_csv('census-income .csv')

In [27]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'annual_income'],
      dtype='object')

In [6]:
occupation = df['occupation']
num_occupations = occupation.nunique()
print("Number of types of occupations:", num_occupations)

Number of types of occupations: 15


In [11]:

# Count the number of people who are working as tech support and have an annual income greater than 50k
count = len(df[(df['occupation'] == 'Tech-support') & (df['annual_income'] == '>50K')])

print(f"The number of people working as tech support and have an annual income greater than 50k is {count}.")

The number of people working as tech support and have an annual income greater than 50k is 283.


In [14]:
missing_values = df.isnull().sum().sum()
print("Total number of missing values in the dataset:", missing_values)

Total number of missing values in the dataset: 0


In [18]:
num_people = len(df[(df['workclass'] == 'Private') & (df['native-country'] != 'United-States')])

print("Number of people:", num_people)

Number of people: 2561


In [20]:
count = ((df['hours-per-week'] >= 40) | (df['annual_income'] == ' <=50K')).sum()

print("Number of people who meet the conditions:", count)

Number of people who meet the conditions: 24798


In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# load the dataset
df = pd.read_csv('census-income .csv')
print(df.columns)
# rename the last column
df = df.rename(columns={"50000+": "Annual Income"})

# remove missing values
df = df.dropna()

# convert categorical data to numerical data
categorical_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# split the data into train and test sets
X = df.drop(columns=["annual_income"])
y = df["annual_income"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# build a logistic regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'annual_income'],
      dtype='object')


LogisticRegression()

In [30]:
from sklearn.metrics import accuracy_score

# fit the model on train data
lr.fit(X_train, y_train)

# make predictions on test data
y_pred = lr.predict(X_test)

# calculate accuracy score
acc_score = accuracy_score(y_test, y_pred)

print("Accuracy score:", acc_score)

Accuracy score: 0.80090080868052


In [31]:
df = df.rename(columns={14: 'Annual Income'})

# remove missing values
df = df.replace(' ?', pd.np.nan)
df = df.dropna()

# encode categorical data
categorical_cols = [1, 3, 5, 6, 7, 8, 9, 13]
for col in categorical_cols:
    le = LabelEncoder()
    df.iloc[:, col] = le.fit_transform(df.iloc[:, col])

# split the data into train and test sets
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# build the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# calculate the accuracy score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

  df = df.replace(' ?', pd.np.nan)


Accuracy: 0.80090080868052


In [33]:
from sklearn.metrics import confusion_matrix

# predict using the test data
y_pred = model.predict(X_test)

# generate confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)

# calculate specificity
specificity = conf_mat[0, 0] / (conf_mat[0, 0] + conf_mat[0, 1])
print("Specificity:", specificity)

Specificity: 0.9712434183880113


In [37]:
from sklearn.metrics import precision_score

y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred, pos_label='<=50K')

print("Precision (when target is <=50K):", precision)

Precision (when target is <=50K): 0.8059601165135559


In [38]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       <=50K       0.81      0.97      0.88      7407
        >50K       0.75      0.27      0.39      2362

    accuracy                           0.80      9769
   macro avg       0.78      0.62      0.64      9769
weighted avg       0.79      0.80      0.76      9769



In [39]:
from sklearn.metrics import f1_score

y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred, pos_label='>50K')

print("F1 score (when target is True):", f1)

F1 score (when target is True): 0.39313572542901715


In [40]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
n_correct = int(accuracy * len(y_test))
print("Number of records correctly classified:", n_correct)

Number of records correctly classified: 7824
