In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Load the dataset from the working directory.
df = pd.read_csv('adult.csv')

# Print a quick overview: each heading is followed by the matching summary.
overview_sections = (
    ('DataFrame shape:', df.shape),
    ('Data types:', df.dtypes),
    ('Summary statistics:', df.describe()),
    ('First few rows:', df.head()),
)
for heading, summary in overview_sections:
    print(heading)
    print(summary)

In [None]:
# Cells holding the literal '?' are counted as missing; build the boolean
# mask once and derive both the per-column count and percentage from it.
placeholder_mask = df.isin(['?'])

print('Number of rows with missing values:')
print(placeholder_mask.sum())

print('Percentage of rows with missing values:')
print(placeholder_mask.mean() * 100)

In [None]:
# Fill missing values with mode.
# The '?' placeholder is first converted to NaN so pandas recognises it as
# missing. `pd.np` was deprecated in pandas 1.0 and removed in 2.0, so NaN is
# taken from numpy directly.
df = df.replace('?', np.nan)
# df.mode() returns one row per mode; row 0 holds the first mode of each
# column and is used as that column's fill value.
df = df.fillna(df.mode().iloc[0])

print('Missing values after filling:')
print(df.isnull().sum())

print('First few rows after filling:')
print(df.head())

In [None]:

# Columns that are integer-encoded below (the target 'income' included).
categorical_columns = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']

# Keep one fitted encoder per column. Refitting a single shared LabelEncoder
# discards every mapping except the last, which makes it impossible to map
# the integer codes back to their original labels later on.
encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# After the loop `le` is the encoder fitted on the last column ('income'),
# as it was when one shared encoder was refit in place.

print('Dataframe with encoded categorical variables:')
print(df.head())

print('Value counts for encoded categorical columns:')
for col in categorical_columns:
    print(f'{col}:')
    print(df[col].value_counts())

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the columns
# listed in `categorical_columns`.
# NOTE(review): these columns hold label-encoded integer codes; for nominal
# categories the coefficient depends on the code assignment, so interpret the
# heatmap with care — confirm whether a categorical association measure is
# wanted instead.
corr_matrix = df[categorical_columns].corr()

plt.figure(figsize=(10,8), facecolor='white')
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Encoded Categorical Variables')
plt.tight_layout()
# Render the figure explicitly; a bare `plt` expression only displays the
# module's repr as the cell output.
plt.show()

In [None]:
# Relative frequency of each value in the 'income' target column.
income_share = df['income'].value_counts(normalize=True)

print('Target variable distribution:')
print(income_share)

In [None]:

# Separate the target column from the feature columns.
y = df['income']
X = df.drop(columns='income')

# Bar chart of how many rows fall into each income category.
plt.figure(figsize=(6, 4), facecolor='white')
income_counts = y.value_counts()
income_counts.plot.bar()
plt.xlabel('Income Category')
plt.ylabel('Count')
plt.title('Income Category Distribution')
plt.xticks(rotation=0)  # keep the category tick labels horizontal
plt.show()