In [1]:
# Import libraries
import pandas as pd

# Load the dataset with specified column names.
# The raw file has no header row and separates fields with ", " (comma + space):
# without skipinitialspace every string value keeps a leading blank
# (" Private", " Bachelors", ...), which then leaks into category labels and
# one-hot column names. Per the dataset's documentation, unknown values are
# written as "?"; declaring that as an NA marker lets isnull()/fillna() see them.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income (class)']
df = pd.read_csv('adult.data', names=column_names, skipinitialspace=True, na_values='?')

# View the first few rows of the dataset
print(df.head())

# Check the data types of the columns
print(df.dtypes)

# Check for missing values (now counts the "?" entries as missing)
print(df.isnull().sum())

# Handle missing values
# Categorical gaps get an explicit 'Unknown' label instead of the number 0,
# which would mix integers into string columns; any remaining (numeric) gaps
# are still filled with 0 as before.
categorical_cols = df.select_dtypes(exclude='number').columns
df = df.fillna({col: 'Unknown' for col in categorical_cols}).fillna(0)

# Handle invalid data
df = df.replace(-1, 0)  # Replace invalid values (-1) with 0

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country income (class)  
0          2174             0              40   United-States

In [2]:
# Standardize capital-gain in place: subtract the column mean, divide by the
# column standard deviation (z-score)
gain = df['capital-gain']
df['capital-gain'] = (gain - gain.mean()) / gain.std()

# Report the mean and variance of the standardized column
normalized_gain = df['capital-gain']
print(normalized_gain.mean())
print(normalized_gain.var())

1.4838888864496425e-17
1.0000000000000004


In [3]:
# Cut age into five quantile-based intervals
bins = pd.qcut(df['age'], q=5)

# Show how many rows fall into each interval
bin_counts = bins.value_counts()
print(bin_counts)

(16.999, 26.0]    7196
(33.0, 41.0]      6763
(50.0, 90.0]      6460
(41.0, 50.0]      6175
(26.0, 33.0]      5967
Name: age, dtype: int64


In [4]:
# Build one indicator column per distinct workclass value
workclass_dummies = pd.get_dummies(df['workclass'])

# Swap the original workclass column for its indicator columns:
# remove it from the frame, then append the indicators on the right
df = pd.concat([df.drop(columns='workclass'), workclass_dummies], axis=1)

In [5]:
# One new attribute that could be defined based on the existing attributes is the education_level attribute.
# This attribute could be created by combining the education and education-num attributes.

# The education attribute provides the name of the highest educational degree that an individual has completed 
# (e.g., Bachelors, Masters, Doctorate), while the education-num attribute provides the number of years of 
# education that an individual has completed. 

# This new attribute can provide more detailed insight into an individual's education level. 
# For example, an individual who has completed a Bachelors degree might have a different education level than
# someone who has completed a Masters degree, even if they have completed the same number of years of education.

In [6]:
# Keep just the int64/float64 columns, i.e. the continuous attributes
continuous_dtypes = ['float64', 'int64']
df_cont = df.select_dtypes(include=continuous_dtypes)

# Pairwise correlations between the continuous attributes
corr_matrix = df_cont.corr()

# Correlation of every continuous attribute with hours-per-week,
# listed from highest to lowest
hours_corr = corr_matrix['hours-per-week']
print(hours_corr.sort_values(ascending=False))

hours-per-week    1.000000
education-num     0.148123
capital-gain      0.078409
age               0.068756
capital-loss      0.054256
fnlwgt           -0.018768
Name: hours-per-week, dtype: float64
