# 1. Обзор структуры данных

In [29]:
import pandas as pd

column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

df = pd.read_csv("adult.data.csv", header=None, names=column_names, na_values=" ?", skipinitialspace=True)

In [30]:
print(df.info())
print(df.describe(include="all"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
                 age workclass        fnlwgt education  education-num  \
count   32561.0

# 2. Обработка пропущенных значений

In [31]:
for column in ["workclass", "occupation", "native-country"]:
    mode_value = df[column].mode()[0]
    df[column] = df[column].fillna(mode_value)

# 3. Обнаружение и удаление выбросов

In [33]:
for column in ["age", "hours-per-week"]:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]


# 4. Масштабирование числовых признаков

In [34]:
for column in ["age", "education-num", "hours-per-week"]:
    mean = df[column].mean()
    std = df[column].std()
    df[column] = (df[column] - mean) / std

# 5. Кодирование категориальных признаков

In [35]:
education_order = {
    "Preschool": 0,
    "1st-4th": 1,
    "5th-6th": 2,
    "7th-8th": 3,
    "9th": 4,
    "10th": 5,
    "11th": 6,
    "12th": 7,
    "HS-grad": 8,
    "Some-college": 9,
    "Assoc-voc": 10,
    "Assoc-acdm": 11,
    "Bachelors": 12,
    "Masters": 13,
    "Prof-school": 14,
    "Doctorate": 15
}
df["education"] = df["education"].map(education_order)

In [36]:
df = pd.get_dummies(df, columns=["workclass"], drop_first=True)

# 6. Финальный обзор данных

In [28]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 23499 entries, 0 to 32560
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         23499 non-null  float64
 1   fnlwgt                      23499 non-null  int64  
 2   education                   0 non-null      float64
 3   education-num               23499 non-null  float64
 4   marital-status              23499 non-null  object 
 5   occupation                  23499 non-null  object 
 6   relationship                23499 non-null  object 
 7   race                        23499 non-null  object 
 8   sex                         23499 non-null  object 
 9   capital-gain                23499 non-null  int64  
 10  capital-loss                23499 non-null  int64  
 11  hours-per-week              23499 non-null  float64
 12  native-country              23499 non-null  object 
 13  income                      23499 no