In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

RANDOM_SEED = 42

def show_metrics(y_test, y_pred, probs):
    print('accuracy_score:\t\t {:.4}'.format(accuracy_score(y_test, y_pred)))
    print('precision_score:\t {:.4}'.format(precision_score(y_test, y_pred, zero_division=0)))
    print('recall_score:\t\t {:.4}'.format(recall_score(y_test, y_pred, zero_division=0)))
    print('f1_score:\t\t {:.4}'.format(f1_score(y_test, y_pred, zero_division=0)))
    print('roc_auc_score:\t\t {:.4}'.format(roc_auc_score(y_test, probs)))
    

gender = pd.read_csv('gender.csv')
gender.drop('Unnamed: 0', axis=1, inplace=True)

# Присоединить дублирующие столбцы к основным.

gender1 = gender[[
                'index',
                'long_hair',
                'forehead_width_cm',
                'forehead_height_cm',
                'forehead_width_mm',
                'forehead_width_conventional_units',
                'nose_wide',
                'nose_long',
                'lips_thin',
                'distance_nose_to_lip_long',
                'gender'
]]

gender2 = gender[[
                'index.1',
                'long_hair.1',
                'forehead_width_cm.1',
                'forehead_height_cm.1',
                'forehead_width_mm.1',
                'forehead_width_conventional_units.1',
                'nose_wide.1',
                'nose_long.1',
                'lips_thin.1',
                'distance_nose_to_lip_long.1',
                'gender.1'
]]

gender2.columns = [
                'index',
                'long_hair',
                'forehead_width_cm',
                'forehead_height_cm',
                'forehead_width_mm',
                'forehead_width_conventional_units',
                'nose_wide',
                'nose_long',
                'lips_thin',
                'distance_nose_to_lip_long',
                'gender'
]

frames = [gender1, gender2]
gender = pd.concat(frames, ignore_index=True)

# Столбцы 'nose_wide', 'nose_long', 'lips_thin',
#'distance_nose_to_lip_long', 'long_hair' содержат информацию
# в условных единицах. Известно, что измерения проводились
# разными людьми с разными измерительными инструментами,
#поэтому данные в столбцах имеют разную погрешность измерений.
# Необходимо уменьшить влияние погрешности путем приравнивания
# значений диапазона [0,0.5) к 0, а [0.5, 1) —- к 1

func = lambda x: 1 if x >= 0.5 else 0

for col in [
    'nose_wide', 'nose_long', 'lips_thin',
    'distance_nose_to_lip_long', 'long_hair'
]:
    gender[col] = gender[col].apply(func)
    
    
# Оставить один из дублирующих друг друга столбцов.
gender.drop(['forehead_width_mm', 'forehead_width_conventional_units'],
            axis=1, inplace=True)

gender.head().T

Unnamed: 0,0,1,2,3,4
index,0.0,1.0,2.0,3.0,4.0
long_hair,1.0,0.0,0.0,0.0,1.0
forehead_width_cm,11.8,14.0,11.8,14.4,13.5
forehead_height_cm,6.1,5.4,6.3,6.1,5.9
nose_wide,1.0,0.0,1.0,0.0,0.0
nose_long,0.0,0.0,1.0,1.0,0.0
lips_thin,1.0,1.0,1.0,1.0,0.0
distance_nose_to_lip_long,1.0,0.0,1.0,1.0,0.0
gender,,,,,


In [2]:
gender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5030 entries, 0 to 5029
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   index                      5030 non-null   int64  
 1   long_hair                  5030 non-null   int64  
 2   forehead_width_cm          5001 non-null   float64
 3   forehead_height_cm         5001 non-null   float64
 4   nose_wide                  5030 non-null   int64  
 5   nose_long                  5030 non-null   int64  
 6   lips_thin                  5030 non-null   int64  
 7   distance_nose_to_lip_long  5030 non-null   int64  
 8   gender                     4000 non-null   object 
dtypes: float64(2), int64(6), object(1)
memory usage: 353.8+ KB


In [3]:
train = gender[gender['gender'].isna() == False]
test = gender[gender['gender'].isna() == True]

mask = lambda x: 1 if x == 'Female' else 0
train['gender'] = train['gender'].apply(mask)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['gender'] = train['gender'].apply(mask)


In [4]:
train.gender.value_counts()

1    2006
0    1994
Name: gender, dtype: int64

Определите пол по остальным параметрам из выборки для первой тысячи значений и прикрепите его в CSV формате. Файл должен содержать только прогнозные значения в формате "Male"/"Female", в одном столбце без заголовка

Пороговое значение для данной задачи классификации считать равным 0.5.

Для успешного прохождения достаточно иметь 70% точность решения

In [5]:
X = train.drop(['gender'], axis=1).values
y = train['gender'].values

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_SEED)

gbc = GradientBoostingClassifier(learning_rate=0.01,
                                 max_depth=3,
                                 n_estimators=500)

gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_valid)
probs = gbc.predict_proba(X_valid)
probs = probs[:,1]

show_metrics(y_valid, y_pred, probs)

accuracy_score:		 0.9812
precision_score:	 0.9726
recall_score:		 0.9899
f1_score:		 0.9812
roc_auc_score:		 0.9974


In [6]:
test.dropna(subset=['forehead_height_cm'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.dropna(subset=['forehead_height_cm'], inplace=True)


In [7]:
X_test = test.drop(['gender'], axis=1).values
y_pred = gbc.predict_proba(X_test)
y_pred = y_pred[:,1]


result = pd.DataFrame(y_pred, )
classes = lambda x: 'Female' if x >= 0.5 else 'Male'
result[0] = result[0].apply(classes)
# result = result[:1000]
result

Unnamed: 0,0
0,Male
1,Female
2,Male
3,Male
4,Female
...,...
996,Female
997,Female
998,Male
999,Male


In [8]:
result.to_csv('submission.csv', index=False)

In [9]:
result.value_counts()

Female    516
Male      485
dtype: int64