In [1]:
import numpy as np
import pandas as pd
import datetime as dt

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score as r2
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import random

import warnings
warnings.filterwarnings('ignore')

matplotlib.rcParams.update({'font.size': 14})

In [2]:
TRAIN_DATA_PATH = './data/train.csv'
TEST_DATA_PATH = './data/test.csv'

In [3]:
tr_df = pd.read_csv(TRAIN_DATA_PATH)
tt_df = pd.read_csv(TEST_DATA_PATH)

In [4]:
tr_df.sample(10).T

Unnamed: 0,589,592,205,294,595,657,245,323,311,702
PassengerId,590,593,206,295,596,658,246,324,312,703
Survived,0,0,0,0,0,0,0,1,1,0
Pclass,3,3,3,3,3,3,1,2,1,3
Name,"Murdlin, Mr. Joseph","Elsbury, Mr. William James","Strom, Miss. Telma Matilda","Mineff, Mr. Ivan","Van Impe, Mr. Jean Baptiste","Bourke, Mrs. John (Catherine)","Minahan, Dr. William Edward","Caldwell, Mrs. Albert Francis (Sylvia Mae Harb...","Ryerson, Miss. Emily Borie","Barbara, Miss. Saiide"
Sex,male,male,female,male,male,female,male,female,female,female
Age,,47,2,24,36,32,44,22,18,18
SibSp,0,0,0,0,1,1,2,1,2,0
Parch,0,0,1,0,1,1,0,1,2,1
Ticket,A./5. 3235,A/5 3902,347054,349233,345773,364849,19928,248738,PC 17608,2691
Fare,8.05,7.25,10.4625,7.8958,24.15,15.5,90,29,262.375,14.4542


In [5]:
tt_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
survived_by_embarked=tr_df.groupby(['Embarked'])['Survived'].value_counts().to_dict()

Доделать зависимость выживания от embarked и записать в класс 

In [7]:
class DataPreparation:
    def __init__(self):
        """Константы и средние значения"""
        self.med_age = None
    
    def meds_and_new_features(self, df):
        self.med_age = df['Age'].median()
        
    def prepare(self, df):
        df['PassengerId'] = df['PassengerId'].astype(str)
        
        df['Age'] = df['Age'].fillna(self.med_age)
        df['Embarked'] = df['Embarked'].fillna('S')
        
        df['Sex'] = df['Sex'].replace({'female': 1, 'male':0 })
        df['Pclass'] = df['Pclass'].replace({1: 3, 3: 1})
        return df


In [8]:
data_prepare = DataPreparation()

# train
data_prepare.meds_and_new_features(tr_df)
tr_df = data_prepare.prepare(tr_df)

# test
tt_df = data_prepare.prepare(tt_df)

In [9]:
features = ['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare']
target = ['Survived']

In [10]:
tr_df = tr_df[features + target]
tt_df = tt_df[features + ['PassengerId']]
X = tr_df[features]
y = tr_df[target]

In [11]:
tt_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,PassengerId.1
0,892,1,0,34.5,0,0,7.8292,892
1,893,1,1,47.0,1,0,7.0000,893
2,894,2,0,62.0,0,0,9.6875,894
3,895,1,0,27.0,0,0,8.6625,895
4,896,1,1,22.0,1,1,12.2875,896
5,897,1,0,14.0,0,0,9.2250,897
6,898,1,1,30.0,0,0,7.6292,898
7,899,2,0,26.0,1,1,29.0000,899
8,900,1,1,18.0,0,0,7.2292,900
9,901,1,0,21.0,2,0,24.1500,901


In [11]:
final_model = GradientBoostingRegressor(criterion='friedman_mse',
                                     max_depth=7,
                                     min_samples_leaf=50,  
                                     n_estimators=1000, 
                                     max_features='sqrt', 
                                     loss='huber', 
                                     learning_rate=0.02)
    
final_model.fit(X, y)

cv_score = cross_val_score(
    final_model,
    X,
    y,
    scoring='r2',
    cv=KFold(
            n_splits=5,
            shuffle=True,
            random_state=42
    )
)

In [12]:
print(f'R2: {round(cv_score.mean(), 3)}')

R2: 0.423


In [13]:
predictions_df = pd.DataFrame()
predictions_df['PassengerId'] = tt_df['PassengerId'].copy()

tt_df.set_index('PassengerId', inplace=True)
tt_df = tt_df[features]

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
y_pred_final = final_model.predict(data_test)

predictions_df['Price'] = y_pred_final
predictions_df.to_csv('./predictions.csv', index=False, encoding='utf-8', sep=',')