In [321]:

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [322]:
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

In [323]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,Age
count,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1488.0
mean,597.697254,0.327528,2.261219,0.690556,0.184863,0.178835,5.994642,1.834561,215117.4
std,251.365652,0.469469,0.628908,0.462419,0.594308,0.592203,24.214966,0.538178,8295612.0
min,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-32000.0
25%,377.0,0.0,2.0,0.0,0.0,0.0,-1.0,2.0,-35.0
50%,758.0,0.0,2.0,1.0,0.0,0.0,-1.0,2.0,16.0
75%,813.0,1.0,3.0,1.0,0.0,0.0,-1.0,2.0,24.0
max,891.0,1.0,3.0,1.0,5.0,6.0,133.0,2.0,320000000.0


In [324]:
(df_train["Age"] < 0).sum()

679

In [325]:
df_train.drop(df_train[df_train.Age < 0].index, inplace = True)

In [326]:
(df_train["Age"] < 0).sum()

0

In [327]:
ageNa = df_test["Age"].isna().sum()

print(f"Ammount of Nan Values in Age Series: {ageNa}")

Ammount of Nan Values in Age Series: 0


In [328]:
(df_train["Age"] > 100).sum()

6

In [329]:
df_train.drop(df_train[df_train.Age > 100].index, inplace = True)

In [330]:
df_train["PassengerId"] = df_train["PassengerId"].drop_duplicates()

In [331]:
def sjekk_df(x):
    try:
        return float(x)
    except:
        return -1 

In [332]:

df_train["Fare"] = df_train.Fare.apply(lambda x: sjekk_df(x))

In [333]:
df_train["Ticket"] = df_train["Ticket"].drop_duplicates()

In [334]:
df_train = df_train.dropna()

In [335]:
df_train["Ticket"] = df_train.Fare.apply(lambda x: sjekk_df(x))

In [336]:
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 436 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  436 non-null    float64
 1   Survived     436 non-null    int64  
 2   Pclass       436 non-null    int64  
 3   Sex          436 non-null    int64  
 4   SibSp        436 non-null    int64  
 5   Parch        436 non-null    int64  
 6   Ticket       436 non-null    float64
 7   Fare         436 non-null    float64
 8   Cabin        436 non-null    int64  
 9   Embarked     436 non-null    int64  
 10  Age          436 non-null    float64
dtypes: float64(4), int64(7)
memory usage: 40.9 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  143 non-null    int64  
 1   Survived     143 non-null    int64  
 2   Pclass       143 no

In [337]:
model = xgb.XGBRegressor()

In [338]:
X_train = df_train.drop("Age", axis=1)
y_train = df_train["Age"]

In [339]:
X_train, X_train_test, y_train, y_train_Test = train_test_split(X_train, y_train, train_size=0.8, random_state=42)

In [340]:
model.fit(X_train, y_train)

In [341]:
preds = model.predict(X_train_test)

In [342]:
mse = mean_squared_error(preds, y_train_Test)
mse_root = np.sqrt(mse)

In [343]:
print(mse_root)

14.351054823483793


In [344]:
params={
    "learning_rate":[0.05, 0.10, 0.15,0.20, 0.25, 0.30],
    "max_depth":[3,4,5,6,8,10,12,15],
    "min_child_weight":[1,3,5,7],
    "gamma":[0.0,0.1,0.2,0.3,0.4],
    "colsample_bytree":[0.3,0.4,0.5,0.7],
    "n_estimators":[100,200,300,400,500,900,1100,1500]

}

In [345]:
model2 = xgb.XGBRegressor()

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=250, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)

# Fit the model with x and y train sets
random_search.fit(X_train, y_train)

In [346]:
model_new = random_search.best_params_
model_new

{'n_estimators': 100,
 'min_child_weight': 3,
 'max_depth': 3,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.5}

In [347]:
model_new = random_search.best_estimator_

In [348]:
preds = model_new.predict(X_train_test)

In [349]:
mse_new = mean_squared_error(preds, y_train_Test)
mse_root_new = np.sqrt(mse_new)
print(mse_root_new)

13.638754655285892


In [350]:
X_test = df_test.drop("Age", axis=1)
Y_test = df_test["Age"]

In [351]:

preds_test = model_new.predict(X_test)

In [355]:
Y_test.info()

<class 'pandas.core.series.Series'>
RangeIndex: 143 entries, 0 to 142
Series name: Age
Non-Null Count  Dtype  
--------------  -----  
143 non-null    float64
dtypes: float64(1)
memory usage: 1.2 KB


In [356]:
mse_test=mean_squared_error(preds_test,Y_test)

In [357]:

mse_test_root = np.sqrt(mse_test)
print(mse_test_root)


11.680836766631199


In [359]:
print(f'relation between better error on the new model and the old error: {mse_test_root/mse_root_new}')

relation between better error on the new model and the old error: 0.8564445260479941
