In [527]:

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [528]:
df_train = pd.read_csv("train_data.csv")
df_test = pd.read_csv("test_data.csv")

In [529]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_train)

      PassengerId  Survived  Pclass  Sex  SibSp  Parch Ticket  \
0             329         1       3    0      1      1    346   
1              74         0       3    1      1      0    166   
2             254         0       3    1      1      0    419   
3             720         0       3    1      0      0    260   
4             667         0       2    1      0      0    104   
5              31         0       1    1      0      0    472   
6             288         0       3    1      0      0    293   
7             218         0       2    1      1      0    119   
8             798         1       3    0      0      0    311   
9             372         0       3    1      1      0    203   
10            377         1       3    0      0      0    426   
11            377         1       3    0      0      0    426   
12            377         1       3    0      0      0    426   
13            377         1       3    0      0      0    426   
14            377        

In [530]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1493 non-null   int64  
 1   Survived     1493 non-null   int64  
 2   Pclass       1493 non-null   int64  
 3   Sex          1493 non-null   int64  
 4   SibSp        1493 non-null   int64  
 5   Parch        1493 non-null   int64  
 6   Ticket       1493 non-null   object 
 7   Fare         1493 non-null   object 
 8   Cabin        1493 non-null   int64  
 9   Embarked     1493 non-null   int64  
 10  Age          1488 non-null   float64
dtypes: float64(1), int64(8), object(2)
memory usage: 128.4+ KB


In [531]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,Age
count,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1493.0,1488.0
mean,597.697254,0.327528,2.261219,0.690556,0.184863,0.178835,5.994642,1.834561,215117.4
std,251.365652,0.469469,0.628908,0.462419,0.594308,0.592203,24.214966,0.538178,8295612.0
min,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,-32000.0
25%,377.0,0.0,2.0,0.0,0.0,0.0,-1.0,2.0,-35.0
50%,758.0,0.0,2.0,1.0,0.0,0.0,-1.0,2.0,16.0
75%,813.0,1.0,3.0,1.0,0.0,0.0,-1.0,2.0,24.0
max,891.0,1.0,3.0,1.0,5.0,6.0,133.0,2.0,320000000.0


In [532]:
(df_train["Age"] < 0).sum()

679

In [533]:
df_train.drop(df_train[df_train.Age < 0].index, inplace = True)

In [534]:
(df_train["Age"] < 0).sum()

0

In [535]:
ageNa = df_test["Age"].isna().sum()

print(f"Ammount of Nan Values in Age Series: {ageNa}")

Ammount of Nan Values in Age Series: 0


In [536]:
(df_train["Age"] > 100).sum()

6

In [537]:
df_train.drop(df_train[df_train.Age > 100].index, inplace = True)

In [538]:
df_train["PassengerId"] = df_train["PassengerId"].drop_duplicates()

In [539]:
def sjekk_df(x):
    try:
        return float(x)
    except:
        return -1 

In [540]:

df_train["Fare"] = df_train.Fare.apply(lambda x: sjekk_df(x))

In [541]:
df_train["Ticket"] = df_train["Ticket"].drop_duplicates()

In [542]:
df_train = df_train.dropna()

In [543]:
df_train["Ticket"] = df_train.Fare.apply(lambda x: sjekk_df(x))

In [544]:
df_train['Ticket'].dtypes

dtype('float64')

In [545]:
model = xgb.XGBRegressor()

In [546]:
X_train = df_train.drop("Age", axis=1)
y_train = df_train["Age"]
y_test = df_test.drop("Age", axis=1)
X_train = df_test["Age"]

In [547]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [548]:
model.fit(X_train, y_train)

In [549]:
preds = model.predict(X_test)

In [550]:
mse = mean_squared_error(preds, y_test)

In [551]:
print(np.sqrt(mse))

14.351054823483793


In [552]:
model2 = xgb.XGBRegressor()

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=250, scoring="neg_mean_squared_error", n_jobs=-1, cv=5)

# Fit the model with x and y train sets
random_search.fit(X_train, y_train)

In [553]:
model_new = random_search.best_params_
model_new

{'n_estimators': 100,
 'min_child_weight': 5,
 'max_depth': 4,
 'learning_rate': 0.05,
 'gamma': 0.0,
 'colsample_bytree': 0.5}

In [554]:
model_new = random_search.best_estimator_

In [555]:
type(model_new)

xgboost.sklearn.XGBRegressor

In [556]:
preds = model_new.predict(X_test)

In [557]:
mse_new = mean_squared_error(preds, y_test)

mse_new

179.96798527580697

In [558]:

print(f"relation between better error on the new model and the old error: {(mse_new / mse)}")

relation between better error on the new model and the old error: 0.8738313221172639
