In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [72]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_test["Survived"] = False
raw_data = pd.concat([df_train, df_test], sort=False)
raw_data = raw_data.drop(columns=["PassengerId", "Name"], axis=1)

In [73]:
raw_data.shape[0] == df_train.shape[0] + df_test.shape[0]

True

In [74]:
raw_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,female,35.0,1,0,113803,53.1000,C123,S
4,0,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
413,0,3,male,,0,0,A.5. 3236,8.0500,,S
414,0,1,female,39.0,0,0,PC 17758,108.9000,C105,C
415,0,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,0,3,male,,0,0,359309,8.0500,,S


## Поис NaN в датасете

In [75]:
raw_data.isna().sum()

Survived       0
Pclass         0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [76]:
raw_data.Cabin.isna().value_counts()

Cabin
True     1014
False     295
Name: count, dtype: int64

In [77]:
100 - ((raw_data.Cabin.isna().value_counts()[1] / raw_data.Cabin.isna().value_counts()[0]) * 100)

  100 - ((raw_data.Cabin.isna().value_counts()[1] / raw_data.Cabin.isna().value_counts()[0]) * 100)


70.90729783037476

In [78]:
clean_data = raw_data.drop("Cabin", axis=1)

In [79]:
median_age = clean_data.Age.median()
clean_data.Age = clean_data.Age.fillna(median_age)

In [80]:
clean_data.Embarked = clean_data.Embarked.fillna("U")

In [81]:
clean_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.2500,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,S
3,1,1,female,35.0,1,0,113803,53.1000,S
4,0,3,male,35.0,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...
413,0,3,male,28.0,0,0,A.5. 3236,8.0500,S
414,0,1,female,39.0,0,0,PC 17758,108.9000,C
415,0,3,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S
416,0,3,male,28.0,0,0,359309,8.0500,S


In [82]:
gender_columns = pd.get_dummies(clean_data.Sex, prefix="Sex")
embarked_columns = pd.get_dummies(clean_data.Embarked, prefix="Embarked")

clean_data = pd.concat([clean_data, gender_columns], axis=1)
clean_data = pd.concat([clean_data, embarked_columns], axis=1)

clean_data = clean_data.drop(["Sex", "Embarked"], axis=1)
clean_data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U
0,0,3,22.0,1,0,A/5 21171,7.2500,False,True,False,False,True,False
1,1,1,38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False
2,1,3,26.0,0,0,STON/O2. 3101282,7.9250,True,False,False,False,True,False
3,1,1,35.0,1,0,113803,53.1000,True,False,False,False,True,False
4,0,3,35.0,0,0,373450,8.0500,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,3,28.0,0,0,A.5. 3236,8.0500,False,True,False,False,True,False
414,0,1,39.0,0,0,PC 17758,108.9000,True,False,True,False,False,False
415,0,3,38.5,0,0,SOTON/O.Q. 3101262,7.2500,False,True,False,False,True,False
416,0,3,28.0,0,0,359309,8.0500,False,True,False,False,True,False


In [83]:
pclass_columns = pd.get_dummies(clean_data["Pclass"], prefix="Pclass")

In [84]:
clean_data = pd.concat([clean_data, pclass_columns], axis=1)
clean_data = clean_data.drop("Pclass", axis=1)

In [85]:
clean_data

Unnamed: 0,Survived,Age,SibSp,Parch,Ticket,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,Pclass_1,Pclass_2,Pclass_3
0,0,22.0,1,0,A/5 21171,7.2500,False,True,False,False,True,False,False,False,True
1,1,38.0,1,0,PC 17599,71.2833,True,False,True,False,False,False,True,False,False
2,1,26.0,0,0,STON/O2. 3101282,7.9250,True,False,False,False,True,False,False,False,True
3,1,35.0,1,0,113803,53.1000,True,False,False,False,True,False,True,False,False
4,0,35.0,0,0,373450,8.0500,False,True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,28.0,0,0,A.5. 3236,8.0500,False,True,False,False,True,False,False,False,True
414,0,39.0,0,0,PC 17758,108.9000,True,False,True,False,False,False,True,False,False
415,0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,False,True,False,False,True,False,False,False,True
416,0,28.0,0,0,359309,8.0500,False,True,False,False,True,False,False,False,True


In [86]:
bins = [x for x in range(0, 81, 10)]
categorized_age = pd.cut(clean_data.Age, bins)
clean_data["Categorized_age"] = categorized_age
clean_data = clean_data.drop(["Age"], axis=1)

In [87]:
cagegorized_age_columns = pd.get_dummies(clean_data['Categorized_age'], prefix='Categorized_age')
clean_data = pd.concat([clean_data, cagegorized_age_columns], axis=1)
clean_data = clean_data.drop(['Categorized_age'], axis=1)

In [88]:
clean_data = clean_data.drop(["Ticket"], axis=1)

In [89]:
clean_data

Unnamed: 0,Survived,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_U,...,Pclass_2,Pclass_3,"Categorized_age_(0, 10]","Categorized_age_(10, 20]","Categorized_age_(20, 30]","Categorized_age_(30, 40]","Categorized_age_(40, 50]","Categorized_age_(50, 60]","Categorized_age_(60, 70]","Categorized_age_(70, 80]"
0,0,1,0,7.2500,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
1,1,1,0,71.2833,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,7.9250,True,False,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
3,1,1,0,53.1000,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,0,0,8.0500,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0,0,0,8.0500,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False
414,0,0,0,108.9000,True,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,False
415,0,0,0,7.2500,False,True,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False
416,0,0,0,8.0500,False,True,False,False,True,False,...,False,True,False,False,True,False,False,False,False,False


In [90]:
clean_data["Fare"] = clean_data["Fare"].fillna(clean_data["Fare"].median())

In [91]:
clean_data.isna().sum()

Survived                    0
SibSp                       0
Parch                       0
Fare                        0
Sex_female                  0
Sex_male                    0
Embarked_C                  0
Embarked_Q                  0
Embarked_S                  0
Embarked_U                  0
Pclass_1                    0
Pclass_2                    0
Pclass_3                    0
Categorized_age_(0, 10]     0
Categorized_age_(10, 20]    0
Categorized_age_(20, 30]    0
Categorized_age_(30, 40]    0
Categorized_age_(40, 50]    0
Categorized_age_(50, 60]    0
Categorized_age_(60, 70]    0
Categorized_age_(70, 80]    0
dtype: int64

In [92]:
clean_data.shape

(1309, 21)

In [93]:
df_train, df_test = clean_data[:df_train.shape[0]], clean_data[df_train.shape[0]:]
df_test = df_test.drop(columns="Survived")
df_train.shape, df_test.shape

((891, 21), (418, 20))

In [94]:
X = df_train.drop(columns="Survived")
y = df_train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models training

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [96]:
dt_model = DecisionTreeClassifier()
nb_model = GaussianNB()
svm_model = SVC()
rf_model = RandomForestClassifier()
gb_model = GradientBoostingClassifier()
ab_model = AdaBoostClassifier()
models = [lr_model, dt_model, nb_model, svm_model, rf_model, gb_model, ab_model]

In [97]:
accuracies = []
for model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [98]:
accuracies

[0.7932960893854749,
 0.8100558659217877,
 0.7318435754189944,
 0.659217877094972,
 0.8212290502793296,
 0.8044692737430168,
 0.7932960893854749]

In [99]:
df_dummy = pd.read_csv("test.csv")
pred = rf_model.predict(df_test)

final = pd.DataFrame()
final["PassengerId"] = df_dummy["PassengerId"]
final["Survived"] = pred

final.to_csv("wilden_submission.csv", index=False)