In [473]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [474]:
# Load the training data, using the PassengerId column as the row index.
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path, index_col="PassengerId")

In [475]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [476]:
# Distinct values that occur in the Pclass column.
np.unique(df["Pclass"].to_numpy())

array([1, 2, 3])

In [477]:
# Remove the Ticket, Cabin and Name columns. Rebinding `df` (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable:
# executing it a second time no longer raises KeyError for columns that are
# already gone.
df = df.drop(columns=["Ticket", "Cabin", "Name"], errors="ignore")

In [478]:
# Show the second row (position 1) as a plain Python list.
df.iloc[1].to_numpy().tolist()

[1, 1, 'female', 38.0, 1, 0, 71.2833, 'C']

In [479]:
# List the column names currently present in the frame.
df.columns.tolist()

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [480]:
# Preview the frame again now that only the modelling columns remain.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [481]:
# Count the missing values in each column.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [482]:
def missing_values_mode(df):
    """Compute the fill values for missing data and persist them with joblib.

    The most frequent ``Embarked`` value is stored at ``../model/embarked_mode``
    and the most frequent ``Age`` value, truncated to ``int``, at
    ``../model/age_mode``. ``df`` is only read; nothing is returned.
    """
    embarked_mode = df["Embarked"].mode()[0]
    age_mode = int(df["Age"].mode()[0])

    artifacts = (
        (embarked_mode, "../model/embarked_mode"),
        (age_mode, "../model/age_mode"),
    )
    for value, path in artifacts:
        joblib.dump(value, path)

In [483]:
def missing_value_transform(df):
    """Fill missing ``Age`` and ``Embarked`` entries with the persisted modes.

    The fill values are read from ``../model/embarked_mode`` and
    ``../model/age_mode``. The two columns are reassigned on ``df`` itself,
    and that same frame is returned.
    """
    embarked_fill = joblib.load("../model/embarked_mode")
    age_fill = joblib.load("../model/age_mode")

    for column, fill_value in (("Age", age_fill), ("Embarked", embarked_fill)):
        df[column] = df[column].fillna(fill_value)
    return df

In [484]:
# Re-check the missing-value counts. They are unchanged at this point because
# the imputation helpers above have only been defined, not yet applied.
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [485]:
# Preview the frame once more; no transformation has been applied to it yet.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


In [486]:
def fit_gender(df):
    """Fit a dense one-hot encoder on the ``Sex`` column and persist it.

    The fitted encoder is written to ``../model/gender_enc`` with joblib.
    Categories not seen during fitting are ignored at transform time
    (``handle_unknown='ignore'``). ``df`` is only read; nothing is returned.
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the old ``sparse`` keyword was deprecated in 1.2 and removed in 1.4.
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    enc.fit(df[["Sex"]])
    joblib.dump(enc, "../model/gender_enc")

In [487]:
def transform_gender(df):
    """Replace the ``Sex`` column with its one-hot encoded columns.

    Loads the encoder stored at ``../model/gender_enc``, appends one column per
    encoder output feature (named by ``get_feature_names_out``) and removes the
    original ``Sex`` column. ``df`` is modified in place and also returned.
    """
    encoder = joblib.load("../model/gender_enc")
    encoded = encoder.transform(df[["Sex"]])
    encoded_names = encoder.get_feature_names_out().tolist()

    df[encoded_names] = encoded
    df.drop(columns="Sex", inplace=True)
    return df

In [488]:
def fit_emb(df):
    """Fit a dense one-hot encoder on the ``Embarked`` column and persist it.

    The fitted encoder is written to ``../model/emb_enc`` with joblib.
    Categories not seen during fitting are ignored at transform time
    (``handle_unknown='ignore'``). ``df`` is only read; nothing is returned.
    """
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``;
        # the old ``sparse`` keyword was deprecated in 1.2 and removed in 1.4.
        enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword.
        enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    enc.fit(df[["Embarked"]])
    joblib.dump(enc, "../model/emb_enc")

In [489]:
def transform_emb(df):
    """Replace the ``Embarked`` column with its one-hot encoded columns.

    Loads the encoder stored at ``../model/emb_enc``, appends one column per
    encoder output feature (named by ``get_feature_names_out``) and removes the
    original ``Embarked`` column. ``df`` is modified in place and also returned.
    """
    encoder = joblib.load("../model/emb_enc")
    encoded = encoder.transform(df[["Embarked"]])
    encoded_names = encoder.get_feature_names_out().tolist()

    df[encoded_names] = encoded
    df.drop(columns="Embarked", inplace=True)
    return df

In [490]:
def min_max_scaler_fit(df):
    """Fit a ``MinMaxScaler`` on every column except the first and persist it.

    The first column is skipped (presumably the target label - confirm against
    the caller). The fitted scaler is written to ``../model/scaler``.
    ``df`` is only read; nothing is returned.
    """
    feature_columns = df.columns[1:]
    fitted_scaler = MinMaxScaler().fit(df[feature_columns])
    joblib.dump(fitted_scaler, "../model/scaler")
    

In [491]:
def min_max_scaler_trandform(df, col=None):
    """Scale the feature columns of ``df`` with the persisted ``MinMaxScaler``.

    Parameters
    ----------
    df : DataFrame
        Frame to scale; the selected columns are overwritten in place.
    col : sequence of column labels, optional
        Column labels whose first entry is skipped and whose remaining entries
        are scaled. Defaults to ``df.columns``. The argument used to be
        overwritten unconditionally, so a value passed by the caller was
        silently ignored; it is now honoured.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the selected columns scaled.
    """
    if col is None:
        col = df.columns
    feature_columns = col[1:]  # first entry is skipped, mirroring min_max_scaler_fit
    scaler = joblib.load("../model/scaler")
    df[feature_columns] = scaler.transform(df[feature_columns])
    return df

In [493]:
def train_test(df,col):
    """Split ``df`` into train and test parts.

    ``col[0]`` names the target column and ``col[1:]`` the feature columns.
    33% of the rows go to the test part; ``random_state=42`` fixes the shuffle.

    Returns ``(X_train, y_train, X_test, y_test)`` - note the order, which
    differs from the one ``train_test_split`` itself returns.
    """
    target_name, feature_names = col[0], col[1:]
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_names],
        df[target_name],
        test_size=0.33,
        random_state=42,
    )
    return X_train, y_train, X_test, y_test

In [494]:
def clf(X_train,y_train,X_test,y_test):
    """Train a default ``LogisticRegression``, persist it and score it.

    The fitted model is written to ``../model/clf`` before scoring.
    Returns the value of ``score`` on ``(X_test, y_test)``.
    """
    model = LogisticRegression().fit(X_train, y_train)
    joblib.dump(model, "../model/clf")
    return model.score(X_test, y_test)

In [495]:
def svc(X_train,y_train,X_test,y_test):
    """Train an ``SVC`` with ``C=25`` and score it.

    Unlike ``clf``, the fitted model is not persisted.
    Returns the value of ``score`` on ``(X_test, y_test)``.
    """
    classifier = SVC(C=25).fit(X_train, y_train)
    return classifier.score(X_test, y_test)

In [496]:
def pipeline(df):
    """Run preprocessing, training and evaluation end to end.

    Steps: mode imputation, one-hot encoding of ``Sex`` and ``Embarked``,
    min-max scaling of every column except the first, a train/test split, then
    a logistic regression and an SVC trained on the same split. The fitted
    preprocessing objects and the logistic regression are written to
    ``../model/`` by the helper functions.

    Parameters
    ----------
    df : DataFrame
        Input frame whose first column is used as the target. It is no longer
        modified: all work happens on a copy.

    Returns
    -------
    tuple
        ``(score_clf, score_svc, X_train)`` - the logistic-regression test
        score, the SVC test score and the training feature frame.
    """
    # The helpers below fill and drop columns in place. Working on a private
    # copy keeps the caller's frame intact, so this function can be called
    # again on the same input without failing on already-removed columns.
    df = df.copy()

    # NOTE(review): every preprocessing step is fitted on the full frame before
    # the train/test split below, so test rows influence the fitted statistics.
    # Consider fitting on the training part only.
    missing_values_mode(df)
    df = missing_value_transform(df)

    fit_gender(df)
    df = transform_gender(df)

    fit_emb(df)
    df = transform_emb(df)

    min_max_scaler_fit(df)
    col = df.columns
    df = min_max_scaler_trandform(df,col)

    X_train,y_train,X_test,y_test=train_test(df,col)
    score_clf = clf(X_train,y_train,X_test,y_test)
    score_svc = svc(X_train,y_train,X_test,y_test)
    return score_clf, score_svc,X_train
    
    


In [497]:
clf, svc,X_train = pipeline(df)

In [498]:
clf

0.8067796610169492

In [499]:
svc

0.8203389830508474