In [1]:
import pandas as pd
import numpy as np

# Load the training CSV (the path is relative to the notebook's working directory).
df = pd.read_csv("../data/train.csv")

# First look: dimensions, column names, then the leading rows as the cell output.
print(df.shape)
print(list(df.columns))
df.head()


(891, 12)
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Column dtypes and non-null counts (printed by info() itself).
df.info()

# Missing-value count per column; as the last expression it is rendered as the cell output.
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

177 yolcunun yaş (Age) bilgisi eksik.
687 yolcunun kabin (Cabin) bilgisi yok.
2 yolcunun biniş limanı (Embarked) bilgisi eksik.

df.info() → sütunların tipini ve boş/dolu sayısını gösterir.

df.isnull().sum() → doğrudan hangi sütunda kaç tane eksik veri olduğunu sayar.

Sonraki hücre önce yeni özellikleri üretir, ardından eksik değerleri doldurur ve kategorik sütunları sayıya çevirir.

In [2]:
# Feature engineering: derives new columns on `df`, fills missing values and
# encodes the categorical columns as numbers. `df` is rebound at the
# get_dummies step, so this cell is meant to run once on the freshly loaded frame.

# 1) Name -> Title (honorific: the word that precedes a "." in the name)
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
df["Title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
df["Title"] = (df["Title"]
               .replace(['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')
               .replace(['Mlle','Ms'], 'Miss')
               .replace('Mme', 'Mrs'))

# 2) Cabin -> Deck (first character of the cabin code). Missing cabin -> 'U' (Unknown)
# Operate on the real missing values instead of stringifying NaN to "nan" and
# matching its first letter, which would also relabel any code starting with "n".
df["Deck"] = df["Cabin"].str[0].fillna("U")

# 3) Family features
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1  # +1 counts the passenger themselves
df["IsAlone"]    = (df["FamilySize"] == 1).astype(int)

# 4) Fill missing values
# NOTE: the medians and the mode are computed on the full frame, i.e. before any
# train/test split, so a later hold-out set contributes to these statistics.
df["Age"]      = df["Age"].fillna(df["Age"].median())
df["Fare"]     = df["Fare"].fillna(df["Fare"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# 5) Encode categoricals as numbers
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
# drop_first=True drops one level per variable to avoid redundant dummy columns.
df = pd.get_dummies(df, columns=["Title","Embarked","Deck"], drop_first=True)

# 6) The raw PassengerId / Name / Ticket / Cabin columns are deliberately kept
# on `df` here; they are not removed in this cell.

# 7) (Optional) Age/Fare binning into quartiles (labels=False -> integer codes)
df["AgeBin"]  = pd.qcut(df["Age"], 4, labels=False)
df["FareBin"] = pd.qcut(df["Fare"], 4, labels=False)

print("Şekil:", df.shape)
df.head()


Şekil: (891, 29)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_U,AgeBin,FareBin
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,...,False,False,False,False,False,False,False,True,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,...,False,True,False,False,False,False,False,False,3,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,...,False,False,False,False,False,False,False,True,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,...,False,True,False,False,False,False,False,False,2,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,...,False,False,False,False,False,False,False,True,2,1


In [6]:
# === 6) (Optional) drop the raw identifier/text columns that are not model inputs
drop_if_exists = ["PassengerId", "Name", "Ticket", "Cabin"]
present_raw_cols = [col for col in drop_if_exists if col in df.columns]
df_model = df.drop(columns=present_raw_cols).copy()

# === 7) (Optional) Age/Fare binning into quartiles
# (this loop can be disabled; the continuous 'Age' and 'Fare' columns remain available)
for source_col, bin_col in (("Age", "AgeBin"), ("Fare", "FareBin")):
    df_model[bin_col] = pd.qcut(df_model[source_col], 4, labels=False)

# === 8) Separate the target from the features
y = df_model["Survived"].astype(int)
X = df_model.drop(columns="Survived")

# Safety net: keep only numeric/boolean columns, then cast bool -> int
X = X.select_dtypes(include=["number", "bool"]).copy()
bool_to_int = {col: int for col in X.columns if X[col].dtype == bool}
X = X.astype(bool_to_int)

print("X shape:", X.shape, "| y shape:", y.shape)

# === 9) Train/test split (20% held out, stratified on the target, fixed seed)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# === 10) Models: Logistic Regression and Random Forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Model 1: Logistic Regression (fit returns the estimator, so the call can be chained)
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
pred_lr = logreg.predict(X_test)

# Model 2: Random Forest (seeded for repeatable results, all cores via n_jobs=-1)
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1).fit(X_train, y_train)
pred_rf = rf.predict(X_test)

# === 11) Sonuçlar
def rapor(y_true, y_pred, baslik):
    """Print accuracy, precision, recall, F1 and the classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to score against ``y_true``.
        baslik: Title shown in the ``=== ... ===`` header line.

    Returns:
        None; all results are written to stdout.
    """
    print(f"\n=== {baslik} ===")

    # (label, scorer) pairs, printed in this order; labels are padded so the values line up.
    headline_metrics = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1       :", f1_score),
    )
    for label, scorer in headline_metrics:
        print(label, scorer(y_true, y_pred))

    print(classification_report(y_true, y_pred))

# Score both fitted models on the same held-out labels.
for model_title, model_preds in (
    ("Logistic Regression", pred_lr),
    ("Random Forest", pred_rf),
):
    rapor(y_test, model_preds, model_title)

# === 12) (Optional) Random Forest feature importances (top 15)
import pandas as pd
raw_importances = pd.Series(rf.feature_importances_, index=X.columns)
imp = raw_importances.sort_values(ascending=False)
print("\n=== Random Forest - En önemli 15 özellik ===")
print(imp.head(15))


X shape: (891, 24) | y shape: (891,)

=== Logistic Regression ===
Accuracy : 0.8268156424581006
Precision: 0.796875
Recall   : 0.7391304347826086
F1       : 0.7669172932330827
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       110
           1       0.80      0.74      0.77        69

    accuracy                           0.83       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.83      0.83      0.83       179


=== Random Forest ===
Accuracy : 0.8156424581005587
Precision: 0.7903225806451613
Recall   : 0.7101449275362319
F1       : 0.7480916030534351
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179


=== Random Forest - En öne