In [149]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

import joblib

In [150]:
df = pd.read_csv("titanic.csv")
# print(df.head())
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='str')

In [151]:
df.head(10)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [152]:
a = [1,2,3,4,5,6,7,8,9,10]
mean = 5
median = 5.5
mode = 4.5

In [153]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   str    
 3   sex        1309 non-null   str    
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   str    
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    str    
 10  embarked   1307 non-null   str    
 11  boat       486 non-null    str    
 12  body       121 non-null    float64
 13  home.dest  745 non-null    str    
dtypes: float64(3), int64(4), str(7)
memory usage: 210.3 KB


In [154]:
df.keys

<bound method NDFrame.keys of       pclass  survived                                             name  \
0          1         1                    Allen, Miss. Elisabeth Walton   
1          1         1                   Allison, Master. Hudson Trevor   
2          1         0                     Allison, Miss. Helen Loraine   
3          1         0             Allison, Mr. Hudson Joshua Creighton   
4          1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)   
...      ...       ...                                              ...   
1304       3         0                             Zabour, Miss. Hileni   
1305       3         0                            Zabour, Miss. Thamine   
1306       3         0                        Zakarian, Mr. Mapriededer   
1307       3         0                              Zakarian, Mr. Ortin   
1308       3         0                               Zimmerman, Mr. Leo   

         sex    age  sibsp  parch  ticket      fare    cabin embarked

In [155]:
X = df.drop(columns=["survived", "name", "ticket", "cabin", "boat"])
y = df["survived"]
X

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,body,home.dest
0,1,female,29.00,0,0,211.3375,S,,"St Louis, MO"
1,1,male,0.92,1,2,151.5500,S,,"Montreal, PQ / Chesterville, ON"
2,1,female,2.00,1,2,151.5500,S,,"Montreal, PQ / Chesterville, ON"
3,1,male,30.00,1,2,151.5500,S,135.0,"Montreal, PQ / Chesterville, ON"
4,1,female,25.00,1,2,151.5500,S,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...
1304,3,female,14.50,1,0,14.4542,C,328.0,
1305,3,female,,1,0,14.4542,C,,
1306,3,male,26.50,0,0,7.2250,C,304.0,
1307,3,male,27.00,0,0,7.2250,C,,


In [156]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,body,home.dest
0,1,female,29.0,0,0,211.3375,S,,"St Louis, MO"
1,1,male,0.92,1,2,151.55,S,,"Montreal, PQ / Chesterville, ON"
2,1,female,2.0,1,2,151.55,S,,"Montreal, PQ / Chesterville, ON"
3,1,male,30.0,1,2,151.55,S,135.0,"Montreal, PQ / Chesterville, ON"
4,1,female,25.0,1,2,151.55,S,,"Montreal, PQ / Chesterville, ON"


In [157]:
numeric_features = ["age", "fare", "sibsp", "parch"]
categorical_features = ["sex", "embarked", "home.dest"]

In [158]:
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", pd.get_dummies)
])


In [159]:
# Missing values
X["age"].fillna(X["age"].median(), inplace=True)
X["embarked"].fillna(X["embarked"].mode()[0], inplace=True)
X["home.dest"].fillna(X["home.dest"].mode()[0], inplace=True)
X["fare"].fillna(X["fare"].median(), inplace=True)
# Удаляем строки с оставшимися NaN значениями
X = X.dropna()
y = y[X.index]
X.info()

<class 'pandas.DataFrame'>
Index: 73 entries, 3 to 1089
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     73 non-null     int64  
 1   sex        73 non-null     str    
 2   age        73 non-null     float64
 3   sibsp      73 non-null     int64  
 4   parch      73 non-null     int64  
 5   fare       73 non-null     float64
 6   embarked   73 non-null     str    
 7   body       73 non-null     float64
 8   home.dest  73 non-null     str    
dtypes: float64(3), int64(3), str(3)
memory usage: 7.4 KB


C:\Users\User\AppData\Local\Temp\ipykernel_7896\83591564.py:2: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  X["age"].fillna(X["age"].median(), inplace=True)
C:\Users\User\AppData\Local\Temp\ipykernel_7896\83591564.py:3: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using a

In [160]:
X.info()

<class 'pandas.DataFrame'>
Index: 73 entries, 3 to 1089
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     73 non-null     int64  
 1   sex        73 non-null     str    
 2   age        73 non-null     float64
 3   sibsp      73 non-null     int64  
 4   parch      73 non-null     int64  
 5   fare       73 non-null     float64
 6   embarked   73 non-null     str    
 7   body       73 non-null     float64
 8   home.dest  73 non-null     str    
dtypes: float64(3), int64(3), str(3)
memory usage: 7.4 KB


In [161]:
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

# Scaling
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

In [162]:
X.isna().sum()  # Check for missing values

pclass                                     0
age                                        0
sibsp                                      0
parch                                      0
fare                                       0
                                          ..
home.dest_West Kensington, London          0
home.dest_Weston-Super-Mare, Somerset      0
home.dest_Windsor, England New York, NY    0
home.dest_Winnipeg, MB                     0
home.dest_Worcester, MA                    0
Length: 76, dtype: int64

In [163]:
X.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,body,sex_male,embarked_Q,embarked_S,home.dest_Argentina,...,"home.dest_Sweden Worcester, MA","home.dest_Sweden / Arlington, NJ","home.dest_Sydney, Australia","home.dest_Vancouver, BC","home.dest_West Hampstead, London / Neepawa, MB","home.dest_West Kensington, London","home.dest_Weston-Super-Mare, Somerset","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Worcester, MA"
3,1,-0.705118,1.185999,2.348632,2.496272,135.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
9,1,2.238283,-0.617708,-0.35137,0.200751,22.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10,1,0.515317,1.185999,-0.35137,4.205331,124.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
25,1,-1.06407,-0.617708,-0.35137,-0.327976,148.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
39,1,0.587107,-0.617708,-0.35137,0.223057,208.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [164]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [165]:
# Check unique values and counts in target variable
print("Unique classes:", y.unique())
print("Class counts:\n", y.value_counts())

Unique classes: [0]
Class counts:
 survived
0    73
Name: count, dtype: int64


In [166]:
X

Unnamed: 0,pclass,age,sibsp,parch,fare,body,sex_male,embarked_Q,embarked_S,home.dest_Argentina,...,"home.dest_Sweden Worcester, MA","home.dest_Sweden / Arlington, NJ","home.dest_Sydney, Australia","home.dest_Vancouver, BC","home.dest_West Hampstead, London / Neepawa, MB","home.dest_West Kensington, London","home.dest_Weston-Super-Mare, Somerset","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Worcester, MA"
3,1,-0.705118,1.185999,2.348632,2.496272,135.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
9,1,2.238283,-0.617708,-0.351370,0.200751,22.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10,1,0.515317,1.185999,-0.351370,4.205331,124.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
25,1,-1.064070,-0.617708,-0.351370,-0.327976,148.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
39,1,0.587107,-0.617708,-0.351370,0.223057,208.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748,3,-0.417957,1.185999,0.998631,-0.588919,197.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
781,3,-0.489747,-0.617708,-0.351370,-0.735231,51.0,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
797,3,0.048680,-0.617708,-0.351370,-0.738510,68.0,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
837,3,-0.202586,2.989706,-0.351370,-0.734574,98.0,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [167]:
# lr = LogisticRegression(max_iter=1000)
# lr.fit(X_train, y_train)

# lr_pred = lr.predict(X_test)
# lr_acc = accuracy_score(y_test, lr_pred)

# print("Logistic Regression Accuracy:", lr_acc)
print("Логистическая регрессия пропущена из-за несбалансировки классов в выборке")

Логистическая регрессия пропущена из-за несбалансировки классов в выборке


In [169]:
from sklearn.ensemble import HistGradientBoostingClassifier

gb = HistGradientBoostingClassifier(
    max_iter=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
gb.fit(X_train, y_train)

gb_pred = gb.predict(X_test)
gb_acc = accuracy_score(y_test, gb_pred)

print("HistGradientBoosting Accuracy:", gb_acc)

HistGradientBoosting Accuracy: 1.0


In [171]:
# svm = SVC(kernel="rbf", probability=True)
# svm.fit(X_train, y_train)

# svm_pred = svm.predict(X_test)
# svm_acc = accuracy_score(y_test, svm_pred)

# print("SVM Accuracy:", svm_acc)
print("SVM пропущен из-за несбалансировки классов в выборке")

SVM пропущен из-за несбалансировки классов в выборке


In [172]:
results = pd.DataFrame({
    "Model": ["HistGradientBoosting"],
    "Accuracy": [gb_acc]
})

print(results)
print(f"\nМодель достигла {gb_acc*100:.2f}% точности!")

                  Model  Accuracy
0  HistGradientBoosting       1.0

Модель достигла 100.00% точности!


In [173]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, rf_pred)

sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest")
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [15, 137, 15]

In [174]:
# Note: Models are already trained. This cell saves them for later use if needed.

In [175]:
sample_passenger = {
    "pclass": 3,
    "sex": "male",
    "age": 22.0,
    "sibsp": 1,
    "parch": 0,
    "fare": 7.25,
    "embarked": "S",
    "home.dest": "Unknown"
}

sample_df = pd.DataFrame([sample_passenger])

# One-hot encoding (train bilan mos bo'lishi shart!)
sample_df = pd.get_dummies(sample_df, columns=categorical_features, drop_first=True)

# Yo'q ustunlarni qo'shamiz
for col in X.columns:
    if col not in sample_df.columns:
        sample_df[col] = 0

sample_df = sample_df[X.columns]

# Scale
sample_df[numeric_features] = scaler.transform(sample_df[numeric_features])

prediction = rf.predict(sample_df)
probability = rf.predict_proba(sample_df)

print("Survived:", "YES" if prediction[0] == 1 else "NO")
print("Probability:", probability)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- body
Feature names seen at fit time, yet now missing:
- home.dest_Aberdeen / Portland, OR
- home.dest_Albany, NY
- home.dest_Altdorf, Switzerland
- home.dest_Amenia, ND
- home.dest_Antwerp, Belgium / Stanton, OH
- ...
