## 🔎Titanic Dataset (Voting Classifier)

In [143]:
import pandas as pd
import warnings as wg
import numpy as np
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and data-handling warnings emitted by any library used below.
wg.filterwarnings('ignore')

In [144]:
# Load the Titanic passenger records from the CSV file into a DataFrame.
# NOTE(review): the relative path assumes "tested.csv" sits in the working directory — confirm.
df = pd.read_csv("tested.csv")

In [145]:
# Display the freshly loaded frame (the notebook renders a truncated preview).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [146]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [147]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(418, 12)

In [148]:
# Count the distinct values in each column.
df.nunique()

PassengerId    418
Survived         2
Pclass           3
Name           418
Sex              2
Age             79
SibSp            7
Parch            8
Ticket         363
Fare           169
Cabin           76
Embarked         3
dtype: int64

In [149]:
# Summary statistics of the numeric columns, transposed so that each column becomes a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,418.0,1100.5,120.810458,892.0,996.25,1100.5,1204.75,1309.0
Survived,418.0,0.363636,0.481622,0.0,0.0,0.0,1.0,1.0
Pclass,418.0,2.26555,0.841838,1.0,1.0,3.0,3.0,3.0
Age,332.0,30.27259,14.181209,0.17,21.0,27.0,39.0,76.0
SibSp,418.0,0.447368,0.89676,0.0,0.0,0.0,1.0,8.0
Parch,418.0,0.392344,0.981429,0.0,0.0,0.0,0.0,9.0
Fare,417.0,35.627188,55.907576,0.0,7.8958,14.4542,31.5,512.3292


In [150]:
# PassengerId, Name and Ticket are not used as predictors here, so remove them.
# Questions for the reader (answered in the markdown that follows):
#   - When would the passenger id be useful? 🤨
#   - When would the passenger name be useful? 🤨
#   - When would the ticket number be useful? 🤨

df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

### ✔Answers:
#### · We can use the passenger ID when we need a time-based (ordering) analysis
#### · We can use the passenger name when we do text mining
#### · We can use the ticket number for feature mining (e.g., inferring a passenger's position on the ship)

In [151]:
# Count the missing values in every column.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age          86
SibSp         0
Parch         0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [152]:
# Cabin is missing for most rows (see the null counts above), so drop the whole column.
df.drop(columns='Cabin', inplace=True)

### 💡"Pclass" in the Titanic dataset stands for "Passenger Class." It has three categories:
#### · 1st Class (Pclass = 1): Wealthy passengers with luxurious accommodations.
#### · 2nd Class (Pclass = 2): Middle-class passengers with decent accommodations.
#### · 3rd Class (Pclass = 3): Lower-class passengers with basic accommodations.

In [153]:
# Count how many passengers fall into each distinct value of the Pclass column.
df['Pclass'].value_counts()

3    218
1    107
2     93
Name: Pclass, dtype: int64

In [154]:
# Measure the linear (Pearson) correlation between passenger class and survival.
pclass_values = df['Pclass']
survived_values = df['Survived']
corr = pclass_values.corr(survived_values)
print(f'· Passenger Class / Survived: {corr}')

· Passenger Class / Survived: -0.10861452091523412


### 💡The "Embarked" column in the Titanic dataset indicates the port where passengers boarded:
#### · C: Cherbourg, France.
#### · Q: Queenstown (Cobh), Ireland.
#### · S: Southampton, England.
#### These letters represent the embarkation points for each passenger on the Titanic.

In [155]:
# Count how many passengers boarded at each distinct value of the Embarked column.
df['Embarked'].value_counts()

S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [314]:
#Can I compute the correlation between Embarked and Survived like this?
#corr = df['Embarked'].corr(df['Survived'])
#print(f'· Embarked / Survived: {corr}')

### ✔Answer: ✅ No 
#### (Because we can't compute a correlation coefficient directly between a categorical variable and a numeric variable)

In [157]:
# One-hot encode Embarked: its categories cannot be ranked against each other,
# so each port gets its own indicator column.
embarked_columns = ['Embarked']
df = pd.get_dummies(df, columns=embarked_columns)

In [158]:
# Encode Sex numerically: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
df['Sex'] = df['Sex'].map(sex_codes)

In [159]:
# Display the frame after dropping unused columns and encoding Embarked and Sex.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,34.5,0,0,7.8292,0,1,0
1,1,3,0,47.0,1,0,7.0000,0,0,1
2,0,2,1,62.0,0,0,9.6875,0,1,0
3,0,3,1,27.0,0,0,8.6625,0,0,1
4,1,3,0,22.0,1,1,12.2875,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,0,3,1,,0,0,8.0500,0,0,1
414,1,1,0,39.0,0,0,108.9000,1,0,0
415,0,3,1,38.5,0,0,7.2500,0,0,1
416,0,3,1,,0,0,8.0500,0,0,1


In [161]:
# List the distinct values found in the Age column (missing values show up as nan).
df['Age'].unique()

array([34.5 , 47.  , 62.  , 27.  , 22.  , 14.  , 30.  , 26.  , 18.  ,
       21.  ,   nan, 46.  , 23.  , 63.  , 24.  , 35.  , 45.  , 55.  ,
        9.  , 48.  , 50.  , 22.5 , 41.  , 33.  , 18.5 , 25.  , 39.  ,
       60.  , 36.  , 20.  , 28.  , 10.  , 17.  , 32.  , 13.  , 31.  ,
       29.  , 28.5 , 32.5 ,  6.  , 67.  , 49.  ,  2.  , 76.  , 43.  ,
       16.  ,  1.  , 12.  , 42.  , 53.  , 26.5 , 40.  , 61.  , 60.5 ,
        7.  , 15.  , 54.  , 64.  , 37.  , 34.  , 11.5 ,  8.  ,  0.33,
       38.  , 57.  , 40.5 ,  0.92, 19.  , 36.5 ,  0.75,  0.83, 58.  ,
        0.17, 59.  , 14.5 , 44.  ,  5.  , 51.  ,  3.  , 38.5 ])

In [162]:
# Build a frequency table for Age: one row per distinct age with its number of occurrences.
age = df['Age'].value_counts().to_frame().reset_index()

In [163]:
# Give the frequency table readable headers: the age value and how often it occurs.
# The names are assigned by position because the labels produced by
# value_counts().reset_index() differ between pandas versions ('index'/'Age' before
# pandas 2.0, 'Age'/'count' from 2.0 on). A label-based rename therefore leaves the
# table without an 'Age' column on newer versions, and also breaks when the cell is
# run a second time.
age.columns = ['Age', 'Count']
age

Unnamed: 0,Age,Count
0,21.0,17
1,24.0,17
2,22.0,16
3,30.0,15
4,18.0,13
...,...,...
74,76.0,1
75,28.5,1
76,22.5,1
77,62.0,1


In [164]:
# Show the 20 most frequent ages, then the youngest and oldest age among those 20
# (not the minimum/maximum of the whole dataset).
top_ages = age.head(20)
print(top_ages)
print('==============')
print('· Min age: ', top_ages['Age'].min())
print('· Max age: ', top_ages['Age'].max())

     Age  Count
0   21.0     17
1   24.0     17
2   22.0     16
3   30.0     15
4   18.0     13
5   27.0     12
6   26.0     12
7   25.0     11
8   23.0     11
9   29.0     10
10  45.0      9
11  36.0      9
12  20.0      8
13  17.0      7
14  28.0      7
15  31.0      6
16  39.0      6
17  33.0      6
18  32.0      6
19  55.0      6
· Min age:  17.0
· Max age:  55.0


In [165]:
#Let's examine data types of data frame's columns
df.dtypes

Survived        int64
Pclass          int64
Sex             int64
Age           float64
SibSp           int64
Parch           int64
Fare          float64
Embarked_C      uint8
Embarked_Q      uint8
Embarked_S      uint8
dtype: object

In [166]:
# How should the missing ages be filled? Options ruled out by the author:
#   - filling by ratio
#   - splitting the rows into "null" and "not null" groups
#   - filling with zero
#   - filling with the mode
#   - filling from neighbouring rows (e.g. 21.0 , NaN , 46.0 , 23.0)
# So which approach is left? Preview the first 20 ages to get a feel for the data.
df['Age'].head(20)

0     34.5
1     47.0
2     62.0
3     27.0
4     22.0
5     14.0
6     30.0
7     26.0
8     18.0
9     21.0
10     NaN
11    46.0
12    23.0
13    63.0
14    47.0
15    24.0
16    35.0
17    21.0
18    27.0
19    45.0
Name: Age, dtype: float64

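### ✔Answer: ✅Bins (binning ages into ranges is one option; the next cell takes the simpler route of filling missing ages with the median)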

In [167]:
# Replace missing ages with the median age.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form operates on an
# intermediate Series, triggers a warning on pandas 2.x and leaves df unchanged when
# Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [168]:
# Re-count the missing values per column after the Age imputation.
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          1
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [169]:
# Only one missing value remains (in Fare), so discard any row that still has a null.
df.dropna(axis=0, how='any', inplace=True)

### 👩🏻‍💻Normalize

In [170]:
# Min-max scale Fare and Age into the [0, 1] range: (value - min) / (max - min).
scaled_columns = ['Fare', 'Age']
column_min = df[scaled_columns].min()
column_range = df[scaled_columns].max() - column_min
df[scaled_columns] = (df[scaled_columns] - column_min) / column_range

In [174]:
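#Let's check several models
#Candidates considered:
#Logistic Regression
#Support Vector Classifier
#Decision Tree Classifier
#Random Forest Classifier
#(the cells below actually train the Support Vector, Decision Tree and K-Nearest Neighbors classifiers)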

In [262]:
#Let's import the libraries
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [263]:
# Separate the target (Survived) from the explanatory features (every other column).
target_column = 'Survived'
X = df.drop(columns=[target_column])
y = df[target_column]

In [264]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [267]:
# Instantiate the three base classifiers that are compared below and later combined by voting.
# random_state is fixed (same seed as the train/test split) so that the decision tree's
# random tie-breaking and the SVC's internal probability estimation produce the same
# results on every run.
# probability=True makes the SVC expose predict_proba, which soft voting requires.
svc = SVC(probability=True, random_state=42)
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
knn = KNeighborsClassifier(n_neighbors=3)

In [256]:
# Train each base classifier on the training split.
# The last call is also the cell's final expression, so its return value is what the cell displays.
svc.fit(X_train, y_train)
dt.fit(X_train, y_train)
knn.fit(X_train, y_train)

In [257]:
# Predict the test split with every fitted base classifier (SVC, decision tree, KNN).
y_pred_svc, y_pred_dt, y_pred_knn = (
    model.predict(X_test) for model in (svc, dt, knn)
)

In [258]:
# Print the test accuracy of each base classifier, in the order SVC, decision tree, KNN.
from sklearn.metrics import accuracy_score

for test_predictions in (y_pred_svc, y_pred_dt, y_pred_knn):
    print(accuracy_score(y_test, test_predictions))

0.9761904761904762
1.0
0.9642857142857143


In [259]:
# Predict survival for one hand-written passenger with the decision tree.
# The features are given by name and then ordered like the training columns, so the
# prediction does not silently depend on typing the values in the right position.
# Age and Fare must be supplied on the min-max scaled [0, 1] range used for training.
passenger = pd.DataFrame([{
    'Pclass': 3,
    'Sex': 1,
    'Age': 0.23,
    'SibSp': 1,
    'Parch': 0,
    'Fare': 0.2,
    'Embarked_C': 0,
    'Embarked_Q': 0,
    'Embarked_S': 1,
}])
passenger = passenger[X_train.columns]
print("· Survived: ",dt.predict(passenger)[0])

· Survived:  0


In [260]:
# Display the frame after imputation and min-max scaling of Age and Fare.
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,0.452723,0,0,0.015282,0,1,0
1,1,3,0,0.617566,1,0,0.013663,0,0,1
2,0,2,1,0.815377,0,0,0.018909,0,1,0
3,0,3,1,0.353818,0,0,0.016908,0,0,1
4,1,3,0,0.287881,1,1,0.023984,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,0,3,1,0.353818,0,0,0.015713,0,0,1
414,1,1,0,0.512066,0,0,0.212559,1,0,0
415,0,3,1,0.505473,0,0,0.014151,0,0,1
416,0,3,1,0.353818,0,0,0.015713,0,0,1


### 💡Voting types:
#### · Hard Voting: Majority rule among classifiers' predictions.
#### · Soft Voting: Averages predicted probabilities for each class.
#### · Weighted Voting: Assigns different influence levels to classifiers.

### 📍Hard

In [290]:
#Let's import library and create the Voting Classifier
from sklearn.ensemble import VotingClassifier

# Hard voting: each base classifier casts one vote and the majority class wins.
# The fitted ensemble keeps its own name instead of being overwritten by its
# predictions, so it remains available for further predict/score calls.
hard_voter = VotingClassifier(
    estimators=[('decision_tree', dt), ('knn', knn), ('svm', svc)],
    voting='hard')
hard_voter.fit(X_train, y_train)
# `vc` holds the test-set predictions that the evaluation cell reads.
vc = hard_voter.predict(X_test)

In [291]:
# Score the hard-voting predictions (held in `vc`) against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=vc)
print("·Voting Classifier Accuracy:", accuracy)

·Voting Classifier Accuracy: 0.9761904761904762


### 📍Soft

In [292]:
#Let's import library and create the Voting Classifier
from sklearn.ensemble import VotingClassifier

# Soft voting: average the class probabilities predicted by the base classifiers
# and pick the class with the highest mean probability.
# The fitted ensemble keeps its own name instead of being overwritten by its
# predictions, so it remains available for further predict/score calls.
soft_voter = VotingClassifier(
    estimators=[('decision_tree', dt), ('knn', knn), ('svm', svc)],
    voting='soft'  # Use 'soft' for probability-based combination
)
soft_voter.fit(X_train, y_train)
# `vc` holds the test-set predictions that the evaluation cell reads.
vc = soft_voter.predict(X_test)

In [293]:
# Score the soft-voting predictions (held in `vc`) against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=vc)
print("·Voting Classifier Accuracy:", accuracy)

·Voting Classifier Accuracy: 0.9880952380952381


### 📍Weighted

In [310]:
#Let's import library and create the Voting Classifier
from sklearn.ensemble import VotingClassifier

# Weighted soft voting: class probabilities are averaged with per-classifier weights.
# Weights follow the order of `estimators`: decision_tree=1, knn=10, svm=5.
# The fitted ensemble keeps its own name instead of being overwritten by its
# predictions, so it remains available for further predict/score calls.
weighted_voter = VotingClassifier(
    estimators=[('decision_tree', dt), ('knn', knn), ('svm', svc)],
    voting='soft',
    weights=[1, 10, 5])
weighted_voter.fit(X_train, y_train)
# `vc` holds the test-set predictions that the evaluation cell reads.
vc = weighted_voter.predict(X_test)

In [311]:
# Score the weighted-voting predictions (held in `vc`) against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=vc)
print("·Voting Classifier Accuracy:", accuracy)

·Voting Classifier Accuracy: 0.9642857142857143


## 🖐🏻The End