# Data Loading

In [29]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [30]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("cleaned-data.csv")

# Data Exploration

In [3]:
# Report the dataset dimensions, one per line ("\n" is the newline escape; "/n" printed literally).
print(f"Number of rows: {df.shape[0]}\nNumber of columns: {df.shape[1]}")

Number of rows: 418/n Number of columns: 9


In [4]:
# Preview the first four rows.
df.head(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,34.5,0,0,7.8292,76,1
1,1,3,0,47.0,1,0,7.0,76,2
2,0,2,1,62.0,0,0,9.6875,76,1
3,0,3,1,27.0,0,0,8.6625,76,2


In [5]:
# Preview the last four rows.
df.tail(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
414,1,1,0,39.0,0,0,108.9,22,0
415,0,3,1,38.5,0,0,7.25,76,2
416,0,3,1,,0,0,8.05,76,2
417,0,3,1,,1,1,22.3583,76,0


In [6]:
# Descriptive statistics per column (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0,418.0,418.0
mean,0.363636,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,67.437799,1.401914
std,0.481622,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576,19.091405,0.854496
min,0.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,21.0,0.0,0.0,7.8958,76.0,1.0
50%,0.0,3.0,1.0,27.0,0.0,0.0,14.4542,76.0,2.0
75%,1.0,3.0,1.0,39.0,1.0,0.0,31.5,76.0,2.0
max,1.0,3.0,1.0,76.0,8.0,9.0,512.3292,76.0,2.0


In [7]:
# Column dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     418 non-null    int64  
 8   Embarked  418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB


In [8]:
# Column labels of the dataset.
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

In [9]:
# Print each column label on its own line (iterating a DataFrame yields its column labels).
for column_label in df:
    print(column_label)

Survived
Pclass
Sex
Age
SibSp
Parch
Fare
Cabin
Embarked


In [10]:
# Non-null value count per column.
df.count()

Survived    418
Pclass      418
Sex         418
Age         332
SibSp       418
Parch       418
Fare        417
Cabin       418
Embarked    418
dtype: int64

In [11]:
# Distinct values present in the Cabin column.
df["Cabin"].unique()

array([76, 12, 60, 15, 10,  2, 41, 55, 50,  5, 49, 30, 28, 71, 17, 35, 59,
       25, 26, 21, 37, 19, 33, 24, 68,  3, 75, 38, 29, 34, 64, 36, 46, 52,
        6, 74, 63, 66, 54, 16, 61, 39,  0,  7, 42, 73, 43, 56, 44, 51, 45,
       69,  4, 47,  9, 27, 18, 31, 20, 70, 72,  1, 23, 13, 48, 67, 65, 62,
       14, 32,  8, 53, 11, 40, 58, 57, 22])

In [12]:
# Number of distinct values per column.
df.nunique()

Survived      2
Pclass        3
Sex           2
Age          79
SibSp         7
Parch         8
Fare        169
Cabin        77
Embarked      3
dtype: int64

In [13]:
# Missing-value count per column.
df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Cabin        0
Embarked     0
dtype: int64

# Data Preprocessing

In [14]:
# Re-read the CSV (rebinding df discards any earlier in-memory changes) and display it.
df = pd.read_csv("cleaned-data.csv")
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,34.5,0,0,7.8292,76,1
1,1,3,0,47.0,1,0,7.0000,76,2
2,0,2,1,62.0,0,0,9.6875,76,1
3,0,3,1,27.0,0,0,8.6625,76,2
4,1,3,0,22.0,1,1,12.2875,76,2
...,...,...,...,...,...,...,...,...,...
413,0,3,1,,0,0,8.0500,76,2
414,1,1,0,39.0,0,0,108.9000,22,0
415,0,3,1,38.5,0,0,7.2500,76,2
416,0,3,1,,0,0,8.0500,76,2


In [15]:
# Column dtypes and non-null counts before imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     418 non-null    int64  
 8   Embarked  418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB


In [16]:
# Missing-value count per column before imputation.
df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Cabin        0
Embarked     0
dtype: int64

# Dealing with null values

In [31]:
# N/A, NA, Null, "", NONE, NaN
#    Mean, Median, Mode
# Average, Center, Most Recurring Value

# df['Cabin'] = df['Cabin'].fillna(df['Cabin'].mean())

# Impute every column in place: object-dtype columns get their most frequent
# value, all other columns get their mean.
for col in list(df.columns):
    column = df[col]
    fill_value = column.mode()[0] if column.dtype == 'O' else column.mean()
    df[col] = column.fillna(fill_value)

In [18]:
def fillNaMode(cols, frame=None):
    """Fill missing values in the given columns with each column's mode.

    Parameters
    ----------
    cols : iterable of column labels
        Columns to impute.
    frame : pandas.DataFrame, optional
        DataFrame to modify in place. Defaults to the notebook-level ``df``,
        which preserves the original calling convention ``fillNaMode(cols)``.

    Returns
    -------
    None
        The target DataFrame is modified in place.

    Notes
    -----
    A column with no non-null values has no mode (``Series.mode()`` returns an
    empty Series); such a column is left unchanged instead of raising.
    """
    target = df if frame is None else frame
    for col in cols:
        modes = target[col].mode()
        if modes.empty:
            # Nothing to impute with: the column is entirely missing.
            continue
        # mode() is sorted, so the first entry is the smallest of any tied modes.
        target[col] = target[col].fillna(modes.iloc[0])

# Columns to mode-impute via fillNaMode.
columns = [ 'Age', "Cabin"]
fillNaMode(columns)

In [19]:
# Missing-value count per column after imputation.
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        1
Cabin       0
Embarked    0
dtype: int64

In [20]:
# Column dtypes and non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     418 non-null    int64  
 8   Embarked  418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB


# Dropping irrelevant columns

In [22]:
# Re-read the CSV (rebinding df discards any earlier in-memory changes) and display it.
df = pd.read_csv("cleaned-data.csv")
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,34.5,0,0,7.8292,76,1
1,1,3,0,47.0,1,0,7.0000,76,2
2,0,2,1,62.0,0,0,9.6875,76,1
3,0,3,1,27.0,0,0,8.6625,76,2
4,1,3,0,22.0,1,1,12.2875,76,2
...,...,...,...,...,...,...,...,...,...
413,0,3,1,,0,0,8.0500,76,2
414,1,1,0,39.0,0,0,108.9000,22,0
415,0,3,1,38.5,0,0,7.2500,76,2
416,0,3,1,,0,0,8.0500,76,2


In [33]:
# Drop columns only if they exist in the dataframe
# columns_to_drop = ['PassengerId', 'Name', 'Ticket']
# df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)
# df
# The Survived column is the target; every other column is a feature.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [None]:
# df['survived'] = df['survived'].astype('int64')

def changeFloattoInt64(cols):
    """Cast each listed column of the notebook-level ``df`` to int64, in place.

    ``cols`` is an iterable of column labels. Returns None.
    """
    for column_name in cols:
        as_int = df[column_name].astype('int64')
        df[column_name] = as_int

# Columns to cast to int64 via changeFloattoInt64.
columns = ['Survived', 'Pclass', 'SibSp', 'Parch']
changeFloattoInt64(columns)

In [None]:
# Column dtypes and non-null counts after the int64 cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     91 non-null     object 
 8   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB


In [None]:
# Distinct values present in the Cabin column.
df["Cabin"].unique()

array([nan, 'B45', 'E31', 'B57 B59 B63 B66', 'B36', 'A21', 'C78', 'D34',
       'D19', 'A9', 'D15', 'C31', 'C23 C25 C27', 'F G63', 'B61', 'C53',
       'D43', 'C130', 'C132', 'C101', 'C55 C57', 'B71', 'C46', 'C116',
       'F', 'A29', 'G6', 'C6', 'C28', 'C51', 'E46', 'C54', 'C97', 'D22',
       'B10', 'F4', 'E45', 'E52', 'D30', 'B58 B60', 'E34', 'C62 C64',
       'A11', 'B11', 'C80', 'F33', 'C85', 'D37', 'C86', 'D21', 'C89',
       'F E46', 'A34', 'D', 'B26', 'C22 C26', 'B69', 'C32', 'B78',
       'F E57', 'F2', 'A18', 'C106', 'B51 B53 B55', 'D10 D12', 'E60',
       'E50', 'E39 E41', 'B52 B54 B56', 'C39', 'B24', 'D28', 'B41', 'C7',
       'D40', 'D38', 'C105'], dtype=object)

In [None]:
# Independent copy of df; later changes to df_clean do not affect df.
df_clean = df.copy()

In [32]:
# def encodeCols(cols):
#     for i in cols:
#         temp = pd.DataFrame({i: df[i].unique()})
#         data_LE = LabelEncoder()
#         data_LE.fit(np.ravel(temp))
#         df_clean[i] = data_LE.transform(df[i])

# # Choose categorical columns to encode
# columns = ['Sex', 'Cabin', 'Embarked']
# encodeCols(columns)
# Integer-encode the categorical columns that are still object-typed.
# The single encoder is refit for every column, so after the loop `le` only
# holds the classes of the last column it was fitted on.
le = LabelEncoder()
for col in ('Sex', 'Cabin', 'Embarked'):
    if df[col].dtype != 'O':
        continue
    df[col] = le.fit_transform(df[col])

In [None]:
# Preview the first rows of df_clean.
df_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,34.5,0,0,7.8292,76,1
1,1,3,0,47.0,1,0,7.0,76,2
2,0,2,1,62.0,0,0,9.6875,76,1
3,0,3,1,27.0,0,0,8.6625,76,2
4,1,3,0,22.0,1,1,12.2875,76,2


In [None]:
# Column dtypes and non-null counts of df_clean.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Cabin     418 non-null    int64  
 8   Embarked  418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB


In [None]:
# Write df_clean to CSV with a header row and without the index column.
# NOTE(review): this is the same path the notebook reads its input from, so
# running this cell overwrites the input file — confirm that is intended.
df_clean.to_csv(r'cleaned-data.csv', index=False, header=True)

In [None]:
import numpy as np
# Toy example: "?" is used as a placeholder for a missing value.
df2 = pd.DataFrame({
    'A': [1,2,3,'?'],
    'B': [2,3,'?',4]
})

# Blank out the "?" placeholders (mask sets matching cells to NaN), then cast
# explicitly to float. This avoids relying on replace()'s implicit downcasting
# and on in-place mutation.
df2 = df2.mask(df2 == "?").astype("float64")

  df2.replace("?", np.nan, inplace=True)


In [None]:
# Show the toy frame after the "?" placeholders were replaced.
print(df2)

     A    B
0  1.0  2.0
1  2.0  3.0
2  3.0  NaN
3  NaN  4.0


# Data Splitting / Train-Test Split

In [None]:
# Snapshot the current state of df (this always rebinds df_clean).
df_clean = df.copy()

# Select the target by name rather than by position so the split stays correct
# if the column order changes; with Survived as the first column this is the
# same split as iloc[:, 1:] / iloc[:, 0].
X = df_clean.drop(columns='Survived')
y = df_clean['Survived']

In [36]:
# from sklearn.model_selection import train_test_split
# trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.2, shuffle=False)
# Hold out 20% of the rows for testing; shuffle with a fixed seed for reproducibility.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [37]:
# trainX.head(2)

In [38]:
# trainY.head(2)

In [39]:
# print("TrainX:",trainX.shape)
# print("TrainY:",trainY.shape)
# print("TestX:",testX.shape)
# print("TestY:",testY.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel SVM (C=10, gamma='scale') trained on the scaled training features.
model_SVC = SVC(C=10, kernel='rbf', gamma='scale', random_state=42)
model_SVC.fit(X_train_scaled, y_train)


In [None]:
# Fit a fresh scaler on the training split and apply it to both splits.
# NOTE(review): this repeats the scaling step of the preceding cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Training / Applying Classifier

In [None]:
import pickle
from sklearn.svm import SVC

In [None]:
# Impute missing training features with the per-column training mean.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)

# Default-parameter SVC fitted on the unscaled training features
# (fit returns the estimator itself); this rebinds model_SVC.
model_SVC = SVC().fit(X_train, y_train)

print(model_SVC)

SVC()


# Testing & Processing Results

In [None]:
# Impute missing test features with the *training* column means, so the test
# split is transformed only with statistics learned from the training data
# (using the test split's own means would leak test information and make the
# two splits inconsistent).
X_test = X_test.fillna(X_train.mean())

svc_predictions = model_SVC.predict(X_test)

In [None]:
# Show the predicted class label for every test row.
print(svc_predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0]


# Accuracy Score

In [41]:
# Ensure the model is trained
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel SVM trained on the standardized training features (fit returns the estimator).
model_SVC = SVC(C=10, kernel='rbf', gamma='scale', random_state=42).fit(
    X_train_scaled, y_train
)

# Score the held-out split.
svc_predictions = model_SVC.predict(X_test_scaled)
svc_accuracy = accuracy_score(y_test, svc_predictions)
print("-- Model Accuracy: ", round(svc_accuracy, 3))

-- Model Accuracy:  0.988
