In [1]:
#IMPORTING THE LIBRARIES
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#IMPORTING THE DATASET

def _fill_missing_with_group_mean(frame, column, group_key):
    """Return ``frame[column]`` with NaNs replaced by the mean of their group.

    Groups are defined by ``frame[group_key]``. Rows whose whole group has no
    observed value fall back to the overall mean of the column, so no NaN is
    left behind as long as the column has at least one observed value.
    """
    observed_mean = frame[column].mean()
    group_mean = frame.groupby(group_key)[column].transform('mean')
    filled = frame[column].fillna(group_mean)
    if pd.notna(observed_mean):
        filled = filled.fillna(observed_mean)
    return filled


def import_csv(name):
    """Read a passenger CSV and return the preprocessed feature DataFrame.

    Parameters
    ----------
    name : str, path or file-like
        Anything accepted by ``pd.read_csv``. The file must contain the columns
        PassengerId, Ticket, Cabin, Name, Pclass, Sex, Age, Fare and Embarked;
        a Survived column is optional and is never part of the result.

    Returns
    -------
    pandas.DataFrame
        Pclass (as text), the remaining untouched columns, Age and Fare with
        missing values imputed, and one-hot columns for Sex, Embarked and
        TP (Title + Pclass).
    """
    df = pd.read_csv(name)

    #REMOVING UNNECESSARY COLUMNS
    df = df.drop(['PassengerId', 'Ticket', 'Cabin'], axis = 1)

    #REPLACE MISSING EMBARKED WITH "S" (HARD-CODED FILL VALUE)
    df = df.fillna({"Embarked": "S"})

    #TAKE TITLES FROM NAMES (TEXT BETWEEN THE FIRST ", " AND THE FIRST ". "), STRIP BLANKS, THEN DELETE NAME
    df['Title'] = df['Name'].apply(lambda st: st[st.find(", ")+1:st.find(". ")]).str.strip()
    df = df.drop(['Name'], axis = 1)

    #CORRECT MS TO MRS
    df.loc[df['Title'] == 'Ms', 'Title'] = 'Mrs'

    #TRANSFORM PCLASS TO TEXT AND CREATE COLUMN TP (TITLE+PCLASS) USED AS THE IMPUTATION GROUP
    df['Pclass'] = df['Pclass'].astype(str)
    df['TP'] = df['Title'] + df['Pclass']

    #REPLACE EMPTY AGE AND FARE WITH THE AVERAGE OF THEIR TP GROUP
    #ASSIGN THE RESULT BACK: AN IN-PLACE FILLNA ON df['Age'] ACTS ON A TEMPORARY UNDER PANDAS COPY-ON-WRITE
    for column in ('Age', 'Fare'):
        df[column] = _fill_missing_with_group_mean(df, column, 'TP')

    #DROP COLUMN TITLE AND DO ONE HOT ENCODING
    df = df.drop(['Title'], axis = 1)
    df = pd.get_dummies(df, columns = ['Sex', 'Embarked','TP'])

    #KEEP EVERY COLUMN EXCEPT THE TARGET (ABSENT IN FILES WITHOUT A SURVIVED COLUMN)
    X = df.loc[:, df.columns != 'Survived']
    return(X)
    
    
#FEATURES COME FROM THE PREPROCESSING FUNCTION; THE TARGET IS READ STRAIGHT FROM THE RAW FILE
X = import_csv('train.csv')
y = pd.read_csv('train.csv')['Survived']

In [3]:
#HOLD OUT 15% OF THE ROWS AS THE TEST SPLIT (FIXED SEED SO THE SPLIT IS REPRODUCIBLE)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=0,
)
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,...,TP_Mme1,TP_Mr1,TP_Mr2,TP_Mr3,TP_Mrs1,TP_Mrs2,TP_Mrs3,TP_Rev2,TP_Sir1,TP_the Countess1
545,1,64.000000,0,0,26.0000,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
37,3,21.000000,0,0,8.0500,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
214,3,28.724891,1,0,7.7500,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
40,3,40.000000,1,0,9.4750,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
236,2,44.000000,1,0,26.0000,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,1,39.000000,1,1,83.1583,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
192,3,19.000000,1,0,7.8542,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
629,3,28.724891,0,0,7.7333,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
559,3,36.000000,1,0,17.4000,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [4]:
#FEATURE SCALING: FIT THE SCALER ON THE TRAIN SPLIT ONLY, THEN APPLY IT TO BOTH SPLITS
#NOTE: X_train AND X_test ARE OVERWRITTEN, SO RE-RUNNING THIS CELL WOULD FIT ON ALREADY SCALED DATA
from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [5]:
#FIT A LOGISTIC REGRESSION CLASSIFIER ON THE SCALED TRAIN SPLIT
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0)

In [6]:
#EVALUATE ON THE TEST SPLIT: CONFUSION MATRIX FIRST, THEN ACCURACY
from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
acs = accuracy_score(y_true=y_test, y_pred=y_pred)
print(cm)
print(acs)

[[71  8]
 [14 41]]
0.835820895522388


In [7]:
#APPLYING K-FOLD CROSS VALIDATION
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

#X_train WAS SCALED WITH STATISTICS OF THE WHOLE TRAIN SPLIT, SO THE SCALER IS RE-FIT INSIDE EVERY FOLD
#THIS KEEPS THE VALIDATION PART OF A FOLD FROM INFLUENCING ITS OWN SCALING
#cross_val_score CLONES THE ESTIMATOR, SO THE ALREADY FITTED classifier IS NOT MODIFIED
accuracies = cross_val_score(
    estimator = make_pipeline(StandardScaler(), classifier),
    X = X_train,
    y = y_train,
    cv = 10,
    scoring = 'accuracy',
)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.49 %
Standard Deviation: 2.76 %


In [19]:
#PREDICTING ON THE CHALLENGE TEST FILE (NO TRAINING HAPPENS IN THIS CELL)
#NOTE: X_test AND y_pred FROM THE TRAIN/TEST SPLIT ARE OVERWRITTEN HERE
X_test = import_csv('test.csv')

#PAIRING TEST COLUMNS WITH TRAIN COLUMNS: MISSING ONES ARE ADDED AS 0, EXTRA ONES ARE DROPPED, ORDER FOLLOWS X
X_test = X_test.reindex(columns = X.columns, fill_value = 0)

#PASSENGER IDS ARE READ FROM THE RAW FILE BECAUSE import_csv DROPS THEM
#THE NAME id IS AVOIDED SO THE PYTHON BUILTIN IS NOT SHADOWED
passenger_ids = pd.read_csv('test.csv', usecols = ['PassengerId'])['PassengerId']

#SCALING WITH THE ALREADY FITTED SCALER AND PREDICTING
X_test = sc.transform(X_test)
y_pred = pd.Series(classifier.predict(X_test), name = 'Survived')

#BOTH SERIES CARRY THEIR FINAL COLUMN NAME, SO NO POSITIONAL RENAME IS NEEDED
res = pd.concat([passenger_ids, y_pred], axis = 1)
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [20]:
#WRITE THE SUBMISSION FILE WITHOUT THE INDEX COLUMN
res.to_csv(path_or_buf='res.csv', index=False)

In [17]:
#SANITY CHECK: PRINT EVERY COLUMN NAME OF THE SUBMISSION FRAME, ONE PER LINE
for column_name in res.columns:
    print(column_name)

PassengerId
0
