In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [3]:
# Read the CSV files
gender_submission_df = pd.read_csv('./gender_submission.csv')
test_df = pd.read_csv('./test.csv')
train_df = pd.read_csv('./train.csv')

# Print the shape of each DataFrame
print("gender_submission shape:", gender_submission_df.shape)
print("test shape:", test_df.shape)
print("train shape:", train_df.shape)

# Display the first few rows of each DataFrame
print("gender_submission head:")
print(gender_submission_df.head())

print("test head:")
print(test_df.head())

print("train head:")
print(train_df.head())
#df = pd.read_csv('./gender_submission.csv''./test.csv''./train.csv')
#print(df.shape)
#df.head()

gender_submission shape: (418, 2)
test shape: (418, 11)
train shape: (891, 12)
gender_submission head:
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
test head:
   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315

In [5]:
# Drop rows with missing values in specific columns
columns_to_check = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
df = train_df.dropna(subset=columns_to_check)

# Display the shape of the resulting DataFrame
print(df.shape)


(714, 12)


In [7]:
# Assuming 'Pclass' is equivalent to 'ticketclass'
grouped = df.groupby('Pclass').sum()

# Display the grouped data
print(grouped)


        PassengerId  Survived  \
Pclass                          
1             86297       122   
2             77358        83   
3            156633        85   

                                                     Name  \
Pclass                                                      
1       Cumings, Mrs. John Bradley (Florence Briggs Th...   
2       Nasser, Mrs. Nicholas (Adele Achem)Hewlett, Mr...   
3       Braund, Mr. Owen HarrisHeikkinen, Miss. LainaA...   

                                                      Sex      Age  SibSp  \
Pclass                                                                      
1       femalefemalemalefemalemalemalemalemalemalefema...  7111.42     84   
2       femalefemalemalemalemalefemalefemalefemalefema...  5168.83     74   
3       malefemalemalemalefemalefemalemalemalefemalema...  8924.92    208   

        Parch                                             Ticket        Fare  \
Pclass                                                        

In [33]:
# Assuming 'Pclass' is equivalent to 'class'
df = pd.get_dummies(df, columns=['Pclass', 'Sex'], drop_first=True)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Pclass_2,Pclass_3,Sex_male
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,S,False,True,True
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,False,False,False
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,S,False,True,False
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,S,False,False,False
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,S,False,True,True


In [60]:
y = df['Survived']  # Correct column name
x = df.drop(['Survived'], axis=1)  # Drop the 'Survived' column and use the rest as features

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=10)


In [93]:
# Assuming you have one-hot encoded columns 'Pclass_2', 'Pclass_3', 'Sex_male'
feature_columns = ['Pclass_2', 'Pclass_3', 'Sex_male', 'Age', 'SibSp', 'Parch', 'Fare']
x = df[feature_columns]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=10)


In [94]:
m = MLPClassifier(max_iter=1000).fit(xtrain, ytrain)

In [95]:
MLPClassifier().get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [96]:
pred = m.predict(xtest)
print(metrics.accuracy_score(ytest, pred))

0.8156424581005587


In [97]:
print(metrics.confusion_matrix(ytest, pred))

[[91 12]
 [21 55]]


In [98]:
print(metrics.classification_report(ytest, pred))

              precision    recall  f1-score   support

           0       0.81      0.88      0.85       103
           1       0.82      0.72      0.77        76

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179



In [99]:
# alternative

print('Accuracy', metrics.accuracy_score(ytest, pred))
print('Precision', metrics.accuracy_score(ytest, pred))
print('Recall', metrics.recall_score(ytest, pred))
print('F1 Score', metrics.f1_score(ytest,pred))


Accuracy 0.8156424581005587
Precision 0.8156424581005587
Recall 0.7236842105263158
F1 Score 0.7692307692307693


In [100]:
import joblib
from sklearn.model_selection import GridSearchCV

In [102]:
def best(result):
    print('Best parameters: {}\n'.format(result.best_params_))
    mean = result.cv_results_['mean_test_score']
    sd = result.cv_results_['std_test_score']
    for m, s, p in zip(mean, sd, result.cv_results_['params']):
        print('{} (+/- {}) for {}'.format(m, s*2, p))

# Call the function with your result object
# best(your_result_object)


In [104]:
parameters = {
    'hidden_layer_sizes': [(18,), (20,), (30,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

cv = GridSearchCV(m, parameters, cv=5)
cv.fit(xtrain, ytrain)
best(cv)


Best parameters: {'activation': 'logistic', 'hidden_layer_sizes': (18,), 'learning_rate': 'invscaling'}

0.7626168224299065 (+/- 0.05365495362395262) for {'activation': 'relu', 'hidden_layer_sizes': (18,), 'learning_rate': 'constant'}
0.7906542056074767 (+/- 0.048164855053178045) for {'activation': 'relu', 'hidden_layer_sizes': (18,), 'learning_rate': 'invscaling'}
0.7775700934579439 (+/- 0.03624433538255199) for {'activation': 'relu', 'hidden_layer_sizes': (18,), 'learning_rate': 'adaptive'}
0.7794392523364484 (+/- 0.05098385681116209) for {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'constant'}
0.7887850467289719 (+/- 0.03466773269344189) for {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'invscaling'}
0.794392523364486 (+/- 0.04728639491840568) for {'activation': 'relu', 'hidden_layer_sizes': (20,), 'learning_rate': 'adaptive'}
0.7757009345794392 (+/- 0.05544821298763113) for {'activation': 'relu', 'hidden_layer_sizes': (30,), 'learning_



In [105]:
cv.best_estimator_

In [106]:
df = pd.read_csv('./gender_submission.csv')
df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [108]:
print(df.columns)


Index(['PassengerId', 'Survived'], dtype='object')


In [109]:
y = df['Survived']
x = df.drop(['Survived'], axis=1)  # Drop 'Survived' column from features

xtrain, xtest, ytrain, ytest = train_test_split(x, y)


In [110]:
m1 = MLPRegressor(max_iter=1000).fit(xtrain, ytrain)

In [111]:
pred = m1.predict(xtest)
metrics.mean_squared_error(pred, ytest)


64.46609043391236