In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train_data = pd.read_csv('train.csv')



#One hot encoding 
train_data = pd.get_dummies(train_data, columns=['Sex', 'Embarked', 'Pclass'], dtype=int)

#Handle Missing data 
train_data['Age']=train_data['Age'].fillna(0)
train_data['Cabin'] = train_data['Cabin'].fillna(train_data['Age'].mean())
train_data['Fare'] = train_data['Fare'].fillna(train_data['Fare'].median())  # 'Fare' may have a different distribution



#Corelation between Fare and survival
correlation = train_data[['Fare', 'Survived']].corr()


# #z-score normalazation for logistic regresion 
# train_data['Age'] = zscore(train_data['Age'])
# train_data['Fare'] = zscore(train_data['Fare'])
#Analysis if someone with cabin has more survival chances 

train_data['HasCabin'] = train_data['Cabin'].apply(lambda x: 1 if isinstance(x, str) and len(x) > 0 else 0)
contingency_table = pd.crosstab(train_data['HasCabin'], train_data['Survived'])
chi2, p, dof, expected = chi2_contingency(contingency_table)



test_data = pd.read_csv('test.csv')



# Perform the same preprocessing steps as with the training data:
test_data = pd.get_dummies(test_data, columns=['Sex', 'Embarked', 'Pclass'], dtype=int)
test_data['Age'] = test_data['Age'].fillna(train_data['Age'].mean())  # Use the training data's mean for consistency
test_data['Cabin'] = test_data['Cabin'].fillna(0)
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())  # 'Fare' may have a different distribution
# Add HasCabin to the test dataset
test_data['HasCabin'] = test_data['Cabin'].apply(lambda x: 1 if isinstance(x, str) and len(x) > 0 else 0)


# Ensure the columns match between train and test datasets
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)  # X.columns comes from your train dataset





#TRAIN THE MODEL WITH LOGISTIC REGRESSION 

#Separate features x and y (x is the train and y is only the survival) 

X = train_data.drop(columns=['Survived','Name','Cabin','Ticket'])
y = train_data['Survived']

#Spliting data into training and validation (80/20)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2)

#Initialize the Logistic Regression 
model = LogisticRegression(max_iter=500)

#Train model 
model.fit(X_train, y_train)

#Predict on validation 

y_pred = model.predict(X_val)



# Evaluate performance
print("Accuracy on Validation Set: ", accuracy_score(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

test_data = test_data.drop(columns=['Name','Cabin','Ticket','Survived'])
test_prediction = model.predict(test_data) 

test_results = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_prediction
})

test_results.to_csv('submission.csv', index=False)

test_results


print(test_data)


Accuracy on Validation Set:  0.7597765363128491
[[84 20]
 [23 52]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       104
           1       0.72      0.69      0.71        75

    accuracy                           0.76       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.76      0.76       179

     PassengerId        Age  SibSp  Parch      Fare  Sex_female  Sex_male  \
0            892  34.500000      0      0    7.8292           0         1   
1            893  47.000000      1      0    7.0000           1         0   
2            894  62.000000      0      0    9.6875           0         1   
3            895  27.000000      0      0    8.6625           0         1   
4            896  22.000000      1      1   12.2875           1         0   
..           ...        ...    ...    ...       ...         ...       ...   
413         1305  23.799293      0      0    8.0500           0    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
