In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
df_train = pd.read_csv(r"/kaggle/input/titanic/train.csv")

In [None]:
df_test = pd.read_csv(r"/kaggle/input/titanic/test.csv")

In [None]:
df_gender_submission = pd.read_csv(r"/kaggle/input/titanic/test.csv")

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# (rows, columns) of the test frame
df_test.shape

In [None]:
# (rows, columns) of the frame stored in df_gender_submission
df_gender_submission.shape

In [None]:
# Column labels of the training frame
df_train.columns

In [None]:
# Column labels of the test frame
df_test.columns

In [None]:
# Tally of missing (True) vs. present (False) 'Cabin' values in the training frame
df_train['Cabin'].isna().value_counts()

In [None]:
# Same missing/present tally for 'Cabin' in the test frame
df_test['Cabin'].isna().value_counts()

In [None]:
# Number of training rows for each 'Survived' value
df_train['Survived'].value_counts()

In [None]:
# Count missing values in 'Cabin' column for rows where 'Survived' is False
missing_cabin_count_for_not_survived = df_train[df_train['Survived'] == 0]['Cabin'].isna().sum()

print("Number of missing 'Cabin' values for non-survivors:", missing_cabin_count_for_not_survived)

In [None]:
df_train['Embarked'].value_counts()

In [None]:
#removing the columns that are not required for prediction
columns_to_drop = ["PassengerId","Name","Embarked"]
df_train.drop(columns = columns_to_drop,inplace=True)

In [None]:
df_train.shape

In [None]:
df_test.drop(columns = ["PassengerId","Name","Embarked"],inplace=True)

In [None]:
# (rows, columns) of the test frame after dropping PassengerId/Name/Embarked
df_test.shape

In [None]:
# Remaining column labels in the training frame
df_train.columns

In [None]:
# Remaining column labels in the test frame
df_test.columns

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Impute missing values in numeric features (e.g., 'Age' and 'Fare') with mean values from the training data
mean_age = df_train['Age'].mean()
mean_fare = df_train['Fare'].mean()

df_train['Age'].fillna(mean_age, inplace=True)
df_train['Fare'].fillna(mean_fare, inplace=True)

df_train = df_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived']]

# Encode categorical variables (Sex) into numeric values
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})

# Split the dataset into features (X) and target variable (y)
X = df_train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df_train['Survived']

In [None]:
df_train.isna().value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

In [None]:
y_pred = logistic_regression.predict(X_test)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

In [None]:
accuracy

In [None]:
confusion

In [None]:
classification_report_str

In [None]:
from sklearn.model_selection import GridSearchCV

# Defining hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l2']              # Regularization type
}

# Creating a new logistic regression model
logistic_regression = LogisticRegression(solver='lbfgs', max_iter=1000)

# Performing grid search with cross-validation
grid_search = GridSearchCV(logistic_regression, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
best_params

In [None]:
best_model

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the tuned model on the held-out rows
y_pred = best_model.predict(X_test)

# Apply each metric to the same (truth, prediction) pair
accuracy, confusion, classification_report_str = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Report the results
print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{confusion}')
print('Classification Report:\n', classification_report_str)

In [None]:
df_test['Sex'].value_counts()

In [None]:
# Encode categorical variables (Sex) into numeric values
df_test['Sex'] = df_test['Sex'].map({'male': 0, 'female': 1})

In [None]:
# Assuming you have loaded and preprocessed your test dataset as 'test_data'

# Impute missing values in numeric features (e.g., 'Age' and 'Fare') with mean values from the training data
mean_age = df_test['Age'].mean()
mean_fare = df_test['Fare'].mean()

df_test['Age'].fillna(mean_age, inplace=True)
df_test['Fare'].fillna(mean_fare, inplace=True)


# Ensure that 'test_data' contains only the same features used during training
# Drop any additional columns that are not needed for prediction
df_test = df_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

In [None]:
df_test = df_test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
# Make predictions using the trained Logistic Regression model
test_predictions = best_model.predict(df_test)

# 'test_predictions' now contains the predicted values (0 or 1) for each data point in the test dataset

df_test['Survived'] = test_predictions
#["PassengerId","Name","Embarked"]
df_prepare = pd.read_csv(r"/kaggle/input/titanic/test.csv")
df_test['Embarked'] = df_prepare['Embarked']
df_test['PassengerId'] = df_prepare['PassengerId']
df_test['Name'] = df_prepare['Name']


df_test.to_csv('test_predictions.csv', index=False)

In [None]:
df_test