In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Titanic training set straight from the public GitHub mirror
# (requires network access when the cell is run).
titanic_data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/dsrscientist/dataset1/master/titanic_train.csv",
)


In [None]:
# Exploratory checks on the loaded data.
# A notebook cell only auto-renders its *last* expression, so every
# intermediate result is passed to display() (injected into the namespace by
# the IPython kernel); as bare expressions they would be silently discarded.

# Check the head of the dataset
display(titanic_data.head())

# Check for missing values (count per column)
display(titanic_data.isnull().sum())

# Check the data types of each column (info() prints its own report)
titanic_data.info()

# Statistical summary of the dataset
display(titanic_data.describe())

# Check the distribution of the target variable
ax = sns.countplot(x='Survived', data=titanic_data)
ax.set(
    title='Distribution of the target variable (Survived)',
    xlabel='Survived',
    ylabel='Number of passengers',
)
plt.show()


In [None]:
# Clean the data and encode the categorical columns for modelling.
# Each step rebinds `titanic_data` rather than mutating it with
# `inplace=True`; the name itself is kept because later cells read it.

# Drop the columns with high missing values
titanic_data = titanic_data.drop(columns=['Cabin'])

# Fill missing Age with the column mean and missing Embarked with the mode.
# The filled columns are assigned back explicitly: calling
# `df[col].fillna(..., inplace=True)` acts on a temporary Series, which pandas
# warns about and which leaves the frame untouched under Copy-on-Write.
# NOTE(review): the mean/mode are computed on the full dataset, i.e. before
# the train/test split, so held-out rows influence the imputed values.
titanic_data = titanic_data.assign(
    Age=titanic_data['Age'].fillna(titanic_data['Age'].mean()),
    Embarked=titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0]),
)

# Convert categorical columns to indicator columns; drop_first=True removes
# the first level of each so the indicators are not perfectly collinear.
sex = pd.get_dummies(titanic_data['Sex'], drop_first=True)
embark = pd.get_dummies(titanic_data['Embarked'], drop_first=True)

# Concatenate the newly created columns to the dataset
titanic_data = pd.concat([titanic_data, sex, embark], axis=1)

# Drop the unnecessary columns (identifiers, free text, and the categorical
# originals that the indicator columns now replace)
titanic_data = titanic_data.drop(
    columns=['PassengerId', 'Sex', 'Embarked', 'Name', 'Ticket']
)

# Fail early with a clear message instead of letting NaNs reach the model fit.
assert not titanic_data.isnull().any().any(), \
    "missing values remain after preprocessing"


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictor columns
y = titanic_data['Survived']
X = titanic_data.drop(columns=['Survived'])

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# max_iter is raised above scikit-learn's default of 100: the features are
# not standardised anywhere in this notebook, and on unscaled inputs the
# solver can reach the iteration limit before converging (ConvergenceWarning).
logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out rows, then print the per-class
# classification report comparing them with the true labels.
predictions = logmodel.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
