In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the training CSV into a DataFrame and print its column summary.
# NOTE: the frame is called `testing` throughout the notebook even though it
# holds the contents of train.csv.
train_csv_path = '/kaggle/input/traindata/train.csv'
testing = pd.read_csv(train_csv_path)
testing.info()

In [None]:
# Summary statistics of the numeric columns, displayed as the cell output
numeric_summary = testing.describe()
numeric_summary

In [None]:
# Drop the Cabin column since a significant number of rows have no value.
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because Cabin has already been removed.
testing = testing.drop(columns="Cabin", errors="ignore")
# Preview the first rows only instead of rendering the whole frame.
testing.head()

In [None]:
# Histogram of the Age column, drawn on an explicitly created axes
fig, ax = plt.subplots()
sns.histplot(testing['Age'], ax=ax)
plt.show()

In [None]:
# Replace missing Age values with the column mean, then store Age as integers
# (astype(int) truncates any fractional part, including that of the mean).
mean_age = testing['Age'].mean()
testing['Age'] = testing['Age'].fillna(mean_age).astype(int)

In [None]:
# Discard the rows whose Embarked value is missing (the index is not reset),
# then show the summary statistics of what remains.
testing = testing.dropna(subset=['Embarked'])
testing.describe()

In [None]:
# Survived plotted against each continuous feature: Age on the left panel,
# Fare on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
for axis, column in zip(ax, ('Age', 'Fare')):
    sns.scatterplot(x=testing[column], y=testing['Survived'], ax=axis)
plt.show()

In [None]:
# Relationship between each categorical feature and survival: one count plot
# of Survived per feature, split by that feature's values.
categorical_columns = ['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']
fig, ax = plt.subplots(3, 2, figsize=(10, 5))
# ax.flat walks the grid row by row, so every plot keeps its original panel.
for axis, column in zip(ax.flat, categorical_columns):
    sns.countplot(x=testing['Survived'], hue=testing[column], ax=axis)
# Five plots on a 3x2 grid leave the last panel unused; hide it rather than
# drawing an empty frame.
ax[2, 1].set_visible(False)
# Stop tick labels and axis titles of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Transform Sex and Embarked into integer codes.
# One encoder instance is refitted per column, so after the loop `le` only
# remembers the Embarked classes.
le = LabelEncoder()
for column in ('Sex', 'Embarked'):
    testing[column] = le.fit_transform(testing[column])
testing.info()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns
correlations = testing.corr(numeric_only=True)
plt.figure(figsize=(20, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Ordered 70/30 hold-out split: shuffle=False keeps the current row order, so
# the leading 70% of rows train the model and the trailing 30% evaluate it.
features = ['Pclass', 'Age', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = ['Survived']
predictors = testing[features]
labels = testing[target]  # list selector, so this is a one-column DataFrame
x_train, x_test, y_train, y_test = train_test_split(
    predictors, labels, train_size=0.7, test_size=0.3, shuffle=False
)
print("X_train: {}, X_test: {}".format(x_train.shape[0], x_test.shape[0]))
print("Y_train: {}, Y_test: {}".format(y_train.shape[0], y_test.shape[0]))

In [None]:
# Fit the logistic regression model on standardized features.
#
# The previous version computed scaled copies (X_train / X_test) but then
# fitted the model on the raw x_train, so the scaling was never used. Bundling
# the scaler and the classifier in a Pipeline guarantees that the same
# standardization (fitted on the training rows only) is applied both when
# fitting and whenever model.predict() is called on raw feature frames such
# as x_test.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression()),
])
# y_train is a one-column DataFrame; flatten it to the 1-D array scikit-learn
# expects to avoid a DataConversionWarning.
model.fit(x_train, y_train.values.ravel())

# Expose the fitted scaler and the scaled matrices under their original names
# for inspection or for any cell that reads them.
scaler = model.named_steps["scaler"]
X_train = scaler.transform(x_train)
X_test = scaler.transform(x_test)

# Display the fitted estimator as the cell output.
model

In [None]:
# Predict Survived for the held-out rows and put the actual and predicted
# labels side by side; the cell output lists only the rows where they agree.
y_pred = model.predict(x_test)
paired_labels = np.column_stack([y_test, y_pred])
difference = pd.DataFrame(paired_labels, columns=["Actual_Value", "Predicted_Value"])
is_match = difference['Actual_Value'] == difference['Predicted_Value']
difference[is_match]


In [None]:
# Score the predictions: print the overall accuracy, then display the
# confusion matrix as the cell output.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
cm

In [None]:
# Compare how many held-out rows fall in each Survived class according to the
# actual labels versus the model's predictions.
#
# The previous version overlaid a categorical countplot and a numeric histplot
# on the same axes, so the two sets of bars did not line up, and the label=
# arguments never showed because no legend was created. Stacking both label
# vectors into one long-form frame lets a single countplot draw aligned,
# side-by-side bars with an automatic legend.
comparison = pd.DataFrame({
    "Survived": np.concatenate([np.ravel(y_test), np.ravel(y_pred)]),
    "Source": ["Actual_Value"] * len(y_test) + ["Predicted_Value"] * len(y_pred),
})
sns.countplot(
    data=comparison,
    x="Survived",
    hue="Source",
    # Same colours as before: red for actual, blue for predicted.
    palette={"Actual_Value": "red", "Predicted_Value": "blue"},
)
plt.title("Logistic Regression Model")
plt.show()