In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


# Read Data

In [None]:
# Load titanic.csv (path relative to the working directory) into a DataFrame.
Data=pd.read_csv("titanic.csv")

In [None]:
# Preview the first 10 rows.
Data.head(10)

In [None]:
# Preview the last 5 rows.
Data.tail(5)

## Showing statistical info

In [None]:
# Count the missing values in each column.
Data.isna().sum()

In [None]:
# Call info() so the column dtypes and non-null counts are actually printed;
# the bare attribute only displays the bound-method repr.
Data.info()

In [None]:
# Summary statistics for the numeric columns.
Data.describe()

In [None]:
# (rows, columns) of the frame.
Data.shape

In [None]:
# Column labels of the frame.
Data.columns

## Data cleaning

In [None]:
# Number of missing values in the Age column before imputation.
Data["Age"].isna().sum()

In [None]:
# Replace the missing ages with the mean of the Age column.
mean_age = Data["Age"].mean()
Data["Age"] = Data["Age"].fillna(mean_age)

In [None]:
# Re-count the missing ages after the mean imputation.
Data["Age"].isna().sum()

In [None]:
# Remove the Cabin column from the frame.
Data = Data.drop(columns="Cabin")

### Categorize the columns based on data type

In [None]:
# Names of the columns stored with the object dtype.
obj_col = [name for name, dtype in Data.dtypes.items() if dtype == object]
obj_col

In [None]:
# Names of the columns whose dtype is one of the listed float/int dtypes.
numeric_dtypes = ['float32', 'float64', 'int32', 'int64']
num_col = [name for name, dtype in Data.dtypes.items() if dtype in numeric_dtypes]
num_col

In [None]:
# duplicated() must be called to flag repeated rows; summing the boolean
# flags gives the number of duplicate rows present before they are dropped.
Data.duplicated().sum()

In [None]:
# Re-wrap Data in a DataFrame.
# NOTE(review): Data already comes from pd.read_csv, so this looks redundant — confirm.
Data=pd.DataFrame(Data)

In [None]:
# Remove fully duplicated rows. The index is not reset here, so any removed
# rows leave gaps in the index labels.
Data=Data.drop_duplicates()

In [None]:
# Count the rows still flagged as duplicates after drop_duplicates().
Data.duplicated().sum()

In [None]:
# Number of distinct values in each column.
Data.nunique()

## Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar plot of the mean of Survived for each value of Sex.
font = {"size": 15, "weight": "bold", "color": "green"}
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=Data, x="Sex", y="Survived", palette="viridis", ax=ax)
ax.set_xlabel("Gender", fontdict=font)
ax.set_ylabel("Result of survived", fontdict=font)
ax.set_yticks([0, 1])
ax.set_title("Figure that show the relation between gender and survives or not", fontdict=font)


In [None]:
# Bar plot of the mean of Survived for each value of Pclass.
font = {"size": 15, "weight": "bold", "color": "green"}
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=Data, x="Pclass", y="Survived", palette="plasma", ax=ax)
ax.set_xlabel("Pclass", fontdict=font)
ax.set_ylabel("Result of survived", fontdict=font)
ax.set_yticks([0, 1])
ax.set_title("Figure that show the relation between pclass and survives or not", fontdict=font)

In [None]:
# Bar plot of the mean of Survived for each value of Embarked.
font = {"size": 15, "weight": "bold", "color": "green"}
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(data=Data, x="Embarked", y="Survived", palette="magma", ax=ax)
ax.set_xlabel("Embarked", fontdict=font)
ax.set_ylabel("Result of survived", fontdict=font)
ax.set_yticks([0, 1])
ax.set_title("Figure that show the relation between Embarked and survives or not", fontdict=font)

#  Data preprocessing

In [None]:
# Show the object and numeric column lists before choosing the columns to
# drop (for example the name and ticket columns).
for column_group in (obj_col, num_col):
    print(column_group)


In [None]:
# Columns excluded from modelling. Age and Fare are dropped here as well,
# alongside the Ticket, Name and PassengerId columns.
col_drop = ["Ticket", "Name", "PassengerId", "Fare", "Age"]
Data = Data.drop(columns=col_drop)

In [None]:
# Preview the first 10 rows; this cell only displays the frame, it does not convert anything.
Data.head(10)

In [None]:
import sklearn
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Create both encoders, then replace each distinct Sex value with the
# integer code that the label encoder assigns to it.
Lb = LabelEncoder()
On = OneHotEncoder()
sex_codes = Lb.fit_transform(Data['Sex'])
Data['Sex'] = sex_codes

In [None]:
# One-hot encode the Embarked column and replace it with its indicator columns.
column_to_encode = "Embarked"

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current name first and fall back for older releases,
# so the cell runs on either side of the rename.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit on the column as an (n_rows, 1) array and get a dense indicator array back.
encoded_array = encoder.fit_transform(Data[[column_to_encode]].to_numpy())

# Reuse Data's own index for the encoded frame. Without it the new frame gets
# a fresh 0..n-1 index, and the column-wise concat below misaligns rows (and
# introduces NaN rows) whenever Data's index has gaps, e.g. after
# drop_duplicates().
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out([column_to_encode]),
    index=Data.index,
)

# Swap the original column for the indicator columns.
Data = pd.concat([Data.drop(columns=column_to_encode), encoded_df], axis=1)

# Display the updated DataFrame
Data


In [None]:
# Inspect the last 5 rows after encoding.
Data.tail(5)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
fig, ax = plt.subplots(figsize=(8, 8))
correlations = Data.corr()
sns.heatmap(correlations, annot=True, cmap="Blues", ax=ax)
plt.show()

## **Model Building**

In [None]:
# import lib of algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import utils
from sklearn.metrics import accuracy_score  ,  mean_squared_error

In [None]:
# Shuffle the rows so later steps do not depend on the original row order.
# random_state is fixed so that Restart & Run All reproduces the same order
# (and therefore the same downstream metrics).
Data = utils.shuffle(Data, random_state=42)
Data

In [None]:
# Check for missing values in each column before model training.
Data.isna().sum()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

# L2-regularised logistic regression (liblinear solver, C=0.01).
lg = LogisticRegression(penalty='l2', solver='liblinear', C=0.01)

# Features are every column except the target.
X = Data.drop(["Survived"], axis=1)
target_col = ["Survived"]
Y = Data[target_col]

# Hold out 20% of the rows, then split that hold-out in half:
# the result is 80% train, 10% validation and 10% test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True,
                                                    random_state=42, test_size=0.2)
X_val, X_test, Y_val, Y_test = train_test_split(X_test, Y_test, test_size=0.5,
                                                random_state=42, shuffle=True)

# Fit the scaler on the training split only, then apply the same scaling to
# every split the model is evaluated on. The validation split must be scaled
# too, otherwise the model is scored on features it was never trained to read.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Y_train is a single-column frame; flatten it to the 1-D target that
# scikit-learn estimators expect (avoids the column-vector warning).
y_train_1d = Y_train.to_numpy().ravel()
model_lg = lg.fit(X_train, y_train_1d)

predictions = model_lg.predict(X_test)
print(f"train data_shape={X_train.shape}")
print("\n")
print(f"test data_shape={X_test.shape}")
print("\n")


# Validate the model on the (scaled) validation set
val_predictions = lg.predict(X_val)
val_acc = accuracy_score(Y_val, val_predictions)
print(f"Validation Accuracy: {val_acc*100:.2f}")

# Evaluate the model on the test set
test_predictions = predictions
test_acc = accuracy_score(Y_test, test_predictions)
print(f"Test Accuracy: {test_acc*100:.2f}")
print("\n")

# Cross-validation on the training split. `cv_mean_acc` holds the per-fold
# scores (the name is kept because the next cell reads it).
cv_mean_acc = cross_val_score(lg, X_train, y_train_1d, cv=2)
fold_scores = ", ".join(f"{score * 100:.2f}" for score in cv_mean_acc)
print(f"Cross-validation scores: {fold_scores}")
print(f"Mean CV accuracy: {cv_mean_acc.mean() * 100:.2f}")
print("\n")

# Additional evaluation metric: mean squared error between the test labels
# and the predicted labels.
mse = mean_squared_error(Y_test, test_predictions)
print(f"Mean Squared Error: {mse*100:.2f}")


In [None]:
# Metric labels and their values, in matching order (values are scaled by 100).
metrics = ["Validation Accuracy", "Test Accuracy", "Cross-validation Accuracy", "Mean Squared Error"]
accuracy_values = [val_acc*100, test_acc*100, cv_mean_acc.mean() * 100, mse*100]

# One row per metric.
df = pd.DataFrame({'Metrics': metrics, 'Accuracy': accuracy_values})

# Render the values as a single-column annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.set_index('Metrics'), annot=True, fmt=".2f", cmap='YlGnBu', ax=ax)
plt.show()

In [None]:
import joblib

# Save the fitted logistic-regression model to disk with joblib.
# NOTE(review): only `lg` is persisted; the StandardScaler fitted on the
# training data is not saved, so it would have to be re-created to score new data.
joblib.dump(lg, 'logistic rg_model.pkl')
