<a href="https://colab.research.google.com/github/Rohith-Potana/CodSoft-Internship/blob/main/CodSoft2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [76]:
# Read the Titanic passenger CSV from the Data Science Dojo GitHub mirror
# into a DataFrame, then preview its first five rows.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(filepath_or_buffer=url)
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [77]:
# Drop identifier-like and free-text columns that are not used as features.
# The result is rebound to `data` instead of using inplace=True, so the frame
# object created by the load cell is not mutated in place.
# NOTE: this cell overwrites the name `data`; re-run the load cell before
# re-running this one, otherwise the columns dropped here no longer exist.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing values.
# The filled Series is assigned back to the column. The previous form,
# data['col'].fillna(..., inplace=True), is a chained in-place call on a
# column selection: pandas 2.x emits a FutureWarning for it, and under
# Copy-on-Write it does not update `data` at all.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Convert categorical columns to numeric.
# Sex is encoded as male -> 0, female -> 1; Embarked is one-hot encoded with
# the first category dropped, leaving the Embarked_Q / Embarked_S indicators.
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Verify the changes: remaining null counts, the Sex encoding, and the dummies.
print(data.isnull().sum())
print(data['Sex'].value_counts())
print(data[['Embarked_Q', 'Embarked_S']].head())

# Mean of the 0/1 Survived flag per Sex code, i.e. the survival rate by sex.
survival_rate_by_sex = data.groupby('Sex')['Survived'].mean().reset_index()
print(survival_rate_by_sex)

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_Q    0
Embarked_S    0
dtype: int64
Sex
0    577
1    314
Name: count, dtype: int64
   Embarked_Q  Embarked_S
0       False        True
1       False       False
2       False        True
3       False        True
4       False        True
   Sex  Survived
0    0  0.188908
1    1  0.742038


In [78]:
# Bar chart of the per-sex survival rate with Plotly Express.
# The x axis holds the numeric Sex codes, so the ticks at 0 and 1 are
# relabelled as 'Male' and 'Female' to match the male -> 0 / female -> 1 encoding.
fig = px.bar(
    data_frame=survival_rate_by_sex,
    x='Sex',
    y='Survived',
    title='Survival Rate by Sex',
).update_xaxes(tickvals=[0, 1], ticktext=['Male', 'Female'])
fig.show()

In [79]:
# Target vector: the Survived column. Feature matrix: every other column.
y = data['Survived']
X = data.drop(columns=['Survived'])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each feature partition.
print(f"Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}")

Training set shape: (712, 8), Testing set shape: (179, 8)


In [80]:
# Fit a logistic regression classifier on the training split.
# max_iter is raised to 500 to give the solver a higher iteration limit.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Show the learned coefficients (one per feature column of X_train).
print(f"Model coefficients: {model.coef_}")

Model coefficients: [[-9.42755591e-01  2.59141782e+00 -3.14242334e-02 -2.97050457e-01
  -1.12156016e-01  2.53607566e-03 -9.07243309e-02 -4.03896519e-01]]


In [81]:
# Predict survival labels for the held-out test rows.
y_pred = model.predict(X_test)

# Compute each evaluation metric and print it straight away:
# overall accuracy, per-class precision/recall/F1, then the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

report = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report}")

conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.8100558659217877
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179

Confusion Matrix:
[[90 15]
 [19 55]]
