<a href="https://colab.research.google.com/github/TejashreeKarekar/DS_Projects/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Load data and select relevant columns
df = pd.read_csv("tested.csv")
df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Handle outliers using a function
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper_bound if x > upper_bound else (lower_bound if x < lower_bound else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using PCA
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train logistic regression model
logr = LogisticRegression()
logr.fit(X_train, y_train)

# Predict on test data
y_pred = logr.predict(X_test)

# Print evaluation metrics
print('Logistic Regression Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))



Logistic Regression Accuracy: 0.53125
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.61      0.55        76
           1       0.57      0.46      0.51        84

    accuracy                           0.53       160
   macro avg       0.54      0.53      0.53       160
weighted avg       0.54      0.53      0.53       160

Confusion Matrix:
 [[46 30]
 [45 39]]


In [3]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Continue with the same dataframe `df` loaded previously

# No need to handle outliers and imbalance again
# PCA reduction is also not necessary for Naive Bayes

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train Naive Bayes classifier
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict on test data
y_pred = nb.predict(X_test)

# Print evaluation metrics
print('Naive Bayes Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))



Naive Bayes Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.70      0.62        76
           1       0.65      0.51      0.57        84

    accuracy                           0.60       160
   macro avg       0.61      0.60      0.60       160
weighted avg       0.61      0.60      0.60       160

Confusion Matrix:
 [[53 23]
 [41 43]]


In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Continue with the same dataframe `df` loaded previously

# No need to handle outliers and imbalance again
# PCA reduction is also not necessary for KNN

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict on test data
y_pred = knn.predict(X_test)

# Print evaluation metrics
print('KNN Accuracy:', accuracy_score(y_test, y_pred))


KNN Accuracy: 0.70625


In [5]:

#Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Continue with the same dataframe `df` loaded previously

# No need to handle outliers and imbalance again
# PCA reduction is also not necessary for Decision Tree

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train Decision Tree classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Predict on test data
y_pred = dt.predict(X_test)

# Print evaluation metrics
print('Decision Tree Accuracy:', accuracy_score(y_test, y_pred))

Decision Tree Accuracy: 0.89375


In [6]:
#Random Tree
from sklearn.ensemble import RandomForestClassifier

# Continue with the same dataframe `df` loaded previously

# No need to handle outliers and imbalance again
# PCA reduction is also not necessary for Random Forest

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train Random Forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Predict on test data
y_pred = rf.predict(X_test)

# Print evaluation metrics
print('Random Forest Accuracy:', accuracy_score(y_test, y_pred))


Random Forest Accuracy: 0.85625


In [7]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Continue with the same dataframe `df` loaded previously

# No need to handle outliers and imbalance again
# PCA reduction is also not necessary for Gradient Boosting

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Initialize and train Gradient Boosting classifier
gbm = GradientBoostingClassifier(n_estimators=10)
gbm.fit(X_train, y_train)

# Predict on test data
y_pred = gbm.predict(X_test)

# Print evaluation metrics
print('Gradient Boosting Accuracy:', accuracy_score(y_test, y_pred))

Gradient Boosting Accuracy: 0.675
