In [1]:
#1. Logistic Regression
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.naive_bayes import GaussianNB

df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using Principle component analysis
pca = PCA(n_components=2)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

logr = LogisticRegression()

logr.fit(X_train, y_train)

y_pred = logr.predict(X_test)

# Print evaluation metrics
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))



Accuracy: 0.53125
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.61      0.55        76
           1       0.57      0.46      0.51        84

    accuracy                           0.53       160
   macro avg       0.54      0.53      0.53       160
weighted avg       0.54      0.53      0.53       160

Confusion Matrix:
 [[46 30]
 [45 39]]


In [2]:
#2. Naive Bayes
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using Principle component analysis
pca = PCA(n_components=2)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
nb=GaussianNB()
nb.fit(X_train,y_train)

y_pred1=nb.predict(X_test)

print("Naive Bayes: ",accuracy_score(y_test,y_pred1))


Naive Bayes:  0.6


In [4]:
#3. KNN
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using PCA
pca = PCA(n_components=2)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Print evaluation metrics
print('Accuracy:', accuracy_score(y_test, y_pred))

Accuracy: 0.70625


In [5]:
#4. Decision Tree

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dt=tree.DecisionTreeClassifier()
df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using Principle component analysis
pca = PCA(n_components=2)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

train=dt.fit(X_train,y_train)

y_pred=dt.predict(X_test)

print(accuracy_score(y_test,y_pred))


0.9


In [6]:
# 5. Random Forest

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

rf=RandomForestClassifier()
df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using Principle component analysis
pca = PCA(n_components=2)
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
train=rf.fit(X_train,y_train)

y_pred=rf.predict(X_test)

print(accuracy_score(y_test,y_pred))

0.85


In [7]:
#6. GRADIENT BOOSTING
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

df = pd.read_csv('tested.csv')

df = df[['Pclass', 'Sex', 'Survived', 'Age']]

df['Age'].fillna(df['Age'].median(), inplace=True)

# Convert categorical variables into numeric variables
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# Feature selection
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']

# Identify and handle outliers using Interquartile Range (IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[column] = df[column].apply(lambda x: upper if x > upper else (lower if x < lower else x))

handle_outliers(df, 'Age')

# Handle data imbalance using SMOTE
smote = SMOTE(random_state=0)
X, y = smote.fit_resample(X, y)

# Dimensionality reduction using PCA
pca = PCA(n_components=2)
X = pca.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

gbm = GradientBoostingClassifier(n_estimators=10)

gbm.fit(X_train, y_train)

y_pred = gbm.predict(X_test)

# Print evaluation metrics
print("GBM Accuracy:", accuracy_score(y_test, y_pred))

GBM Accuracy: 0.675
