In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [20]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

In [22]:
def improved_preprocess(df):

    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Mlle','Ms'],'Miss')
    df['Title'] = df['Title'].replace(['Mme'],'Mrs')
    rare_titles = df['Title'].value_counts()[df['Title'].value_counts() < 10].index
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')


    df['Cabin'] = df['Cabin'].astype(str).str[0]
    df['Cabin'] = df['Cabin'].replace("n","U")


    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)


    df = df.drop(['PassengerId','Name','Ticket'], axis=1, errors='ignore')
    return df

train_imp = improved_preprocess(train.copy())
X_imp = train_imp.drop("Survived", axis=1)
y_imp = train_imp["Survived"]


numeric_features = X_imp.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_imp.select_dtypes(exclude=[np.number]).columns.tolist()


preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imputer", KNNImputer(n_neighbors=3)),
                          ("scaler", StandardScaler())]), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)


models = {
    "Support Vector Machines": SVC(),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": GaussianNB(),
    "Perceptron": Perceptron(max_iter=1000),
    "Stochastic Gradient Decent": SGDClassifier(max_iter=1000, tol=1e-3),
    "Linear SVC": LinearSVC(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier()
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("\nQ2: Improved Preprocessing Accuracies")
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
    scores = cross_val_score(pipe, X_imp, y_imp, cv=cv)
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)



Q2: Improved Preprocessing Accuracies
Support Vector Machines: 0.8328 ± 0.0105
KNN: 0.8126 ± 0.0223
Logistic Regression: 0.8294 ± 0.0128
Random Forest: 0.8114 ± 0.0248
Naive Bayes: 0.7811 ± 0.0396
Perceptron: 0.7834 ± 0.0198
Stochastic Gradient Decent: 0.7811 ± 0.0178
Linear SVC: 0.8283 ± 0.0122
Decision Tree: 0.7710 ± 0.0331
