In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
%matplotlib inline

In [2]:
# Load the training and test data from the working directory and seed NumPy's
# global RNG so that NumPy-driven randomness is repeatable between runs.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# np.random.seed is a function and has to be *called*. The previous
# `np.random.seed=0` rebound the attribute to the integer 0: nothing was
# seeded and the function itself was lost for the rest of the session.
np.random.seed(0)


In [None]:
# Column dtypes and non-null counts of the raw training frame.
train_df.info()

In [None]:
# Summary statistics of the raw training frame.
train_df.describe()

In [None]:
# Summary statistics of the raw test frame, for comparison with the training frame.
test_df.describe()

In [6]:
# Outlier removal was tried but not kept for the final model: the author
# observed that predictions overfit when it was used.
from collections import Counter


def detect_outliers(df, columns, n=2):
    """Return the index labels of rows that are IQR outliers in more than ``n`` columns.

    A value is an outlier for its column when it lies more than 1.5 * IQR
    below the first quartile or above the third quartile of that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    columns : iterable of str
        Numeric columns of ``df`` to test.
    n : int, default 2
        A row is reported only when it is an outlier in strictly more than
        ``n`` of the given columns (the original hard-coded threshold).

    Returns
    -------
    list
        Index labels of the qualifying rows, in order of first detection.
    """
    flagged = []
    for c in columns:
        # nanpercentile skips missing values. np.percentile returns NaN as soon
        # as the column holds a single NaN, which makes both comparisons below
        # False for every row, so such a column could never flag an outlier.
        Q1 = np.nanpercentile(df[c], 25)
        Q3 = np.nanpercentile(df[c], 75)
        # Outlier fence: 1.5 times the interquartile range.
        detect = 1.5 * (Q3 - Q1)
        is_outlier = (df[c] < Q1 - detect) | (df[c] > Q3 + detect)
        flagged.extend(df[is_outlier].index)
    # Count in how many columns each row was flagged.
    hits_per_row = Counter(flagged)
    return [label for label, hits in hits_per_row.items() if hits > n]

In [None]:
# Show the training rows that detect_outliers flags when scanning
# Age, SibSp, Parch and Fare.
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"]),:]

In [None]:
# Remove the rows flagged as multi-column outliers, then renumber the index
# from zero so later positional and label lookups agree.
train_df = (
    train_df
    .drop(index=detect_outliers(train_df, ["Age", "SibSp", "Parch", "Fare"]))
    .reset_index(drop=True)
)

In [None]:
# Scatter Age against the sibling/spouse count and then against passenger
# class. Small, mostly transparent markers keep overlapping points readable.
plt.scatter(x = train_df.Age, y = train_df.SibSp, s=5, alpha = 0.2)
plt.xlabel("Age")
plt.ylabel("Count of siblings and spouses")
plt.show()
plt.clf()
plt.scatter(x = train_df.Age, y = train_df.Pclass, s=5, alpha = 0.2)
plt.xlabel("Age")
plt.ylabel("Passenger Class")
plt.show()
# clf must be called: a bare `plt.clf` only names the function, so the figure
# was never cleared and the cell echoed the function's repr as its output.
plt.clf()

In [None]:
# Percentage of missing values in each column of the training frame.
null_value_percentages = train_df.isna().sum().div(len(train_df)).mul(100)
null_value_percentages

In [10]:
# Drop every column with more than 74% missing values; per the author's note
# this removes 'Cabin', the column with the highest share of nulls.
features_to_drop = null_value_percentages.index[null_value_percentages > 74]
df = train_df.drop(columns=features_to_drop)

In [None]:
# Missing-value percentages again, now for the reduced frame. The row count is
# taken from `df` itself rather than from `train_df`, so numerator and
# denominator always describe the same frame.
null_value_percentages = (df.isna().sum() / df.shape[0]) * 100
null_value_percentages

In [None]:
# Summary statistics after dropping the high-null columns.
df.describe()

In [12]:
# Name and Ticket are non-numeric and treated as unrelated to Survived, so they
# are removed from both frames. The test frame also loses Cabin here; the
# training frame `df` was built by dropping the high-null columns already.
df = df.drop(columns=['Name', 'Ticket'])
test_df = test_df.drop(columns=['Name', 'Ticket', 'Cabin'])

In [None]:
# Summary statistics after removing Name and Ticket.
df.describe()

In [14]:
# Number of missing Embarked values: training frame first, test frame second.
print(
    df['Embarked'].isna().sum(),
    test_df['Embarked'].isna().sum(),
    sep='\n',
)

0

In [None]:
# Embarked has only 2 missing values in the training data, so they are filled
# with 'S' in the next cell; these counts show how often each port occurs.
# Both tables go through display(): a bare expression is rendered only when it
# is the last statement of a cell, so the training counts used to be discarded.
display(df['Embarked'].value_counts(), test_df['Embarked'].value_counts())

In [None]:
# Fill the missing embarkation ports with 'S'. Assign the result back instead
# of calling fillna(inplace=True) on df['Embarked']: that form acts on an
# intermediate Series, is deprecated in pandas 2.x, and leaves `df` unchanged
# once Copy-on-Write is enabled.
df['Embarked'] = df['Embarked'].fillna('S')

In [None]:
# Preview the first ten test rows after the column drops.
test_df.head(10)

In [None]:
# Summary statistics of the training frame after filling Embarked.
df.describe()

In [17]:
# used to transform feature data into integer using LabelEncoder

from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the Sex categories from the training frame, replace the column with
# their integer codes, and print the original label behind each code.
le = LabelEncoder().fit(df['Sex'])
df['Sex'] = le.transform(df['Sex'])
unique_op_sex = df['Sex'].unique()
print("Label Encoder Parameters Sex", le.inverse_transform(unique_op_sex))

In [None]:
# Encode the test Sex column with `le`, the encoder fitted on the training Sex
# column in the previous cell, instead of fitting a second encoder on the test
# data. That guarantees both frames use the same integer for the same label,
# and transform() raises on a label the training data never contained rather
# than silently renumbering the codes.
test_df['Sex'] = le.transform(test_df['Sex'])
unique_op_sex = test_df['Sex'].unique()
print("Label Encoder Parameters Sex", le.inverse_transform(unique_op_sex))

In [None]:
# Encode the training Embarked column as integers with a fresh encoder and
# print the original port behind each code. The printed label used to say
# "Sex" (copy-paste slip) although the column is Embarked.
le = LabelEncoder()
df['Embarked'] = le.fit_transform(df['Embarked'])
unique_op_embarked = df['Embarked'].unique()
print("Label Encoder Parameters Embarked", le.inverse_transform(unique_op_embarked))

In [None]:
# Encode the test Embarked column with `le` as fitted on the training Embarked
# column in the previous cell, rather than refitting on the test data, so both
# frames share one port-to-integer mapping. The printed label now names the
# column actually encoded (it used to say "Sex").
test_df['Embarked'] = le.transform(test_df['Embarked'])
unique_op_embarked = test_df['Embarked'].unique()
print("Label Encoder Parameters Embarked", le.inverse_transform(unique_op_embarked))

In [None]:
# Preview the training frame after label encoding Sex and Embarked.
df.head(10)

In [None]:
# Clustered heatmap of the pairwise correlations between the columns of df.
sns.clustermap(df.corr(), cmap="rocket_r")

In [None]:
# Box plots of individual features to look for abnormal values.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
# Age still has missing values at this point (they are imputed later with
# KNNImputer). boxplot computes NaN quartiles for data containing NaN and
# draws an empty plot, so the missing entries are dropped for the figure only.
plt.boxplot(df['Age'].dropna(), vert=False)  # horizontal box
# Name the plotted column instead of the template placeholder 'feature'.
plt.title('Box plot of Age')
plt.xlabel('Age')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Box plot of the label-encoded Embarked column. The title and axis label name
# the column; the template placeholders 'feature'/'Feature' made this figure
# indistinguishable from the other box plots.
plt.figure(figsize=(8, 6))
plt.boxplot(df['Embarked'], vert=False)  # horizontal box
plt.title('Box plot of Embarked')
plt.xlabel('Embarked')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Box plot of Parch. The title and axis label name the column; the template
# placeholders 'feature'/'Feature' made this figure indistinguishable from the
# other box plots.
plt.figure(figsize=(8, 6))
plt.boxplot(df['Parch'], vert=False)  # horizontal box
plt.title('Box plot of Parch')
plt.xlabel('Parch')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Box plot of Fare. The title and axis label name the column; the template
# placeholders 'feature'/'Feature' made this figure indistinguishable from the
# other box plots.
plt.figure(figsize=(8, 6))
plt.boxplot(df['Fare'], vert=False)  # horizontal box
plt.title('Box plot of Fare')
plt.xlabel('Fare')
plt.show()

In [None]:
# Preview the test frame after label encoding.
test_df.head(10)

In [22]:
# Using KNNImputer to assign values to NULL values of "Age" column using other features
from sklearn.impute import KNNImputer

In [None]:
# Distribution of Age before its missing values are filled in.
sns.histplot(x=df['Age'], bins=80)
plt.title("Age Histogram Before Filling Missing")
plt.show()
plt.clf()

# Correlation of Age with every numeric column, to see which features could
# help estimate a missing age.
all_correlations = df.corr(numeric_only=True)
print(all_correlations['Age'])

In [None]:
# Dtypes and non-null counts just before imputation.
df.info()


In [23]:
# Fill the remaining missing feature values (Age) from the 5 nearest rows.
# Only the columns from 'Pclass' onward take part, so PassengerId and Survived
# neither influence the neighbour search nor get overwritten.
# NOTE(review): the imputer is fitted before the train/validation split made
# later in the notebook, so held-out rows contribute to it - confirm intended.
imputer = KNNImputer(n_neighbors=5).fit(df.loc[:, 'Pclass':])
df.loc[:, 'Pclass':] = imputer.transform(df.loc[:, 'Pclass':])

In [24]:
# Impute the test features with `imputer`, the KNNImputer fitted on the
# training features in the previous cell, instead of fitting a second imputer
# on the test set. Missing test values are then filled from training
# neighbours, i.e. with the same preprocessing the models are trained on.
test_df.loc[:, 'Pclass':] = imputer.transform(test_df.loc[:, 'Pclass':])

In [None]:
# Age and Fare are bucketed to help the model; the author observed that the raw
# Fare column lowered accuracy, so it is turned into categories as well.
def create_age_group(df):
    """Replace the ``Age`` column of ``df`` with an ordinal ``AgeGroup`` column.

    Ages are cut into seven right-closed bands - up to 5, 14, 18, 25, 30, 60
    and above 60 - numbered 1 ('Baby') through 7 ('Senior').

    ``df`` is modified in place (``AgeGroup`` added, ``Age`` removed) and is
    also returned.
    """
    band_edges = [-float('inf'), 5, 14, 18, 25, 30, 60, float('inf')]
    band_names = ['Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']
    # Ordinal code of each band: 1 for the youngest up to 7 for the oldest.
    band_codes = {name: code for code, name in enumerate(band_names, start=1)}

    named_bands = pd.cut(df['Age'], bins=band_edges, labels=band_names)
    df['AgeGroup'] = named_bands.map(band_codes)
    df.drop(columns='Age', inplace=True)
    return df
# Bucket Age in the training frame (create_age_group mutates df in place),
# then preview the result.
create_age_group(df)
df.head()

In [None]:
# Apply the same age bucketing to the test frame (mutated in place).
create_age_group(test_df)

In [None]:
# Band the test fares with the quartile edges of the *training* fares. Running
# qcut on the test fares alone derives different edges, so the same fare could
# fall into different bands in the two frames. This cell must run before the
# next one, which drops df['Fare'].
fare_edges = pd.qcut(df['Fare'], 4, retbins=True)[1]
# Open the outer edges so test fares below/above the training range still get
# the lowest/highest band instead of NaN.
fare_edges[0], fare_edges[-1] = -np.inf, np.inf
test_df['FareBand'] = pd.cut(test_df['Fare'], bins=fare_edges, labels=[1, 2, 3, 4])
test_df.drop(['Fare'], axis=1, inplace=True)
test_df.head()

In [None]:
# Split the training fares into four equally populated bands labelled 1-4 and
# drop the raw Fare column.
df['FareBand'] = pd.qcut(df['Fare'], q=4, labels=[1, 2, 3, 4])
df = df.drop(columns='Fare')
df.head()

In [None]:
# Check whether every training column is integer-typed. Columns that are not
# (the categorical AgeGroup and FareBand) are converted to integer codes via
# pd.Categorical in the following cell.
df.info()

In [29]:
# Replace the categorical band columns of the training frame with their
# zero-based integer category codes.
df = df.assign(
    AgeGroup=pd.Categorical(df['AgeGroup']).codes,
    FareBand=pd.Categorical(df['FareBand']).codes,
)

In [30]:
# Replace the categorical band columns of the test frame with their
# zero-based integer category codes.
test_df = test_df.assign(
    AgeGroup=pd.Categorical(test_df['AgeGroup']).codes,
    FareBand=pd.Categorical(test_df['FareBand']).codes,
)

In [None]:
# Summary statistics of the fully preprocessed test frame.
test_df.describe()

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the labelled rows for evaluation. The split is stratified on
# Survived, with a fixed seed so it is repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=67,
    stratify=df['Survived'],
)

In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [34]:
# Tune a decision tree with a 5-fold cross-validated grid search on the
# training split (72 parameter combinations), then score the best tree on the
# held-out split. Features are the columns from 'Pclass' onward.
dt = DecisionTreeClassifier(random_state=67)
dt_params = {
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid.fit(train.loc[:, 'Pclass':], train.loc[:, 'Survived'])

best_dt = dt_grid.best_estimator_
dt_pred = best_dt.predict(test.loc[:, 'Pclass':])
# accuracy_score returns accuracy (higher is better). The value used to be
# stored and printed as "MSE", which inverts how the number should be read.
dt_test_accuracy = accuracy_score(test.loc[:, 'Survived'], dt_pred)
print(f"Decision Tree Test Accuracy: {dt_test_accuracy}")

Decision Tree Best MSE: 0.8251748251748252


In [None]:
# Show the decision tree selected by the grid search.
best_dt

In [None]:

# Tune a random forest with a 5-fold cross-validated grid search on the
# training split, then score the best forest on the held-out split.
# The grid has 3 * 4 * 5 * 4 * 2 = 480 combinations, i.e. 2400 fits: this is
# the slowest cell of the notebook.
rf = RandomForestClassifier(random_state=67)
rf_params = {
    'n_estimators': [290,300,320],
    'max_depth': [16,17,18,19],
    'min_samples_split': [8,9, 10,11,12],
    'min_samples_leaf': [1,2,3,4],
    'criterion' : ['gini','entropy']
}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid.fit(train.loc[:, 'Pclass':], train.loc[:, 'Survived'])

best_rf = rf_grid.best_estimator_
rf_pred = best_rf.predict(test.loc[:, 'Pclass':])
# accuracy_score returns accuracy (higher is better). The value used to be
# stored and printed as "MSE", which inverts how the number should be read.
rf_test_accuracy = accuracy_score(test.loc[:, 'Survived'], rf_pred)
print(f"Random Forest Test Accuracy: {rf_test_accuracy}")

In [None]:

# Tune an XGBoost classifier with a 5-fold cross-validated grid search on the
# training split (126 parameter combinations), then score the best model on
# the held-out split.
xgboost = xgb.XGBClassifier(objective='binary:logistic', random_state=67, eval_metric='logloss')
xgboost_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3,4,5,6,7, 10, 15],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [ 0.8, 1],
}

xgb_grid = GridSearchCV(xgboost, xgboost_params, cv=5, scoring='accuracy', n_jobs=-1)
xgb_grid.fit(train.loc[:, 'Pclass':], train.loc[:, 'Survived'])

best_xgb = xgb_grid.best_estimator_
xgb_pred = best_xgb.predict(test.loc[:, 'Pclass':])
# accuracy_score returns accuracy (higher is better). The value used to be
# stored and printed as "MSE", which inverts how the number should be read.
xgb_test_accuracy = accuracy_score(test.loc[:, 'Survived'], xgb_pred)
print(f"XGBoost Test Accuracy: {xgb_test_accuracy}")


In [37]:
# Compare train and held-out accuracy of the three tuned models; a large gap
# between the two numbers indicates overfitting. Each *_accuracy variable ends
# up holding the held-out score.
X_train = train.loc[:, 'Pclass':]
y_train = train['Survived']
y_test = test['Survived']

DecisionTree_accuracy = accuracy_score(y_train, best_dt.predict(X_train))
print(f"DecisionTree Train  Best Accuracy: {DecisionTree_accuracy}")
DecisionTree_accuracy = accuracy_score(y_test, dt_pred)
print(f"DecisionTree Test  Best Accuracy: {DecisionTree_accuracy}")

rf_accuracy = accuracy_score(y_train, best_rf.predict(X_train))
print(f"Rf Train  Best Accuracy: {rf_accuracy}")
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Rf Test  Best Accuracy: {rf_accuracy}")

xgb_accuracy = accuracy_score(y_train, best_xgb.predict(X_train))
print(f"XGBoost Train  Best Accuracy: {xgb_accuracy}")
xgb_accuracy = accuracy_score(y_test, xgb_pred)
print(f"XGBoost Test  Best Accuracy: {xgb_accuracy}")

DecisionTree Train  Best Accuracy: 0.8260105448154658
DecisionTree Test  Best Accuracy: 0.8251748251748252
Rf Train  Best Accuracy: 0.8629173989455184
Rf Test  Best Accuracy: 0.8181818181818182
XGBoost Train  Best Accuracy: 0.8418277680140598
XGBoost Test  Best Accuracy: 0.8251748251748252


In [40]:
# XGBoost was judged to overfit (it was also tried as a competition
# submission), so the random forest is used for the final predictions.
# Rf Train  Best Accuracy: 0.8629173989455184
# Rf Test  Best Accuracy: 0.8181818181818182
# NOTE(review): in the recorded scores the random forest shows the larger
# train/test gap (0.863 vs 0.818) than XGBoost (0.842 vs 0.825) - confirm
# that the leaderboard result, not these numbers, drove the choice.
print(best_rf.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 18, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': 67, 'verbose': 0, 'warm_start': False}


In [39]:
# Predict survival for the competition test set with the tuned random forest
# and write the submission file (PassengerId, Survived).
rf_test_pred = best_rf.predict(test_df.loc[:, 'Pclass':])
# Build the frame directly from the two columns. Stacking them with
# np.column_stack first forces both into one common dtype; casting each column
# explicitly keeps ids and 0/1 predictions as integers in the CSV.
result = pd.DataFrame({
    'PassengerId': test_df['PassengerId'].astype(int).to_numpy(),
    'Survived': rf_test_pred.astype(int),
})
result.to_csv('output.csv', index=False)