In [59]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder,  MinMaxScaler


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [60]:
#I Data analysis

In [61]:
#1 Loading Dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
#2 Identifying Duplicates
# Count fully duplicated rows in each dataset, then report both counts.
train_duplicate_count = train_df.duplicated().sum()
test_duplicate_count = test_df.duplicated().sum()
print(f"Duplicates in Train Dataset is:{train_duplicate_count}")
print(f"Duplicates in Test Dataset is:{test_duplicate_count}")

In [None]:
#3 Checking data types 
print("Data Types of features of Training Data is:")
print(train_df.dtypes)
print("\nData types of features of Testing Data is:")
print(test_df.dtypes)

In [None]:
#4 Looking for missing values
# Null count per training column, keeping only the columns that have at least one null.
train_null_counts = train_df.isnull().sum()
train_null_counts = train_null_counts[train_null_counts > 0]

df1 = train_null_counts.to_frame().rename(columns={0: "Number of Missing values"})
# Same counts expressed as a percentage of the training rows, rounded to 2 decimals.
df1["% of Missing Values"] = round(100 * train_null_counts / len(train_df), 2)
df1

In [None]:
# Null count per test column, keeping only the columns that have at least one null.
test_null_counts = test_df.isnull().sum()
test_null_counts = test_null_counts[test_null_counts > 0]

df2 = test_null_counts.to_frame().rename(columns={0: "Number of Missing values"})
# Same counts expressed as a percentage of the test rows, rounded to 2 decimals.
df2["% of Missing Values"] = round(100 * test_null_counts / len(test_df), 2).values
df2

In [None]:
#5 Visualizing: age
# Histogram of Age, split by the Transported target.
plt.figure(figsize=(16, 6))
sns.histplot(data=train_df, x="Age", hue="Transported", palette="Set1")
plt.title("Age Feature Distribution");

In [None]:
#6 Visualizing: RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
exp_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

# One histogram per spending column on a 3x2 grid, split by Transported.
plt.figure(figsize=(14, 10))
for position, spend_col in enumerate(exp_cols, start=1):
    plt.subplot(3, 2, position)
    sns.histplot(data=train_df, x=spend_col, hue="Transported", bins=30, palette="Set1")
    plt.title(f"{spend_col} Distribution")
    # Cap the y-axis at 100 so the bars beyond the first bins stay readable.
    plt.ylim(0, 100)
# Layout only needs to be computed once, after every panel exists.
plt.tight_layout()

In [None]:
#7 Visualizing (categorical): HomePlanet, CryoSleep, Destination, VIP
cat_cols = ["HomePlanet","CryoSleep","Destination","VIP"]

# One count plot per categorical column, stacked vertically, split by Transported.
plt.figure(figsize=(12, 20))
for position, category_col in enumerate(cat_cols, start=1):
    plt.subplot(4, 1, position)
    sns.countplot(data=train_df, x=category_col, hue="Transported", palette="Set1")
    plt.title(f"{category_col} Distribution")
# Layout only needs to be computed once, after every panel exists.
plt.tight_layout()

In [69]:
#II Feature Extraction

In [70]:
#1 PassengerId--> Group_Size, Travelling_Solo
def passengerid_new_features(df):
    """Derive group-based features from ``PassengerId`` and add them to ``df`` in place.

    Assumes every ``PassengerId`` is a string of the form ``"<group>_<member>"``.

    Columns added (in this order):
        Group           -- text before the first underscore.
        Member          -- text between the first and second underscore.
        Travelling_Solo -- True when the passenger's group has a single member.
        Group_Size      -- number of rows in ``df`` sharing the passenger's Group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``PassengerId`` column. Modified in place; nothing is returned.
    """
    # Split every id once instead of running two Python-level lambdas per row.
    id_parts = df["PassengerId"].str.split("_", expand=True)
    df["Group"] = id_parts[0]
    df["Member"] = id_parts[1]

    # transform("count") broadcasts each group's member count back onto its rows,
    # replacing a per-group boolean-mask assignment loop (O(rows * groups)).
    group_size = df.groupby("Group")["Member"].transform("count")

    df["Travelling_Solo"] = group_size <= 1
    df["Group_Size"] = group_size

In [71]:
# Add the PassengerId-derived columns to both datasets (each is modified in place).
for frame in (train_df, test_df):
    passengerid_new_features(frame)

In [72]:
# Dropping Group & Member feature
# Remove the two helper columns from both datasets in place.
for frame in (train_df, test_df):
    frame.drop(columns=["Group", "Member"], inplace=True)

In [None]:
#Visualizing: Group_Size, Travelling_Solo 
# Two side-by-side count plots, each split by the Transported target.
fig, (size_ax, solo_ax) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(data=train_df, x="Group_Size", hue="Transported", palette="Set1", ax=size_ax)
size_ax.set_title("Group_Size vs Transported")

sns.countplot(data=train_df, x="Travelling_Solo", hue="Transported", palette="Set1", ax=solo_ax)
solo_ax.set_title("Travelling Solo vs Transported")

fig.tight_layout()
plt.show()

In [74]:
#2 Cabin-->Cabin_Deck, Cabin_Number, Cabin_Side 
def cabin_new_feature(df):
    """Split ``Cabin`` into deck / number / side columns and fill their gaps, in place.

    Assumes ``Cabin`` values look like ``"<deck>/<number>/<side>"``; rows with a
    missing ``Cabin`` get missing values in all three derived columns before filling.

    Columns added:
        Cabin_Deck   -- first ``/``-separated part; gaps filled with the column mode.
        Cabin_Number -- second part converted to numeric (unparseable -> NaN);
                        gaps filled with the column median.
        Cabin_Side   -- third part; gaps filled with the column mode.

    The ``Cabin`` column itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Cabin`` column. Modified in place; nothing is returned.
    """
    # expand=True yields one column per part; missing cabins propagate as NaN, so no
    # placeholder string is needed. reindex guarantees columns 0..2 exist even when
    # no row has all three parts.
    parts = df["Cabin"].str.split("/", expand=True).reindex(columns=range(3))

    df["Cabin_Deck"] = parts[0]
    df["Cabin_Number"] = pd.to_numeric(parts[1], errors="coerce")
    df["Cabin_Side"] = parts[2]

    # fillna returns a new Series, so the result must be assigned back; calling it
    # without assignment leaves the column unchanged.
    for col in ("Cabin_Deck", "Cabin_Side"):
        modes = df[col].mode()
        if not modes.empty:  # an all-missing column has no mode to fill with
            df[col] = df[col].fillna(modes[0])

    df["Cabin_Number"] = df["Cabin_Number"].fillna(df["Cabin_Number"].median())

In [75]:
# Add the Cabin-derived columns to both datasets (each is modified in place).
for frame in (train_df, test_df):
    cabin_new_feature(frame)

In [None]:
#Visualizing: Cabin_Deck, Cabin_Side
# Two side-by-side count plots, each split by the Transported target.
fig, (deck_ax, side_ax) = plt.subplots(1, 2, figsize=(15, 6))

sns.countplot(data=train_df, x="Cabin_Deck", hue="Transported", palette="Set1", ax=deck_ax)
deck_ax.set_title("Cabin_Deck Distribution")

sns.countplot(data=train_df, x="Cabin_Side", hue="Transported", palette="Set1", ax=side_ax)
side_ax.set_title("Cabin_Side Distribution")

fig.tight_layout()
plt.show()

In [77]:
#REVISIT 1
# Cabin_Number -> integer dtype.
# The median fill has to happen before the cast because NaN cannot be converted to
# an integer dtype. Each dataset is filled with its own median.
train_cabin_median = train_df["Cabin_Number"].median()
train_df["Cabin_Number"] = train_df["Cabin_Number"].fillna(train_cabin_median).astype(int)

test_cabin_median = test_df["Cabin_Number"].median()
test_df["Cabin_Number"] = test_df["Cabin_Number"].fillna(test_cabin_median).astype(int)

In [None]:
# some analysis on Cabin_Number
# Summary statistics of the training Cabin_Number column.
cabin_number = train_df["Cabin_Number"]
print("Total Unique values present in Cabin_Number feature is:", cabin_number.nunique())
print("The Mean of Cabin_Number Feature is: ", cabin_number.mean())
print("The Median of Cabin_Number Feature is:", cabin_number.median())
print("The Minimum value of Cabin_Number feature is:", cabin_number.min())
print("The Maximum value of Cabin_number Feature is:", cabin_number.max())

In [None]:
#Visualizing: Cabin_Number
# Histogram of Cabin_Number split by Transported, with black guide lines every 300
# cabins (300 .. 1500) drawn from y=0 to y=550.
plt.figure(figsize=(15, 5))
sns.histplot(data=train_df, x="Cabin_Number", hue="Transported", palette="Set1")
plt.title("Cabin_Number Distribution")
plt.xticks(list(range(0, 1900, 300)))
plt.vlines([300, 600, 900, 1200, 1500], ymin=0, ymax=550, color="black")
plt.show()

In [80]:
#3 Cabin_Number-->Cabin_Regions
def cabin_regions(df):
    """Add six boolean ``Cabin_Region<k>`` columns to ``df`` in place.

    Regions are half-open intervals of ``Cabin_Number``:
    Region1 ``< 300``, Region2 ``[300, 600)``, Region3 ``[600, 900)``,
    Region4 ``[900, 1200)``, Region5 ``[1200, 1500)``, Region6 ``>= 1500``.
    """
    number = df["Cabin_Number"]
    edges = [300, 600, 900, 1200, 1500]

    # Open-ended first region.
    df["Cabin_Region1"] = number.lt(edges[0])

    # Interior regions 2..5: lower edge inclusive, upper edge exclusive.
    for region_id, (lower, upper) in enumerate(zip(edges[:-1], edges[1:]), start=2):
        df[f"Cabin_Region{region_id}"] = number.ge(lower) & number.lt(upper)

    # Open-ended last region.
    df["Cabin_Region6"] = number.ge(edges[-1])

In [81]:
# Add the Cabin_Region flags to both datasets (each is modified in place).
for frame in (train_df, test_df):
    cabin_regions(frame)

In [82]:
# Dropping Cabin_Number Feature
# Remove the column from both datasets in place.
for frame in (train_df, test_df):
    frame.drop(columns=["Cabin_Number"], inplace=True)

In [None]:
#Visualizing: Cabin_Regions
cols = ["Cabin_Region1","Cabin_Region2","Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6"]

# One count plot per region flag on a 4x2 grid, split by Transported.
plt.figure(figsize=(20, 25))
for position, region_col in enumerate(cols, start=1):
    plt.subplot(4, 2, position)
    sns.countplot(data=train_df, x=region_col, hue="Transported", palette="Set2")
    plt.title(f"{region_col} Distribution")
# Layout only needs to be computed once, after every panel exists.
plt.tight_layout()

In [84]:
#4 Age-->Age_Group
def age_group(df):
    """Bucket ``Age`` into labelled bands and store them in ``df["Age Group"]`` in place.

    Bands use inclusive upper bounds: ``<=12``, ``<=18``, ``<=25``, ``<=32``,
    ``<=50``, then everything above 50. An age that satisfies none of the
    comparisons (e.g. NaN) is mapped to ``np.nan``.
    """
    # (inclusive upper bound, label), checked in ascending order.
    bands = [
        (12, "Age_0-12"),
        (18, "Age_13-18"),
        (25, "Age_19-25"),
        (32, "Age_26-32"),
        (50, "Age_33_50"),
    ]

    def label_for(age):
        for upper_bound, label in bands:
            if age <= upper_bound:
                return label
        # Every comparison with NaN is False, so NaN falls through to np.nan here.
        return "age_50+" if age > 50 else np.nan

    df["Age Group"] = [label_for(age) for age in df["Age"]]

In [85]:
# Add the "Age Group" column to both datasets (each is modified in place).
for frame in (train_df, test_df):
    age_group(frame)

In [None]:
#Visualizing: Age_Group
# Non-missing age-group labels in sorted order, used to fix the x-axis ordering.
order = sorted(train_df["Age Group"].dropna().unique())

plt.figure(figsize=(14, 6))
sns.countplot(data=train_df, x="Age Group", hue="Transported", palette="Set2", order=order)
plt.title("Age Group Distribution")

In [87]:
#5 RoomService, FoodCourt, ShoppingMall, Spa, VRDeck --> Total Expenditure
exp_cols = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]

def new_exp_features(df):
    """Add ``Total Expenditure`` and ``No Spending`` columns to ``df`` in place.

    ``Total Expenditure`` is the row-wise sum over ``exp_cols``; pandas skips
    missing values in that sum, so a row with only missing amounts totals 0.
    ``No Spending`` is True where the total equals 0.
    """
    row_total = df.loc[:, exp_cols].sum(axis="columns")
    df["Total Expenditure"] = row_total
    df["No Spending"] = row_total.eq(0)

In [88]:
# Add the expenditure totals to both datasets (each is modified in place).
for frame in (train_df, test_df):
    new_exp_features(frame)

In [None]:
#Visualizing: Total Expenditure
# Histogram of Total Expenditure split by Transported; both axes are clipped
# (y to 200, x to 10000) to zoom in on that window.
plt.figure(figsize=(15, 6))
sns.histplot(data=train_df, x="Total Expenditure", hue="Transported", palette="Set1", bins=200)
plt.xlim(0, 10000)
plt.ylim(0, 200)
plt.title("Total Expenditure Distribution")

In [None]:
# some analysis on Total Expenditure
# The mean is rounded to the nearest integer; the median is reported as computed.
# NOTE(review): `mean` and `median` are notebook-level names; they are not read by
# any cell visible here.
mean = round(train_df["Total Expenditure"].mean())
median = train_df["Total Expenditure"].median()

print("Mean value of Total Expenditure feature is = ",mean)
print("Median value of Total Expenditure feature is = ",median)

In [91]:
#6 Total Expenditure --> Expenditure Category
def expenditure_category(df, low_max=716, medium_max=1441):
    """Label each row's ``Total Expenditure`` and store it in ``df["Expenditure Category"]``.

    Labels:
        "No Expense"     -- total == 0
        "Low Expense"    -- 0 < total <= low_max
        "Medium Expense" -- low_max < total <= medium_max
        "High Expense"   -- total > medium_max
        NaN              -- anything else (missing or negative totals)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Total Expenditure`` column. Modified in place; nothing is returned.
    low_max : float, default 716
        Inclusive upper bound of the "Low Expense" band.
    medium_max : float, default 1441
        Inclusive upper bound of the "Medium Expense" band.
    """
    def label_for(total):
        if total == 0:
            return "No Expense"
        if 0 < total <= low_max:
            return "Low Expense"
        if low_max < total <= medium_max:
            return "Medium Expense"
        if total > medium_max:
            return "High Expense"
        # Always emit a value so the result has exactly one label per row; skipping
        # unmatched rows would make the column assignment fail on length.
        return np.nan

    df["Expenditure Category"] = [label_for(total) for total in df["Total Expenditure"]]

In [92]:
# Add the "Expenditure Category" column to both datasets (each is modified in place).
for frame in (train_df, test_df):
    expenditure_category(frame)

In [None]:
#Visualizing: No Spending, Expenditure Category
cols = ["No Spending", "Expenditure Category"]

# Two side-by-side count plots, each split by Transported.
plt.figure(figsize=(18, 6))
for position, spending_col in enumerate(cols, start=1):
    plt.subplot(1, 2, position)
    sns.countplot(data=train_df, x=spending_col, hue="Transported", palette="Set2")
    plt.title(f"{spending_col} Distribution")
# Layout only needs to be computed once, after every panel exists.
plt.tight_layout()

In [94]:
#III Missing values

In [None]:
train_df.describe().T

In [96]:
#pass_df = test_df[["PassengerId"]]

In [97]:
# Columns removed from both datasets before imputation and encoding.
cols = ["PassengerId","Cabin","Name"]
for frame in (train_df, test_df):
    frame.drop(columns=cols, inplace=True)

In [98]:
# Categorical features: object and bool columns, excluding the target.
cat_cols = train_df.select_dtypes(include=["object","bool"]).columns.tolist()
cat_cols.remove("Transported")
# Numerical features: "number" matches every integer and float width, whereas the
# "int" alias resolves to the platform default integer and can miss other widths.
# Bool columns are not matched by "number".
num_cols = train_df.select_dtypes(include="number").columns.tolist()

In [None]:
print("Categorical Columns:",cat_cols)
print("\nNumerical Columns:",num_cols)

In [100]:
def fill_missingno(df):
    """Impute missing values in ``df`` in place.

    Columns in ``cat_cols`` are filled with their most frequent value and columns in
    ``num_cols`` with their median. Both imputers are fit on ``df`` itself, so every
    frame passed in is filled from its own statistics.
    """
    # (column list, SimpleImputer strategy): categorical first, then numerical.
    plan = (
        (cat_cols, "most_frequent"),
        (num_cols, "median"),
    )
    for columns, strategy in plan:
        df[columns] = SimpleImputer(strategy=strategy).fit_transform(df[columns])

In [101]:
# Impute both datasets; each one is filled from its own statistics.
for frame in (train_df, test_df):
    fill_missingno(frame)

In [None]:
# Display the number of missing values for each column
missing_values = train_df.isnull().sum()
print("Number of missing values in each column:")
print(missing_values)

In [None]:
#Checking for duplicate values
# Report only the counts. Printing train_df.duplicated() itself emits one boolean
# per row under a label that promises a single number.
print("Duplicate values in training data is: ",train_df.duplicated().sum())
print("Duplicate values in testing data is: ",test_df.duplicated().sum())

In [None]:
train_df.dtypes

In [105]:
# Flag columns that are cast to bool in both datasets.
cols = [
    "CryoSleep", "VIP", "Travelling_Solo", "No Spending",
    "Cabin_Region1", "Cabin_Region2", "Cabin_Region3",
    "Cabin_Region4", "Cabin_Region5", "Cabin_Region6",
]

for frame in (train_df, test_df):
    frame[cols] = frame[cols].astype(bool)

In [106]:
nominal_cat_cols = ["HomePlanet","Destination"]
ordinal_cat_cols = ["CryoSleep","VIP","Travelling_Solo","Cabin_Deck","Cabin_Side","Cabin_Region1","Cabin_Region2",
                    "Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6","Age Group","No Spending",
                    "Expenditure Category"]

In [107]:
#IV Feature Encoding

In [108]:
enc = LabelEncoder()

In [109]:
# Label-encode each column with ONE encoder shared by train and test.
# Fitting a separate encoder per dataset assigns codes from that dataset's own
# sorted categories, so the same category can receive different integers in train
# and test whenever their category sets differ. Fitting on the union of both
# datasets' values keeps the mapping identical and cannot hit an unseen label.
for col in ordinal_cat_cols:
    col_encoder = LabelEncoder().fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = col_encoder.transform(train_df[col])
    test_df[col] = col_encoder.transform(test_df[col])

In [110]:
# One-hot encode the nominal columns of both datasets.
train_df = pd.get_dummies(train_df,columns=nominal_cat_cols)
test_df = pd.get_dummies(test_df,columns=nominal_cat_cols)

# get_dummies only creates columns for categories present in the frame it is given,
# so train and test can end up with different column sets. Align test to the training
# feature columns (everything except the target): dummies missing from test are added
# as all-zero columns, test-only dummies are dropped, and column order matches train.
test_df = test_df.reindex(columns=train_df.columns.drop("Transported", errors="ignore"), fill_value=0)

In [None]:
# Encode the target as 0/1. Casting True/False to int gives the same mapping as
# replacing {False: 0, True: 1}, without relying on replace() to change the dtype.
# NOTE(review): assumes Transported holds only True/False at this point - confirm.
train_df["Transported"] = train_df["Transported"].astype(int)


In [None]:
train_df.head()

In [None]:
test_df.head()

In [115]:
# Feature matrix: every column except the target.
X = train_df.drop(columns=["Transported"])
# Target as a 1-D Series. A single-column DataFrame has shape (n, 1), which
# scikit-learn estimators flag as a column vector where a 1-D array is expected.
y = train_df["Transported"]

In [116]:
#V Feature scaling

In [117]:
scaler = StandardScaler()

In [118]:
# Learn the scaling statistics (mean / std per column) from the training features.
X_scaled = scaler.fit_transform(X)
# Apply those SAME statistics to the test set. Calling fit_transform here would refit
# the scaler on the test data and put it on a different scale than the data the
# models are trained on. The reindex guarantees the test columns match X in name and
# order, filling any column absent from test with 0.
test_df_scaled = scaler.transform(test_df.reindex(columns=X.columns, fill_value=0))

In [119]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
print(x_train.shape, y_train.shape)
print(x_test.shape,y_test.shape)

In [121]:
# Scaled split for the scale-sensitive models.
# Split the unscaled features with the same arguments and seed as the unscaled split,
# then fit a scaler on the training part only, so the held-out rows do not contribute
# to the scaling statistics they are later evaluated with.
x_train1, x_test1, y_train1, y_test1 = train_test_split(X,y,test_size=0.2,random_state=0)
split_scaler = StandardScaler().fit(x_train1)
x_train1 = split_scaler.transform(x_train1)
x_test1 = split_scaler.transform(x_test1)

In [None]:
#VI Model Selection


#Hyperparameter tuning, 

In [122]:
# Accuracy (in percent) of every evaluated model, appended in call order.
training_score = []
testing_score = []
def model_prediction(model):
    """Fit ``model`` on the scaled training split and record its accuracy.

    Fits on ``x_train1`` / ``y_train1``, predicts on ``x_train1`` and ``x_test1``,
    appends both accuracies (as percentages) to the module-level ``training_score``
    and ``testing_score`` lists, and prints them.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # Flatten the target to 1-D: a single-column (n, 1) target makes several
    # scikit-learn estimators emit a column-vector warning. A target that is
    # already 1-D is unaffected.
    model.fit(x_train1, np.ravel(y_train1))
    train_pred = model.predict(x_train1)
    test_pred = model.predict(x_test1)
    a = accuracy_score(y_train1, train_pred)*100
    b = accuracy_score(y_test1, test_pred)*100
    training_score.append(a)
    testing_score.append(b)
    
    print(f"Accuracy_Score of {model} model on Training Data is:",a)
    print(f"Accuracy_Score of {model} model on Testing Data is:",b)

In [None]:
model_prediction(LogisticRegression())

In [None]:
model_prediction(KNeighborsClassifier())

In [125]:
def model_prediction(model):
    """Fit ``model`` on the UNSCALED training split and record its accuracy.

    This redefines ``model_prediction``: from this cell onward, calls use
    ``x_train`` / ``x_test`` / ``y_train`` / ``y_test`` instead of the scaled split
    used by the earlier definition. Accuracies (as percentages) are appended to the
    shared module-level ``training_score`` and ``testing_score`` lists and printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # Flatten the target to 1-D: a single-column (n, 1) target makes several
    # scikit-learn estimators emit a column-vector warning. A target that is
    # already 1-D is unaffected.
    model.fit(x_train, np.ravel(y_train))
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)
    a = accuracy_score(y_train, train_pred)*100
    b = accuracy_score(y_test, test_pred)*100
    training_score.append(a)
    testing_score.append(b)
    
    print(f"Accuracy_Score of {model} model on Training Data is:",a)
    print(f"Accuracy_Score of {model} model on Testing Data is:",b)

In [None]:
model_prediction(DecisionTreeClassifier())

In [None]:
model_prediction(RandomForestClassifier())

In [None]:
model_prediction(XGBClassifier())

In [132]:
models = ["Logistic Regression","KNN","Decision Tree","Random Forest","XGBoost"]

In [None]:
# Results table: one row per algorithm with its training and testing accuracy.
# The three inputs are matched purely by position, so `models` must list the
# algorithms in the same order in which model_prediction was called, and the score
# lists must hold exactly one entry per model (re-running a model cell appends
# another entry and makes the lengths differ).
df = pd.DataFrame({"Algorithms":models,
                   "Training Score":training_score,
                   "Testing Score":testing_score})
df

In [None]:
df.plot(x="Algorithms",y=["Training Score","Testing Score"], figsize=(16,6),kind="bar",
        title="Performance Visualization of Different Models",colormap="Set1")
plt.show()