# Importing Libraries 

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px # for visualization



# Information about Data set 

**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

**Destination** - The planet the passenger will be debarking to.

**Age** - The age of the passenger.

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

**RoomService**, **FoodCourt**, **ShoppingMall**, **Spa**, **VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

**Name**- The first and last names of the passenger.

**Transported**- Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.




In [None]:
# The observations in this notebook reflect only my own understanding of the data.
# Feel free to point out anything I missed or to advise me on what I got wrong.
# Please upvote if you find it useful - it gives me the confidence to work harder and better.

# Loading and preview of our dataset

In [None]:
# Read the train and test splits from the relative ../input directory.
train_csv = "../input/spaceship-titanic/train.csv"
test_csv = "../input/spaceship-titanic/test.csv"
titanic, titanic_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# Preview the first ten training rows (shown as the cell output).
titanic.head(10)

In [None]:
# Print a summary of the training frame: columns, non-null counts and dtypes.
titanic.info()

# Cardinality in our columns (1)

In [None]:
# Bar chart of the number of distinct values in each column.
titanic.nunique().plot(kind="bar", title="Cardinality in our columns")

## Duplicates

In [None]:
# Count fully duplicated rows once per split, then report the count and its share.
train_dupes = titanic.duplicated().sum()
test_dupes = titanic_test.duplicated().sum()
train_share = np.round(100 * train_dupes / len(titanic), 1)
test_share = np.round(100 * test_dupes / len(titanic_test), 1)

print(f'Duplicates in train set: {train_dupes}, ({train_share}%)')
print('')
print(f'Duplicates in test set: {test_dupes}, ({test_share}%)')

## Observations
* 8693 rows (observations) and 14 columns (13 features + 1 target variable)
* Data types - int, float, obj
* Notice - some features pack multiple pieces of information into one column,
  and these also have the greatest cardinality
* We also have some missing values
* We have binary columns - Transported, CryoSleep, VIP
* High cardinality in columns

# Handling Mixed Variables

### Our PassengerId has two pieces of information in one column; we need to separate them

In [None]:
# PassengerId is split on "_" into the passenger's group and the passenger's
# number within that group; each part gets its own column.
id_parts = titanic["PassengerId"].str.split("_", expand=True)
titanic["Passenger_group"] = id_parts[0]
titanic["Passennger_no._in_group"] = id_parts[1]

# The original column is redundant once both parts have been extracted.
titanic.drop("PassengerId", axis=1, inplace=True)
titanic.head()

In [None]:
# the logic is the same for the features below as well

### The Cabin column has three pieces of information in one column. Let's separate them

In [None]:
# Cabin is split on "/" into three parts, stored as deck, number and side.
cabin_parts = titanic["Cabin"].str.split("/", expand=True)
for position, column_name in enumerate(("Cabin_deck", "Cabin_no.", "Cabin_side")):
    titanic[column_name] = cabin_parts[position]

# The combined column is no longer needed.
titanic.drop("Cabin", axis=1, inplace=True)
titanic.head()

### The Name column also has two pieces of information in it

In [None]:
# Split the name on the first space only (n=1): a name with more than two
# words then still yields exactly two columns, instead of producing extra
# columns that make the two-column assignment fail.
titanic[["firstname","last_name"]]=titanic["Name"].str.split(" ",n=1,expand=True)

# The combined column is no longer needed once both parts are extracted.
titanic.drop(columns=["Name"],inplace=True)
titanic.head()


In [None]:
# Re-count the distinct values per column now that the mixed columns are split.
titanic.nunique() #unique values in columns

# Cardinality in our columns (2)

In [None]:
# Same cardinality bar chart as before, redrawn for the reshaped columns.
# The trailing semicolon suppresses the textual repr of the returned object.
titanic.nunique().plot(kind="bar", title="Cardinality in our columns");

## Observation
 * Notice how the cardinality has decreased in some columns
 * Our columns have changed; we can work better with these ones

# Quantifying missing values

In [None]:


# Share of missing values per column as a percentage, largest first.
# The result has two columns: "index" (column name) and "Average" (percent).
missing = (
    titanic.isna()
    .mean()
    .sort_values(ascending=False)
    .mul(100)
    .rename("Average")
    .reset_index()
)
missing.head()

# Styling for the per-column chart, applied in two steps (theme, then axes).
theme_style = dict(
    font_color="white",
    font_size=12,
    title_font_color="cyan",
    legend_title_font_color="white",
    legend_title_font_size=20,
    template="plotly_dark",
    title_font_size=30,
)
axis_style = dict(
    xaxis_title="<b>Amount in %age",
    xaxis_title_font_size=20,
    yaxis_title="<b>Column-Name",
    yaxis_title_font_size=20,
    title_x=0.5,
)

fig = px.histogram(
    missing,
    x="Average",
    y="index",
    title="<b>% of Missing values",
    color="index",
    labels={"Average": "%age of missing values", "index": "Column Names"},
)
fig.update_layout(**theme_style)
fig.update_layout(**axis_style)
fig.show()

# Heatmap of the null mask: one row per column, one cell per passenger.
null_mask = titanic.isna().T
fig = px.imshow(
    null_mask,
    color_continuous_scale=px.colors.sequential.Electric,
    title="<b>Missing values in our data",
)
fig.update_layout(template="plotly_dark", title_font_size=30, title_x=0.5)
fig.show()

# Observation
* **Not a lot of data is missing in any particular feature**
* **All features have missing values in the range 2-2.5%** (except Passenger group and no. in group - no missing values)
* **Our target feature has no missing values in it**

In [None]:
# Percentage of rows that would remain if every row containing a missing
# value were dropped.
len(titanic.dropna()) / len(titanic) * 100

# After dropping missing values we would be left with 76% of the data.
# That is a lot of data to lose,
# so instead of dropping rows I think we should impute the missing values.

# Handling binary features (Transported,Cryosleep,VIP)

In [None]:
# Encode the target as integers: True -> 1, anything else -> 0.
is_transported = titanic["Transported"].eq(True)
titanic["Transported"] = np.where(is_transported, 1, 0)


# CryoSleep and VIP are left untouched for now; the same encoding
# (keeping NaN as NaN) is sketched below for later use.
# titanic["CryoSleep"]=np.where(titanic["CryoSleep"]==True,1,np.where(titanic["CryoSleep"]==False,0,titanic["CryoSleep"])) # replacing true-1,false-0

# titanic["VIP"]=np.where(titanic["VIP"]==True,1,np.where(titanic["VIP"]==False,0,titanic["VIP"])) # replacing true-1,false-0 and nan


In [None]:
# Re-inspect the dtypes after encoding the target.
titanic.info()
# The Transported dtype has changed from bool to an integer dtype
# (np.where(..., 1, 0) produces integers, not objects).


## Now take a look

In [None]:
# Preview the first rows of the frame after the transformations so far.
titanic.head()

# Identifying the Data types of features 

## (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck )-Numerical
### Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities

In [None]:
# Numerical features: the passenger's age plus the five amenity bills.
amenity_cols = ["RoomService", "Spa", "VRDeck", "ShoppingMall", "FoodCourt"]
numerical_cols = ["Age", *amenity_cols]
numerical_cols

In [None]:
# Every column that is not listed as numerical is treated as categorical
# (the original column order is preserved).
is_categorical = ~titanic.columns.isin(numerical_cols)
categorical_cols = titanic.columns[is_categorical].tolist()
categorical_cols

# Distribution Of Categorical features

In [None]:
# Columns excluded from the pie charts.
skipped_features = {'Passenger_group', 'firstname', 'last_name', 'Cabin_no.'}

for feature in categorical_cols:
    if feature in skipped_features:
        continue

    # Name both columns explicitly. A bare value_counts().reset_index() yields
    # ("index", <feature>) on pandas 1.x but (<feature>, "count") on pandas 2.x,
    # which made the hard-coded label mapping mislabel the category column.
    df = titanic[feature].value_counts().rename_axis(feature).reset_index(name="Count")

    fig = px.pie(df, values="Count", names=feature,
                 labels={feature: "<b>" + feature},
                 color_discrete_sequence=px.colors.qualitative.Pastel)

    fig.update_layout(title="<b>" + feature, title_font_size=30,
                      font_size=20,
                      title_x=0.5, legend_bordercolor="#000", legend_borderwidth=2,
                      hoverlabel_font_size=20)
    # Black outline around every slice.
    fig.update_traces(marker=dict(line=dict(color='#000000', width=2)))
    fig.show()

    

# Observation
 * **Homeplanet** - most people were from **Earth**, followed by **Europa** and **Mars**
 * **Cryosleep** - most people were not in **CryoSleep**
 * **Destination** - most people were debarking to **TRAPPIST-1e**, followed by **55 Cancri e** and **PSO**
 * **Vip** - the number of **non-VIP** people is quite large compared to **VIP** people
 * **Transported** - the number of non-transported people is greater than the number of transported people
 * **No.inGroup** - the majority of passengers were travelling alone in their group
 
 * **Cabin Name** - the majority of people were in the G and F cabin decks
 * **Cabin side** - the number of people on both sides is almost equal

In [None]:
# Categorical features vs target

# Columns left out of the comparison, including the target itself.
not_compared = ['Passenger_group', 'firstname', 'last_name', 'Cabin_no.', "Transported"]
versus_layout = dict(
    title_font_size=30,
    font_size=20,
    title_x=0.5,
    hoverlabel_font_size=20,
    template="plotly_dark",
)

for feature in categorical_cols:
    if feature in not_compared:
        continue

    # One histogram panel per Transported value, coloured by Transported.
    fig = px.histogram(
        titanic,
        x=feature,
        facet_col="Transported",
        color="Transported",
        color_discrete_sequence=px.colors.qualitative.Alphabet_r,
    )
    fig.update_layout(title="<b>" + feature + " vs Transported", **versus_layout)
    fig.show()

# Observation 
* **Homeplanet** - **Europa and Mars** have a higher share of transported people than **Earth**; even though Earth has the larger number of people transported, it has the lower transport %
* **Cryosleep** - **cryosleep people** show a better transport percentage than non-cryosleep people (so if you are in cryosleep you are most likely to be transported)
* **Destination** - **TRAPPIST-1e** has the greatest number of transported people, but if we look at which has the better % of transported people, I would say it is **55 Cancri e**
* **Vip** - a **non-VIP person** has an equal chance of being on either side (transported or not transported), but a **VIP** has more than half a chance of not being transported
* **Cabin** - decks **B** and **C** have a higher chance of getting transported; **E** has a higher chance of not getting transported
* **Cabin Side** - both cabin sides have an equal chance of being on either side (transported or not transported)

# Distribution in our Numerical Features

In [None]:
# Layout shared by both charts of every numerical feature. The original
# applied the same settings through two overlapping update_layout calls.
numeric_layout = dict(
    hovermode='x',
    title_font_color="#ffff00",
    template="plotly_dark",
    title_font_size=30,
    hoverlabel_font_size=20,
    title_x=0.5,
)

for feature in numerical_cols:
    # Violin plot of the feature, one violin per Transported value.
    fig = px.violin(titanic, x=feature, color="Transported",
                    title="<b>" + feature + " Distribution", template="plotly_dark")
    fig.update_layout(**numeric_layout)
    fig.show()

    # Histogram of the same feature, coloured by Transported.
    # The title now has a space before "Vs" (it used to render as e.g. "AgeVs Transported").
    fig = px.histogram(titanic, x=feature, title="<b>" + feature + " Vs Transported",
                       color="Transported", template="plotly_dark")
    fig.update_layout(**numeric_layout)
    fig.show()

# Observation
* **Skewed data (right)**
* **All of them have outliers**
* **Age shows a similar distribution for both classes, except between ages 0-4, where far more people were transported than not transported.** Why is the age zero for some passengers?
* 

# Correlation matrix

In [None]:
# Correlate the numeric columns only. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on the string
# columns (HomePlanet, Destination, ...), so select the numeric ones explicitly.
px.imshow(titanic.select_dtypes(include="number").corr().round(3),text_auto=True)

In [None]:
# Pairwise scatter plots of the numerical features plus the target,
# with points coloured by the target.
pairplot_cols = [*numerical_cols, "Transported"]
px.scatter_matrix(titanic[pairplot_cols], color="Transported", height=800)

In [None]:
# Parallel-coordinates view of the frame, with lines coloured by the target.
fig = px.parallel_coordinates(
    titanic,
    title="<b>Multivariate plot for Numerical Data",
    color="Transported",
)
# Centre and enlarge the title; the returned figure is the cell output.
fig.update_layout(title_x=0.5, title_font_size=30)

# Observation
* **Can't see a very strong relationship between any of the numerical features**

# Not completed - still a lot to explore

# Cat. vs Cat. columns with Target

### This heatmap shows the % of people transported for a particular deck and a particular side

In [None]:
# Styling shared by both deck/side heatmaps.
deck_side_layout = dict(
    font_size=15,
    font_color="#ffcce6",
    title_font_size=30,
    title_font_color="Orange",
    template="plotly_dark",
)

# Passenger counts per cabin deck and side (sides as rows after transposing).
deck_side_counts = pd.crosstab(
    index=titanic["Cabin_deck"],
    columns=titanic["Cabin_side"],
).T
fig = px.imshow(
    deck_side_counts,
    text_auto=True,
    title="No. of people Travelling in (Cabin_DecK,Cabin_side)",
    labels={"color": "<b>No.of people"},
    color_continuous_scale=px.colors.sequential.haline_r,
)
fig.update_layout(**deck_side_layout)
fig.show()

# Mean of the Transported column per cabin deck and side, rounded to 3 decimals.
deck_side_rate = pd.crosstab(
    index=titanic["Cabin_deck"],
    columns=titanic["Cabin_side"],
    values=titanic["Transported"],
    aggfunc="mean",
).round(3).T
fig = px.imshow(
    deck_side_rate,
    text_auto=True,
    title="% of people Transported(1) in (Cabin_DecK,Cabin_side)",
    labels={"color": "<b>%Transpoted"},
    color_continuous_scale=px.colors.sequential.haline_r,
)
fig.update_layout(**deck_side_layout)
fig.show()

# Observation
There is not much difference between these categories; I don't think there is much useful information here

In [None]:
# Passenger counts per home planet and cryosleep status (status as rows).
fig=px.imshow(
    pd.crosstab(titanic["HomePlanet"],titanic["CryoSleep"]).T
    ,text_auto=True,title="<b>No. of people Travelling in (HomePlanet,Cryosleep)",
    labels={"color":"<b>No.of people"},color_continuous_scale=px.colors.sequential.haline_r)
fig.update_layout(font_size=15,font_color="#ffcce6",
                 title_font_size=30,title_font_color="Orange",template="plotly_dark")
fig.show()

# Mean of the Transported column per home planet and cryosleep status.
# The title used to be a copy of the count heatmap's title ("No. of people
# Travelling ..."), although this chart shows the transported rate.
fig=px.imshow(
    pd.crosstab(titanic["HomePlanet"],titanic["CryoSleep"],titanic["Transported"],aggfunc="mean").round(3).T
    ,text_auto=True,title="<b>% of people Transported(1) in (HomePlanet,Cryosleep)",
    labels={"color":"<b>%Transpoted"},color_continuous_scale=px.colors.sequential.haline_r)
fig.update_layout(font_size=15,font_color="#ffcce6",
                 title_font_size=30,title_font_color="Orange",template="plotly_dark")
fig.show()

In [None]:
# Cross-check of the heatmap above: mean of Transported for passengers from
# Europa whose CryoSleep value is False.
europa_awake = titanic["HomePlanet"].eq("Europa") & titanic["CryoSleep"].eq(False)
titanic.loc[europa_awake, "Transported"].mean()

# Observation
* **Europa** - **cryosleep people have a 99% chance** of being transported, and **non-cryosleep people have only a 40% chance**

* **Mars** - cryosleep people have a 91% chance and non-cryosleep people have only 27%

In [None]:
# Parallel-categories view of the frame, with ribbons coloured by the target.
fig = px.parallel_categories(
    titanic,
    color="Transported",
    title="<b>Multivarite Plot For Categorical data",
    width=1000,
    height=600,
)
# Centre and enlarge the title; the returned figure is the cell output.
fig.update_layout(title_x=0.5, title_font_size=30)