# **IMPORTING LIBRARIES**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **LOADING THE DATASET**

In [None]:
# Read the training and test CSV files from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
# Keep the test PassengerId column aside; it is dropped from df_test during preprocessing.
test_ids = df_test["PassengerId"]
# Display the training frame.
df

In [None]:
# Preview the first rows of the training data.
df.head()

# **EDA AND VISUALIZATION**

In [None]:
# Summary statistics of the training data.
df.describe()

In [None]:
# Column dtypes of the training data.
df.dtypes

In [None]:
# Column dtypes of the test data.
df_test.dtypes

In [None]:
# Count of passengers per Survived value, split by Pclass.
sns.countplot(x='Survived', data = df, hue ='Pclass')

In [None]:
# Age distribution with a KDE overlay. histplot replaces the deprecated
# sns.distplot; stat='density' keeps the density-scaled y-axis distplot used.
sns.histplot(df['Age'], kde=True, stat='density')

In [None]:
# Derive labels and colours from the counts themselves so every wedge is named
# after the category it actually represents, whatever order value_counts() returns.
sex_counts = df['Sex'].value_counts()
wedge_colors = {'male': 'blue', 'female': 'pink'}
plt.pie(
    sex_counts,
    # Pull the first wedge out slightly, as before.
    explode=[0.1 if position == 0 else 0 for position in range(len(sex_counts))],
    labels=[str(label).capitalize() for label in sex_counts.index],
    autopct="%1.1f%%",
    colors=[wedge_colors.get(label, 'gray') for label in sex_counts.index],
)
plt.title("Sex")
plt.show()

In [None]:
# Total fare collected per passenger class.
# The string alias 'sum' is used instead of np.sum: passing the NumPy callable
# as aggfunc triggers a FutureWarning in current pandas.
class_fare = df.pivot_table(index='Pclass', values='Fare', aggfunc='sum')
class_fare.plot(kind='bar')
plt.xlabel('Pclass')
plt.ylabel('Total Fare')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Fare per Pclass, with separate bars for each Survived value.
sns.barplot(data=df, x='Pclass', y='Fare', hue='Survived')

In [None]:
# Age distribution within each Pclass.
sns.boxplot(x='Pclass', y='Age',data=df)

In [None]:
plt.figure(figsize=[10,9])
# Correlation is only defined for numeric columns. Select them explicitly:
# recent pandas raises on df.corr() when the frame still holds string columns.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# **PREPROCESSING**

**CHECKING FOR NULL VALUES**

In [None]:
# Per column: does the training data contain any missing value?
df.isnull().any()

In [None]:
# Number of missing values per training column.
df.isnull().sum()

In [None]:
# Number of missing values per test column.
df_test.isnull().sum()

**DROPPING SOME UNNECESSARY COLUMNS**

In [None]:
# Remove the same set of columns from both frames, in place.
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']
for frame in (df, df_test):
    frame.drop(columns=unused_columns, inplace=True)

**FILLING MISSING VALUES IN 'Age' COLUMN**

In [None]:
# Mean age within each passenger class of the training data.
for pclass in (1, 2, 3):
    in_class = df['Pclass'] == pclass
    print(df[in_class]['Age'].mean())

In [None]:
# Mean age within each passenger class of the test data.
for pclass in (1, 2, 3):
    in_class = df_test['Pclass'] == pclass
    print(df_test[in_class]['Age'].mean())

In [None]:
def fill_in_na_values(cols, reference=None):
  """Return a passenger's age, imputing a missing one from the class mean.

  Intended for ``DataFrame.apply(..., axis=1)`` on an ``['Age', 'Pclass']``
  selection.

  Parameters
  ----------
  cols : pandas.Series
      Row whose first element is the age and whose second is the class.
  reference : pandas.DataFrame, optional
      Frame with ``'Age'`` and ``'Pclass'`` columns that supplies the means.
      Defaults to the notebook-level ``df``.

  Returns
  -------
  The age unchanged when it is present. Otherwise the rounded mean age of
  the same class in ``reference``; if that class has no known ages, the
  rounded overall mean age; if no ages are known at all, the missing value
  is returned as-is.
  """
  # .iloc keeps the positional meaning of the original cols[0] / cols[1];
  # integer keys on a string-labelled Series are deprecated in pandas.
  age = cols.iloc[0]
  pclass = cols.iloc[1]
  if not pd.isnull(age):
    return age

  if reference is None:
    reference = df
  known_ages = reference['Age']
  fill_value = known_ages[reference['Pclass'] == pclass].mean()
  if pd.isnull(fill_value):
    # Class absent from the reference (or all of its ages are missing).
    fill_value = known_ages.mean()
  if pd.isnull(fill_value):
    # Nothing to impute from; round() would raise on NaN.
    return age
  return round(fill_value)

# Impute missing ages row by row: training frame first, then the test frame.
for frame in (df, df_test):
    age_and_class = frame[['Age', 'Pclass']]
    frame['Age'] = age_and_class.apply(fill_in_na_values, axis=1)

**DROPPING NAN VALUES**

In [None]:
# Training rows that still contain missing values are discarded.
df.dropna(inplace=True)

# Test rows must all be kept: dropping any would leave df_test out of step
# with test_ids. Fill remaining gaps from training statistics instead
# (median for numeric columns, most frequent value otherwise).
for column in df_test.columns[df_test.isnull().any()]:
    if pd.api.types.is_numeric_dtype(df_test[column]):
        fill_value = df[column].median()
    else:
        fill_value = df[column].mode().iloc[0]
    df_test[column] = df_test[column].fillna(fill_value)

In [None]:
# Display the training frame after handling missing values.
df

In [None]:
# Display the test frame after handling missing values.
df_test

# **CHECK FOR CATEGORICAL COLUMNS AND PERFORM ENCODING**

**Label Encoding**

In [None]:
# List the object-dtype (categorical) columns of the training data.
df.select_dtypes(include=['object']).columns.tolist()

In [None]:
# List the object-dtype (categorical) columns of the test data.
df_test.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Distinct Embarked values in the training data.
df['Embarked'].unique()

In [None]:
# Distinct Embarked values in the test data.
df_test['Embarked'].unique()

In [None]:
# Distinct Sex values in the training data.
df['Sex'].unique()

In [None]:
# Distinct Sex values in the test data.
df_test['Sex'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used below to turn the Sex column into integer codes.
le=LabelEncoder()

In [None]:
# Fit the encoder on the training Sex column and replace it with the integer codes.
df['Sex'] = le.fit_transform(df['Sex'])
df

In [None]:
# Reuse the encoder fitted on the training data rather than re-fitting it here,
# so a given label always maps to the same integer code in both frames.
# transform() raises on a label the encoder has not seen instead of renumbering silently.
df_test['Sex'] = le.transform(df_test['Sex'])
df_test

In [None]:
# One-hot encode Embarked in the training data (one indicator column per value).
df=pd.get_dummies(df,columns=['Embarked'])
df.head()

In [None]:
# One-hot encode Embarked in the test data, then align it to the training
# feature columns (everything in df except the target). Building the dummies
# independently can otherwise give a different set or order of columns.
feature_columns = [column for column in df.columns if column != 'Survived']
df_test = pd.get_dummies(df_test, columns=['Embarked']).reindex(
    columns=feature_columns, fill_value=0
)
df_test.head()

# **SPLITTING INTO INDEPENDENT AND DEPENDENT VARIABLES**

In [None]:
# Dependent variable (target): the Survived column.
y=df['Survived']
y

In [None]:
# Independent variables: every column except the target.
target_column = 'Survived'
X = df.drop(columns=[target_column])
X.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# random_state=0 pins the shuffle so the split is the same on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [None]:
# Display the training features.
X_train

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Fit the scaler on the training split only...
X_train = sc.fit_transform(X_train)
# ...and apply the same fitted scaling to the held-out split.
X_test = sc.transform(X_test)

In [None]:
def classify(model):
    """Fit ``model`` on the training split and print its score on the held-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The model is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print('Accuracy:', accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings.
model = LogisticRegression()
classify(model)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default settings. random_state is fixed so the printed
# accuracy is the same on every run (tree building is randomised otherwise).
model = DecisionTreeClassifier(random_state=0)
classify(model)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings. random_state is fixed so the printed
# accuracy is the same on every run (bootstrapping and feature sampling are random).
model = RandomForestClassifier(random_state=0)
classify(model)

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default settings.
model = XGBClassifier()
classify(model)

In [None]:
from catboost import CatBoostClassifier
# CatBoost classifier; verbose=0 is passed, presumably to silence training logs.
model = CatBoostClassifier(verbose=0)
classify(model)

In [None]:
from sklearn import svm
# Linear support-vector classifier with a fixed random_state.
model = svm.LinearSVC(random_state=20)
classify(model)

In [None]:
from sklearn.naive_bayes import GaussianNB 
# Gaussian naive Bayes with default settings.
model = GaussianNB()  
classify(model)

Note: This notebook is unfinished. Prediction on the test set has not been implemented yet, so no submission file is produced.