In [158]:
import numpy  as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer



In [159]:
train=pd.read_csv('/kaggle/input/titanic/train.csv')
test=pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
train.head()

In [None]:
test.head()


In [None]:
train.describe()

In [None]:
train.info()

In [None]:
def data_info(df):
    """Print a missing-value and cardinality summary of ``df``.

    Numerical and ``object`` columns are listed separately together with
    their NaN counts. For every ``object`` column the value counts are
    printed, unless the column has more than 10 distinct values, in which
    case only a one-line notice is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    num_cols=df.select_dtypes(['number']).columns
    cat_cols=df.select_dtypes(['object']).columns
    print("---------------------------")
    print("numerical columns:",num_cols)
    print(df[num_cols].isna().sum())
    print("---------------------------")
    print("categorical columns:",cat_cols)
    print(df[cat_cols].isna().sum())

    for col in cat_cols:
        # Count the values of this one column. Counting df[cat_cols] instead
        # would count joint combinations of all categorical columns, giving
        # the same (usually huge) number for every column.
        counts=df[col].value_counts()
        if len(counts)>10:
            print(col,"has more than 10 category")
        else:
            print(counts)
    print("--------------------------- End")
    
data_info(train)
data_info(test)

In [None]:
Numeric_cols=train.select_dtypes(include=['number']).columns.tolist()
categorical_cols=train.select_dtypes(include=['object','category']).columns.tolist()


In [None]:
# PassengerId carries no predictive signal, so it is removed from both frames.
# The test-set ids are kept in `PassengerId` first because the submission file
# needs them. The membership checks make the cell safe to re-run.
if 'PassengerId' in test.columns:
    PassengerId=test['PassengerId'].copy()
    test=test.drop(columns='PassengerId')
if 'PassengerId' in train.columns:
    train=train.drop(columns='PassengerId')



In [None]:
train.head()

## **Fill NaN values**

In [None]:
train.isna().sum(),test.isna().sum()

In [None]:
train.info()

In [None]:
def fill_nan(df, reference=None):
    """Fill missing ``Age`` values with a median and ``Fare`` values with a mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``Age`` and ``Fare`` columns.
    reference : pandas.DataFrame, optional
        Frame the median / mean are computed from. Defaults to ``df`` itself.
        Pass the training frame when filling the test frame so both are
        imputed with the same statistics.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Age`` and ``Fare`` as float64 and their NaNs
        filled. The input frame is left untouched.
    """
    stats_source = df if reference is None else reference
    filled = df.copy()

    # Series.median() / Series.mean() skip NaN by default.
    filled['Age'] = filled['Age'].astype('float64').fillna(stats_source['Age'].median())
    filled['Fare'] = filled['Fare'].astype('float64').fillna(stats_source['Fare'].mean())

    return filled


train=fill_nan(train)
test=fill_nan(test)



In [None]:
def fill_nan_cat(df, reference=None):
    """Fill missing ``Embarked`` values with the most frequent category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Embarked`` column.
    reference : pandas.DataFrame, optional
        Frame the most frequent value is taken from. Defaults to ``df``
        itself. Pass the training frame when filling the test frame.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the NaNs in ``Embarked`` filled. The input
        frame is left untouched.
    """
    cat_cols=['Embarked']
    stats_source = df if reference is None else reference
    filled = df.copy()

    for col in cat_cols:
        # mode() returns its result sorted, so ties resolve to the smallest value.
        modes = stats_source[col].mode(dropna=True)
        # A column without any observed value has no mode; leave it as it is.
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    return filled
train=fill_nan_cat(train)
test=fill_nan_cat(test)

In [None]:
train

In [None]:
# Turn 'Cabin' into a 0/1 "cabin is known" flag on both frames.
# The conversion only runs while the column still contains NaN, so running the
# cell a second time leaves an already converted column alone.
for frame in (train, test):
    if frame['Cabin'].isna().any():
        frame['Cabin'] = frame['Cabin'].notna().astype('int64')

In [None]:
train

In [None]:
test

## **Set Target**

In [None]:
def cleaning_data(df):
    """Derive ``Title`` and ``Family_member`` and drop the raw source columns.

    ``Title`` is the text between the first comma of ``Name`` and the next
    period (e.g. ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``), with surrounding
    whitespace removed. A name without a comma yields NaN instead of raising.
    ``Family_member`` is ``SibSp + Parch``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Name``, ``Ticket``, ``SibSp`` and ``Parch`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Title`` and ``Family_member`` appended and
        ``Name``, ``Ticket``, ``SibSp`` and ``Parch`` removed. The input
        frame is left untouched.
    """
    after_comma = df['Name'].str.split(',').str[1]
    # regex=False: the period is a literal separator, not "any character".
    title = after_comma.str.split('.', regex=False).str[0].str.strip()

    enriched = df.assign(Title=title, Family_member=df['SibSp'] + df['Parch'])
    return enriched.drop(columns=['Name','Ticket','SibSp','Parch'])
    
train=cleaning_data(train)
test=cleaning_data(test)
train

In [None]:
train['Title'].isna().sum(),test['Title'].isna().sum()

In [None]:
train['Title'].unique()

In [None]:
train.info()

In [None]:
test.info()

In [None]:

def get_plot(df):
    """Show one count plot per column of ``df``, with bars split by ``Survived``.

    Every column of the frame is plotted in its own 6x3 figure, in column
    order. ``df`` must contain a ``Survived`` column, which is used as hue.
    """
    for column_name in df.columns:
        _, axis = plt.subplots(figsize=(6, 3))
        sns.countplot(x=column_name, data=df, hue='Survived', ax=axis)
        plt.show()
get_plot(train)


In [None]:
train.info()

## **Data Encoding**

ohe=OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
data=ohe.fit_transform(train_ds[['Sex']])
train_ds=pd.concat([train_ds,data], axis=1)
train_ds=train_ds.drop(columns='Sex', axis=1)

data=ohe.fit_transform(test_ds[['Sex']])
test_ds=pd.concat([test_ds,data], axis=1)
test_ds=test_ds.drop(columns='Sex', axis=1)


In [None]:
# One-hot encode 'Embarked'. The encoder is fitted on the training frame only
# and then reused for the test frame, so both frames get the same indicator
# columns. With handle_unknown='ignore', a port that was not seen during
# fitting is encoded as an all-zero row instead of raising.
ohe=OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')
data=ohe.fit_transform(train[['Embarked']])
train=pd.concat([train.drop(columns='Embarked'),data], axis=1)

# transform(), not fit_transform(): the categories must come from train.
data=ohe.transform(test[['Embarked']])
test=pd.concat([test.drop(columns='Embarked'),data], axis=1)

In [None]:
def replace_value(df):
    """Encode the ``Sex`` column of ``df`` as integers, in place.

    ``'female'`` becomes 1 and ``'male'`` becomes 2; any other value
    (including NaN or an already encoded 1 / 2) is left as it is, so calling
    the function twice is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Sex`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can also be assigned or chained.
    """
    sex_codes = {'female':1, 'male':2}
    # The same mapping used to be applied to 'Pclass' as well. Its keys are sex
    # labels, so that looks like a copy-paste slip and has been dropped.
    df['Sex']=df['Sex'].map(lambda value: sex_codes.get(value, value))
    return df

In [None]:
# A stray bare name (`wegrgfefe`) used to sit in this cell; it raised NameError
# and stopped "Run All" here, so it has been removed.

In [None]:
train.info()

In [None]:
train_ds.head()

In [None]:
test_ds.head()

In [None]:
train_ds.info()

In [None]:
test_ds.info()

In [None]:
# NOTE(review): seaborn is already imported in the first cell; this repeated
# import is redundant and nothing in this cell uses `sns`.
import seaborn as sns

#correlation_matrix = train_ds.corr()

# Correlation of every column of `train_ds` with the label series.
# NOTE(review): `target` is assigned in a later cell, so on a fresh kernel this
# cell fails unless that cell has been run first.
# NOTE(review): no visible code cell creates `train_ds` - confirm where it is
# supposed to come from.
correlation_with_target = train_ds.corrwith(target)

# Plot the correlation values as one bar per feature.
plt.figure(figsize=(10, 6))
correlation_with_target.plot(kind='bar', color='skyblue')
plt.title('Correlation with Target Feature (Survived)')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.show()

In [None]:
test_ds=test_ds.drop(columns='Embarked_Q', axis=1)
train_ds=train_ds.drop(columns='Embarked_Q', axis=1)

In [None]:
# Separate the label column from the features.
# NOTE(review): `target` is already used by the correlation cell further up, so
# this cell has to run before it. It is also not re-runnable: once 'Survived'
# has been dropped from `train`, the first line raises KeyError.
target=train['Survived']

train=train.drop(columns='Survived',axis=1)

## **Normalization (Min-Max scaling)**

Scales the data to a specified range (usually 0 to 1).

In [None]:
# Remember the column labels: the scaler returns plain NumPy arrays.
train_data_col=train_ds.columns
test_data_col=test_ds.columns

# Fit the scaling range on the training features only, then apply the same
# range to the test features.
scaler=MinMaxScaler()
train_scaled = scaler.fit_transform(train_ds)
test_scaled = scaler.transform(test_ds)

# Pass the Index itself as `columns`. Wrapping it in a list ([train_data_col])
# makes pandas build MultiIndex columns, so plain label lookups no longer work.
train_ds = pd.DataFrame(train_scaled, columns=train_data_col)
test_ds  = pd.DataFrame(test_scaled, columns=test_data_col)

In [None]:
test_ds

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score


# Hold out 20% of the labelled rows for a final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(train_ds, target, test_size=0.2, random_state=42)
# Seed the forest as well as the split; otherwise the grid-search scores and
# the selected parameters change from run to run.
accuracy_model = RandomForestClassifier(random_state=42)




In [None]:
# Candidate hyper-parameters; every combination is tried with 5-fold CV.
param_grid = dict(
    n_estimators=[50, 75, 100, 150],
    max_depth=[None, 10, 15, 20],
    min_samples_split=[2, 5, 10],
)

grid_search = GridSearchCV(accuracy_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

best_param = grid_search.best_params_
best_model = grid_search.best_estimator_
# Accuracy of the best estimator on the held-out split.
accuracy = accuracy_score(y_test, best_model.predict(X_test))
accuracy

In [None]:
best_param

In [None]:
# Print the importance the tuned forest (`best_model`) assigns to each feature.
# The cell used to refit `accuracy_model` first, but the importances were read
# from `best_model`, so that fit never influenced the output and was removed.
importance=best_model.feature_importances_
for feature_name, score in zip(train_data_col, importance):
    print(feature_name, score)


In [None]:
#cols=train_ds.columns
train_data_col

In [None]:
# Final model, trained on all labelled rows.
# NOTE(review): the hyper-parameters are hard-coded rather than taken from
# `best_param` - confirm they still match the grid-search result.
# random_state makes the fitted forest, and so the submission, reproducible.
model = RandomForestClassifier(max_depth=10,min_samples_split=10, n_estimators=150, random_state=42)
model.fit(train_ds, target)

In [None]:
y_pred = model.predict(test_ds)

In [None]:
# Assemble the two-column submission frame.
# NOTE(review): this relies on a `PassengerId` variable holding the test-set
# ids; make sure it is captured before that column is dropped from `test`.
submission=pd.DataFrame()
submission['PassengerId']=PassengerId
# pd.Series(y_pred) has a default 0..n-1 index and is aligned on the frame's
# index when assigned, so `PassengerId` needs that same default index.
submission['Survived']=pd.Series(y_pred)

In [None]:
submission.head()


submission.to_csv('Titanic_Machine_Learning_from_Disaster.csv',index=False)
