In [None]:
!pip install pycaret

In [None]:
import pandas as pd 
import numpy as np
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


In [None]:
%matplotlib inline
# Apply seaborn's default theme and remap matplotlib colour shorthands.
sns.set(color_codes=True)
# Build a 10-colour "Set2" palette and make it the default for later plots.
pal = sns.color_palette("Set2", 10)
sns.set_palette(pal)

In [None]:
# Load the competition train/test CSVs from the Kaggle input directory.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Preview the first rows of the training frame.
train.head()

## Sex-Survived Comparison

In [None]:
# Passenger counts per Sex, split by the Survived flag.
sns.countplot(data=train, x='Sex', hue='Survived')

In [None]:
# Number of missing values in each training column.
train.isna().sum()

## Handling Missing values for
1. Age
2. Ticket
3. Fare
4. Embarked
5. Cabin

In [None]:
## Age
# Kernel density of Age, one curve per Survived value.
sns.displot(data=train, x="Age", hue='Survived', kind="kde")

In [None]:
# Histogram of Age in 10 bins with light horizontal grid lines.
plt.hist(train['Age'], bins=10)
plt.grid(alpha=0.5, axis='y')
plt.xlabel('Age')

In [None]:
# Survival split (in percent) among passengers whose Age is missing.
age_missing_rows = train[train['Age'].isna()]
survival_counts = age_missing_rows.groupby('Survived')['Survived'].count()
survival_pct = pd.DataFrame({
    'Survived': survival_counts.index,
    '%age': survival_counts.values * 100 / survival_counts.values.sum(),
})
print('For Missing Values')
print('Total no. of observations:', len(age_missing_rows))
print(survival_pct.to_string(index=False))

In [None]:
# Survival split (in percent) among passengers whose Age equals the
# integer-truncated mean age, for comparison with the missing-Age rows.
mu = int(train['Age'].mean())
mean_age_rows = train[train['Age'] == mu]
survival_counts = mean_age_rows.groupby('Survived')['Survived'].count()
survival_pct = pd.DataFrame({
    'Survived': survival_counts.index,
    '%age': survival_counts.values * 100 / survival_counts.values.sum(),
})
print('For mean')
print('Total no. of observations:', len(mean_age_rows))
print(survival_pct.to_string(index=False))

### The survival distribution of the missing-Age rows matches that of the mean-Age rows, so we impute Age with the mean

In [None]:
# Compute the training mean once and reuse it for both frames, so the test set
# is imputed with exactly the same statistic as the training set.
age_mean = train['Age'].mean()
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated in recent pandas and does
# not modify the frame when Copy-on-Write is enabled.
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

In [None]:
## Fare
# Kernel density of Fare, one curve per Survived value.
sns.displot(data=train, x="Fare", hue='Survived', kind="kde")

### A heavily skewed distribution in which high-fare passengers survived more often than others

In [None]:
# Mean Fare for each passenger class.
train.groupby('Pclass')['Fare'].mean()

### As we can see, Fare is correlated with Pclass, so we can impute missing fares by class

In [None]:
# Replacement fares for Pclass 1, Pclass 2 and every other class value.
# NOTE(review): presumably rounded from the per-class means printed by
# train.groupby('Pclass').Fare.mean() -- confirm against that output.
c1 = 92.65
c2 = 25.76
c3 = 20.62


def impute_fare_by_class(frame):
    """Return ``frame['Fare']`` with missing values filled by passenger class.

    Missing fares become ``c1`` for Pclass 1, ``c2`` for Pclass 2 and ``c3``
    for any other class value. Non-missing fares are returned unchanged.

    Missing values are detected with pandas' own NaN handling rather than a
    ``value not in set(column)`` membership test, which only works because
    distinct NaN objects never compare equal.
    """
    class_fare = frame['Pclass'].map({1: c1, 2: c2}).fillna(c3)
    return frame['Fare'].fillna(class_fare)


train['Fare'] = impute_fare_by_class(train)
test['Fare'] = impute_fare_by_class(test)

In [None]:
## Ticket
# The 20 most frequent ticket values and how often each occurs.
(
    train.groupby('Ticket')['Ticket']
    .count()
    .sort_values(ascending=False)
    .head(20)
)


In [None]:
# The 20 largest (Ticket, Survived) groups by row count.
(
    train.groupby(['Ticket', 'Survived'])['Survived']
    .count()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Reduce each ticket to a one-character prefix: every non-letter character is
# first rewritten to 'r', then only the first character is kept. Tickets that
# start with a letter keep that letter; all others become 'r'.
# regex=True is required: '[^a-zA-Z]' is a regular expression, and pandas >= 2.0
# treats the pattern as a literal string by default, which would replace nothing.
train['Ticket'] = train['Ticket'].str.replace('[^a-zA-Z]', 'r', regex=True).str[:1]

test['Ticket'] = test['Ticket'].str.replace('[^a-zA-Z]', 'r', regex=True).str[:1]

In [None]:
# Put missing tickets and the placeholder prefix 'r' into one 'Random' bucket.
# The column is reassigned rather than filled in place: fillna(inplace=True) on
# a column selection is deprecated and does nothing under pandas Copy-on-Write.
for frame in (train, test):
    ticket_prefix = frame['Ticket'].fillna('Random')
    frame['Ticket'] = ticket_prefix.mask(ticket_prefix == 'r', 'Random')


In [None]:
# Row counts per Ticket category, most frequent first (top 20).
(
    train.groupby('Ticket')['Ticket']
    .count()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Passenger counts per Ticket category, split by the Survived flag.
sns.countplot(data=train, x='Ticket', hue='Survived')

In [None]:
# Embarked
# Row counts per Embarked value (missing values are not counted).
train.groupby('Embarked')['Embarked'].count()

In [None]:
# Passenger counts per Embarked value, split by the Survived flag.
sns.countplot(data=train, x='Embarked', hue='Survived')

In [None]:
# Row counts for every (Embarked, Survived) combination.
train.groupby(['Embarked', 'Survived'])['Survived'].count()

In [None]:
# Survival counts among the rows whose Embarked value is missing.
embarked_missing = train[train['Embarked'].isna()]
embarked_missing.groupby('Survived')['Survived'].count()

### Here we have two options: impute the mode, which is **S**, or impute the category with the most similar survival ratio, which is **Q**. We can choose between them by trial and error.

In [None]:
# Impute missing Embarked values with 'S' in both frames.
# fillna replaces the earlier `value in [np.nan]` membership test, which only
# matched when the missing value happened to be the very same np.nan object.
train['Embarked'] = train['Embarked'].fillna('S')
test['Embarked'] = test['Embarked'].fillna('S')

## Cabin

In [None]:
# Frequency of each cabin value before any cleaning.
train['Cabin'].value_counts()

In [None]:
# Keep only the first character of each cabin value and put missing cabins
# into a 'Random' bucket. The column is reassigned rather than filled in place:
# fillna(inplace=True) on a column selection is deprecated and does nothing
# under pandas Copy-on-Write.
train['Cabin'] = train['Cabin'].str[0:1].fillna('Random')
test['Cabin'] = test['Cabin'].str[0:1].fillna('Random')

In [None]:
# Frequency of each cabin category after reduction to its first character.
train['Cabin'].value_counts()

In [None]:
# Passenger counts per Cabin category, split by the Survived flag.
sns.countplot(data=train, x='Cabin', hue='Survived')

## Every missing value has been handled

In [None]:
# Remaining missing values per training column.
train.isna().sum()

In [None]:
# Remaining missing values per test column.
test.isna().sum()

In [None]:
# Split Name on ', ' and keep the leading piece as a one-element list slice;
# the following cell unwraps that list into a plain value.
for frame in (train, test):
    name_parts = frame['Name'].str.split(', ')
    frame['LastName'] = name_parts.str[:1]

In [None]:
# Replace each LastName value with its first element.
# Not idempotent: run a second time, this would keep only the first character
# of each surname string.
train['LastName'] = [parts[0] for parts in train['LastName']]
test['LastName'] = [parts[0] for parts in test['LastName']]

In [None]:
# Per-surname row count and mean survival in train, most frequent first.
df = (
    train.groupby('LastName')['Survived']
    .agg(['count', 'mean'])
    .sort_values(by='count', ascending=False)
    .reset_index()
)
df.head(20)


In [None]:
# Per-surname row count in test, most frequent first.
df2 = (
    test.groupby('LastName')['LastName']
    .agg(['count'])
    .sort_values(by='count', ascending=False)
    .reset_index()
)
df2.head(20)


In [None]:
# d maps each surname to its combined number of rows in train and test.
d = dict(zip(df['LastName'], df['count']))
for surname, test_count in zip(df2['LastName'], df2['count']):
    d[surname] = d.get(surname, 0) + test_count

In [None]:
# Keep surnames with at least 5 combined train+test rows (per d); collapse all
# other surnames into the single 'RandomSurname' category.
train['LastName'] = [
    surname if d[surname] >= 5 else 'RandomSurname'
    for surname in train['LastName']
]
test['LastName'] = [
    surname if d[surname] >= 5 else 'RandomSurname'
    for surname in test['LastName']
]

In [None]:
## Encoding CATEGORICAL variables Sex, Embarked, Ticket, LastName, Cabin
# Each column gets an encoder fitted on the union of its train and test values.
# Fitting on train alone makes le.transform(test[col]) raise ValueError as soon
# as test contains a category absent from train (possible for Ticket and Cabin
# prefixes); LastName was already fitted on the union, so this also makes the
# five columns consistent with one another.
for col in ['Sex', 'Embarked', 'Ticket', 'LastName', 'Cabin']:
    le = LabelEncoder()
    le.fit(list(train[col]) + list(test[col]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# Passenger counts per Parch value, split by the Survived flag.
sns.countplot(data=train, x='Parch', hue='Survived')

In [None]:
# Passenger counts per SibSp value, split by the Survived flag.
sns.countplot(data=train, x='SibSp', hue='Survived')

### We can create another feature that combines Parch and SibSp

In [None]:
# FamOnBoard: number of relatives travelling with the passenger (Parch + SibSp).
train['FamOnBoard'] = train['Parch'] + train['SibSp']
test['FamOnBoard'] = test['Parch'] + test['SibSp']

# Alone: 1 when the passenger has no relatives on board, otherwise 0.
# A vectorised comparison replaces the manual loops and the unused `l2` lists.
train['Alone'] = (train['FamOnBoard'] == 0).astype(int)
test['Alone'] = (test['FamOnBoard'] == 0).astype(int)


In [None]:
# Target vector, then feature matrices with identifier/text columns removed.
y = train['Survived']
X = train.drop(columns=['PassengerId', 'Survived', 'Name'])
X_test = test.drop(columns=['PassengerId', 'Name'])

In [None]:
# Preview the engineered feature matrix.
X.head()

In [None]:
# Passenger counts per FamOnBoard value, split by the Survived flag.
sns.countplot(data=train, x='FamOnBoard', hue='Survived')

In [None]:
# Passenger counts for the Alone flag, split by the Survived flag.
sns.countplot(data=train, x='Alone', hue='Survived')

In [None]:
# Annotated correlation heatmap of the features, colour scale fixed to [-1, 1].
plt.figure(figsize=(10, 7))
feature_corr = X.corr()
sns.heatmap(feature_corr, cmap='coolwarm', vmin=-1, vmax=1, annot=True)

In [None]:
# Hold out 20% of the rows as a dev set; the seed fixes the split.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Selection using Pycaret 🥕  

## Comparing baselines

In [None]:
# Star import kept on purpose: the following cells call PyCaret functions
# (models, compare_models, create_model, tune_model, ...) unqualified.
from pycaret.classification import *

# Initialise the PyCaret experiment on the engineered training frame.
# session_id fixes PyCaret's random seed so the train/hold-out split,
# cross-validation folds and tuning results are reproducible across runs.
clf1 = setup(data=train.drop(['PassengerId','Name'],axis=1),target='Survived',
             categorical_features= ['Pclass','Sex','Ticket','Embarked','Cabin'],
             silent=True,train_size=0.8,
             session_id=42,
             normalize=True, 
             create_clusters=True,
             ignore_low_variance=True,
             polynomial_features=True, 
             trigonometry_features=True,
             feature_interaction=True,
             feature_ratio=True,
             feature_selection=True )

In [None]:
# Show the table of estimators available in this PyCaret experiment.
models()

In [None]:
# Train the available estimators with default settings and rank them.
compare_models()

## Creating and Tuning Top 3 models

In [None]:
# Baseline LightGBM model, using the experiment's default fold setting.
lgbm = create_model('lightgbm')

In [None]:
# Hyperparameter-tuned version of the LightGBM baseline (PyCaret defaults).
tuned_lgbm = tune_model(lgbm)

In [None]:
# Baseline CatBoost model, evaluated with 5 folds.
cb = create_model('catboost',fold=5)

In [None]:
# Hyperparameter-tuned version of the CatBoost baseline, 5 folds.
tuned_cb = tune_model(cb,fold=5)

In [None]:
# Baseline gradient boosting classifier ('gbc'), evaluated with 5 folds.
gbc = create_model('gbc',fold=5)


In [None]:
# Hyperparameter-tuned version of the 'gbc' baseline, 5 folds.
tuned_gbc = tune_model(gbc,fold=5)

## Blending

In [None]:
# Soft-voting blend of the three tuned models.
blender = blend_models(estimator_list = [tuned_lgbm,tuned_cb,tuned_gbc], method = 'soft')

In [None]:
# Soft-voting blend of the three untuned baseline models, for comparison.
blender2 = blend_models(estimator_list = [lgbm,cb,gbc], method = 'soft')

## Stacking

In [None]:
# Stack the three tuned models, with the tuned LightGBM as the meta-model.
stacker = stack_models(estimator_list = [tuned_lgbm,tuned_cb,tuned_gbc], meta_model=tuned_lgbm)

In [None]:
# PyCaret's default diagnostic plot for the baseline LightGBM model.
plot_model(lgbm)

In [None]:
# PyCaret's default diagnostic plot for the tuned LightGBM model.
plot_model(tuned_lgbm)

In [None]:
# PyCaret's default diagnostic plot for the baseline CatBoost model.
plot_model(cb)

In [None]:
# PyCaret's default diagnostic plot for the tuned CatBoost model.
plot_model(tuned_cb)

In [None]:
# PyCaret's default diagnostic plot for the baseline 'gbc' model.
plot_model(gbc)

In [None]:
# PyCaret's default diagnostic plot for the tuned 'gbc' model.
plot_model(tuned_gbc)

In [None]:
# PyCaret's default diagnostic plot for the blended (tuned) ensemble.
plot_model(blender)

In [None]:
# PyCaret's default diagnostic plot for the stacked ensemble.
plot_model(stacker)

## Making Predictions

### Individual model predictions

In [None]:
# Predict test labels with the tuned LightGBM model and write the submission.
tuned_lgbm_labels = predict_model(tuned_lgbm, data=X_test)['Label']
df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': tuned_lgbm_labels,
})
df.to_csv('submit.csv', index=False)

In [None]:
# Predict test labels with the tuned CatBoost model and write the submission.
tuned_cb_labels = predict_model(tuned_cb, data=X_test)['Label']
df2 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': tuned_cb_labels,
})
df2.to_csv('submit2.csv', index=False)

In [None]:
# Predict test labels with the tuned 'gbc' model and write the submission.
tuned_gbc_labels = predict_model(tuned_gbc, data=X_test)['Label']
df3 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': tuned_gbc_labels,
})
df3.to_csv('submit3.csv', index=False)

### Blender Model Predictions

In [None]:
# Predict test labels with the blended ensemble and write the submission.
blender_labels = predict_model(blender, data=X_test)['Label']
df4 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': blender_labels,
})
df4.to_csv('submit4.csv', index=False)

### Stacker Model Predictions

In [None]:
# Predict test labels with the stacked ensemble and write the submission.
stacker_labels = predict_model(stacker, data=X_test)['Label']
df5 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': stacker_labels,
})
df5.to_csv('submit5.csv', index=False)

In [None]:
# Predict test labels with the untuned LightGBM baseline and write the submission.
baseline_lgbm_labels = predict_model(lgbm, data=X_test)['Label']
df6 = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': baseline_lgbm_labels,
})
df6.to_csv('submit6.csv', index=False)