In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRFClassifier, XGBClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pickle

In [24]:
data_raw=pd.read_csv('train.csv')
data_test_raw=pd.read_csv('test.csv')
data_raw.shape,data_test_raw.shape

((891, 12), (418, 11))

# EDA

In [None]:
data_raw.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [26]:
data_test_raw.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [27]:
data_raw.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [28]:
data_raw.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### EDA

### Cabin and Embarked Data are categorical, check for no of uniques in train and test, as how reliable the data in those columns are for inference dataset

In [29]:
print('Cabin Train Uniques:',data_raw['Cabin'].nunique(),', Cabin Test Uniques:',data_test_raw['Cabin'].nunique(),'\nDifference is:',data_raw['Cabin'].nunique()-data_test_raw['Cabin'].nunique())

Cabin Train Uniques: 147 , Cabin Test Uniques: 76 
Difference is: 71


In [30]:
print('Embarked Train Uniques:',data_raw['Embarked'].nunique(),', Embarked Test Uniques:',data_test_raw['Embarked'].nunique(),'\nDifference is:',data_raw['Embarked'].nunique()-data_test_raw['Embarked'].nunique())

Embarked Train Uniques: 3 , Embarked Test Uniques: 3 
Difference is: 0


In [31]:
data_raw['Ticket'].nunique(),data_raw['Fare'].nunique(),data_raw['Age'].nunique()

(681, 248, 88)

In [32]:
data_raw['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [33]:
data_raw[data_raw['Embarked']=='S']['Ticket'].nunique(),data_raw[data_raw['Embarked']=='Q']['Ticket'].nunique(),data_raw[data_raw['Embarked']=='Q']['Ticket'].nunique(),data_raw[data_raw['Embarked'].isna()]['Ticket'].nunique()

(494, 66, 66, 1)

##### Ticket and Fare are metadata 

##### As Next steps involve Feature Selection, Null Value and Data Encoding

In [35]:
## Feature Selection
cols=data_raw.columns
features=list(set(cols)-{'PassengerId','Survived','Name','Ticket','Fare','Cabin'})
label = 'Survived'

### let's first preprocess the complete data set and perform gridsearchcv to tune hyperparameter, and Split Data into train and test and later preprocess again to avoid data leakage, 

In [36]:
## suffix "pp" means preprocess in this context
data_raw_pp = data_raw[features]

#### MISSING VALUE TREATMENT

In [37]:
data_raw_pp.isnull().sum()

Pclass        0
Sex           0
SibSp         0
Parch         0
Embarked      2
Age         177
dtype: int64

#### AGE ANALYSIS

In [68]:
#### As Age is more missing values , let's analyse Age data
data_raw[data_raw['Age'].isnull()]['Ticket'].nunique()

155

In [39]:
#### Missing Values of Age is scattered around different ticket ranges, and we can not impute mean or media randonmly the age
### So Let's make Nan as 0
data_raw_pp['Age'].fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_raw_pp['Age'].fillna(0,inplace=True)


#### Embarked Analysis 

In [40]:
data_raw[data_raw['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [41]:
data_raw[data_raw['Ticket']=='113572'].shape,data_raw[data_raw['Cabin']=='B28'].shape

((2, 12), (2, 12))

In [42]:
# Embarked Nan is split across Same Ticket and Cabin

In [43]:
data_raw_pp['Embarked'].fillna('NA',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_raw_pp['Embarked'].fillna('NA',inplace=True)


#### Sex Encoder

In [44]:
sex_encoder = OneHotEncoder(drop='first',sparse_output=False)
data_raw_pp = pd.concat([data_raw_pp,pd.DataFrame(sex_encoder.fit_transform(np.array(data_raw_pp['Sex']).reshape((data_raw_pp.shape[0],-1))),columns=sex_encoder.categories_[0][1:])],axis=1)
data_raw_pp.drop('Sex',axis=1,inplace=True)

#### Embarked Label Encoder

In [47]:
data_raw_pp['Embarked'].unique()

array(['S', 'C', 'Q', 'NA'], dtype=object)

In [48]:
embarked_encoder = OneHotEncoder(sparse_output=False)
data_raw_pp = pd.concat([data_raw_pp,pd.DataFrame(embarked_encoder.fit_transform(np.array(data_raw_pp['Embarked']).reshape((data_raw_pp.shape[0],-1))),columns=embarked_encoder.categories_[0])],axis=1)
data_raw_pp.drop('Embarked',axis=1,inplace=True)

In [None]:
pickle.dump(sex_encoder,open('gender_encoder.sav','wb'))
pickle.dump(embarked_encoder,open('embarked_encoder.sav','wb'))

In [49]:
data_raw_pp.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,male,C,NA,Q,S
0,3,1,0,22.0,1.0,0.0,0.0,0.0,1.0
1,1,1,0,38.0,0.0,1.0,0.0,0.0,0.0
2,3,0,0,26.0,0.0,0.0,0.0,0.0,1.0
3,1,1,0,35.0,0.0,0.0,0.0,0.0,1.0
4,3,0,0,35.0,1.0,0.0,0.0,0.0,1.0


In [59]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

model_params = {
    'Random Forest':{
    'model':RandomForestClassifier()
    ,'params':{
        'n_estimators': [300,400,500]
        ,'max_depth' : [30,60,80]
        ,'max_features':[0.7,0.8]
    }
    },
    'Gradient Boosting':{
        'model':GradientBoostingClassifier(verbose=1,random_state=18,loss='exponential')
        ,'params':{
            'n_estimators': [300,500]
            ,'learning_rate':[0.05,0.1]
            ,'max_depth' : [40,60]
            ,'max_features':[0.8,0.9,1]
            # ,'subsample':[0.8,0.9] As Data is less, no need for sub sample
            # 'loss':['exponential']
        }
    },
    'XGBClassifier':{
        'model':XGBClassifier(random_state=18)
        ,'params':{
            'n_estimators': [400,500,600]
            ,'colsample_bynode' : [0.8,0.9,1]
            ,'learning_rate':[0.05,0.1]
            ,'max_depth' : [60,80]
            ,'reg_alpha' : [10,15]
        }
    },
    'XGBRFClassifier':{
        'model':XGBRFClassifier(random_state=18)
        ,'params':{
            'n_estimators': [400,500,600]
            ,'colsample_bynode' : [0.8,0.9,1]
            ,'learning_rate':[0.05,0.1]
            ,'max_depth' : [60,80]
            ,'reg_alpha' : [10,15]
        }
    },
    'Logistic Regression':{
        'model':LogisticRegression(random_state=18,multi_class='auto')
        ,'params':{
            'penalty':['l1','l2','elasticnet']
            ,'solver':['lbfgs', 'liblinear', 'newton-cg','saga']
            ,'max_iter':[250,350,500]
        }
    }

}

In [60]:
cvs = []
for mod_param in model_params.values():
    cv = GridSearchCV(mod_param['model'],param_grid=mod_param['params'],cv=11)
    cv.fit(data_raw_pp,data_raw[label])
    cvs.append(cv)

      Iter       Train Loss   Remaining Time 
         1           0.9335            8.69s
         2           0.8964            4.56s
         3           0.8612            7.89s
         4           0.8277            6.77s
         5           0.7960            6.72s
         6           0.7659            6.92s
         7           0.7374            6.25s
         8           0.7103            6.02s
         9           0.6846            5.91s
        10           0.6602            5.72s
        20           0.4754            5.71s
        30           0.3656            5.02s
        40           0.2999            4.74s
        50           0.2604            4.49s
        60           0.2366            4.33s
        70           0.2222            4.13s
        80           0.2135            3.91s
        90           0.2082            3.67s
       100           0.2050            3.50s
       200           0.2001            1.64s
       300           0.2001            0.00s
      Ite

198 fits failed out of a total of 396.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
33 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\KumarVe\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\KumarVe\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\KumarVe\AppData\Roaming\Python\Python311\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, s

In [69]:
best_score_idx,best_score = 0,0
for idx,cv in enumerate(cvs):
    if cv.best_score_>best_score:
        best_score_idx,best_score = idx,cv.best_score_ 
print(best_score_idx,best_score)

0 0.8125701459034791


In [73]:
cvs[best_score_idx].best_estimator_,cvs[best_score_idx].best_score_,cvs[best_score_idx].best_params_

(RandomForestClassifier(max_depth=60, max_features=0.8, n_estimators=500),
 0.8125701459034791,
 {'max_depth': 60, 'max_features': 0.8, 'n_estimators': 500})

In [82]:
pickle.dump(cvs[best_score_idx].best_estimator_,open('RandomForestClassifier.sav','wb'))

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(data_raw[features],data_raw[label])

In [11]:
## SEX,CABIN,EMBARKED ARE CATEGORICAL
#CONVERT INTO NUMERICAL
print(x_train['Sex'].unique())
x_train['Sex']=np.where(x_train['Sex']=='male',0,1)
print(x_train['Sex'].unique())

['male' 'female']
[0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Sex']=np.where(x_train['Sex']=='male',0,1)


In [12]:
x_train=pd.get_dummies(x_train,columns=['Cabin','Embarked'])

In [13]:
print(len(cols))
print(x_train.columns.shape)

8
(158,)


In [14]:
features= list(x_train.columns)
print(len(features))
print(np.ceil(np.sqrt(len(features))))

158
13.0


In [15]:
y_train=data_raw['Survived']

In [16]:
clf=XGBClassifier(max_depth=len(features),random_state=1,max_features='sqrt')

### READ TEST DATA, PERFORM MISSING VALUE TREATMENT AND CONVERT CATEGORICAL TO NUMERICAL VALUES

In [17]:
data_test_raw=pd.read_csv('test.csv')

In [18]:
data_test_raw.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [19]:
data_test_raw.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [20]:
x_test=data_test_raw[cols]

In [21]:
x_test['Age'].fillna(0,inplace=True)
x_test['Cabin'].fillna('NA',inplace=True)
x_test['Embarked'].fillna('NA',inplace=True)
x_test['Fare'].fillna(-1,inplace=True)
x_test.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Age'].fillna(0,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Cabin'].fillna('NA',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Embarked'].fillna('NA',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Fare'].fillna(-1,inplace=True)


Age         0
Embarked    0
Sex         0
Parch       0
Fare        0
SibSp       0
Cabin       0
Pclass      0
dtype: int64

In [22]:
print(x_test['Sex'].unique())
x_test['Sex']=np.where(x_test['Sex']=='male',0,1)
print(x_test['Sex'].unique())
x_test=pd.get_dummies(x_test,columns=['Cabin','Embarked'])

print(data_test_raw.columns.shape)
print(x_test.columns.shape)

['male' 'female']
[0 1]
(11,)
(86,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Sex']=np.where(x_test['Sex']=='male',0,1)


In [23]:
print(data_raw['Cabin'].nunique())
print(data_test_raw['Cabin'].nunique())

147
76


#### TRAIN SET HAS MORE FEATURES BCS ONE HOT ENCODED COLUMN CABIN HAS MORE VALUES IN TRAIN SET, IN TEST WE HAVE FEWER

### HENCE ADD MISSING FEATURE FOR CABIN AND EMBARKED COLUMNS WITH 0 AS VALUES

In [26]:
missing_features_test=set(x_train.columns)-set(x_test.columns)
print(len(missing_features_test))
missing_features_train=set(x_test.columns)-set(x_train.columns)
print(len(missing_features_train))
print(len(missing_features_train)+len(x_train.columns))
print(len(missing_features_test)+len(x_test.columns))

111
39
197
197


In [27]:
print(missing_features_train)

{'Cabin_C132', 'Cabin_D43', 'Cabin_B45', 'Cabin_C80', 'Cabin_B61', 'Cabin_A11', 'Cabin_B10', 'Cabin_C31', 'Cabin_C89', 'Cabin_C28', 'Cabin_A18', 'Cabin_B36', 'Cabin_D40', 'Cabin_E60', 'Cabin_A29', 'Cabin_E45', 'Cabin_A9', 'Cabin_F E57', 'Cabin_E52', 'Cabin_C55 C57', 'Cabin_C6', 'Cabin_B52 B54 B56', 'Cabin_C116', 'Cabin_C53', 'Cabin_D38', 'Cabin_B11', 'Cabin_A21', 'Cabin_F', 'Cabin_C97', 'Cabin_D34', 'Cabin_C105', 'Cabin_E39 E41', 'Cabin_B26', 'Cabin_C51', 'Cabin_C39', 'Cabin_F E46', 'Cabin_C130', 'Cabin_B24', 'Cabin_D22'}


In [25]:
for col in missing_features_train:
    x_train[col]=0
for col in missing_features_test:
    x_test[col]=0
print(len(x_train.columns),len(x_test.columns))

197 197


  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0
  x_test[col]=0


In [25]:
print(len(x_train.columns),len(x_test.columns))

158 86


In [27]:
#x_train.describe()
#x_test.describe()

In [28]:
#Fare has large standard deviation, Standardize the Feature Fare
#from sklearn.preprocessing import StandardScaler
#scaler=StandardScaler()
#print(scaler.mean_,scaler.std_)
#scaler.fit(x_train['Fare'])
#print(scaler.mean_,scaler.std_)
#type(scaler.transform(x_train['Fare']))

In [29]:
def scaler(sr):
    mean=sr.mean()
    std=sr.std()
    sr=sr.apply(lambda x:(x-mean)/std)
    return sr

In [30]:
x_train_Fare_Scaler=scaler(x_train['Fare'])

In [31]:
print(x_train['Fare'].describe())
print(x_train_Fare_Scaler.describe())

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64
count    8.910000e+02
mean    -1.196200e-17
std      1.000000e+00
min     -6.480577e-01
25%     -4.888737e-01
50%     -3.571902e-01
75%     -2.423274e-02
max      9.661740e+00
Name: Fare, dtype: float64


In [25]:
x_train['Fare']=x_train_Fare_Scaler
print(x_train.shape)
x_train=x_train.drop('Fare',axis=1)
print(x_train.shape)

NameError: name 'x_train_Fare_Scaler' is not defined

In [66]:
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import confusion_matrix,r2_score
train_x,val_x,train_y,val_y=split(x_train,y_train,test_size=0.2,random_state=1)

In [67]:
from sklearn.neighbors import KNeighborsClassifier
n_neighbors=[1,2,3,4,5,6,7,8,9,10]
weights=['uniform','distance']
knn=KNeighborsClassifier(n_neighbors=10,weights='distance')
knn.fit(train_x,train_y)
val_pred=knn.predict(val_x)
cfm=confusion_matrix(val_y,val_pred)
print(cfm)
r2_score(val_y,val_pred)

[[89 17]
 [35 38]]


-0.20289480485913702

In [68]:
clf=RandomForestClassifier(max_depth=len(features),random_state=1,max_features='sqrt')
clf.fit(train_x,train_y)
val_pred=clf.predict(val_x)
cfm=confusion_matrix(val_y,val_pred)
print(cfm)
r2_score(val_y,val_pred)

[[86 20]
 [22 51]]


0.028431119152235507

In [33]:
clf.fit(x_train,y_train)

RandomForestClassifier(max_depth=158, max_features='sqrt', random_state=0)

### TRAIN DATA EVALUATION

In [34]:
y_train_pred=clf.predict(x_train)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train,y_train_pred)

array([[546,   3],
       [  9, 333]], dtype=int64)

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,y_train_pred)

0.9865319865319865

In [36]:
x_test_Fare_Scaler=scaler(x_test['Fare'])

In [37]:
print(x_test['Fare'].describe())
print(x_test_Fare_Scaler.describe())

count    418.000000
mean      35.539564
std       55.869231
min       -1.000000
25%        7.895800
50%       14.454200
75%       31.471875
max      512.329200
Name: Fare, dtype: float64
count    4.180000e+02
mean    -5.843279e-17
std      1.000000e+00
min     -6.540195e-01
25%     -4.947941e-01
50%     -3.774057e-01
75%     -7.280731e-02
max      8.534029e+00
Name: Fare, dtype: float64


In [38]:
x_test['Fare']=x_test_Fare_Scaler

In [39]:
predicted_values=clf.predict(x_test)

Feature names must be in the same order as they were in fit.



In [40]:
len(predicted_values)

418

In [41]:
final_data=pd.DataFrame({'PassengerId':data_test['PassengerId'],'Survived':predicted_values})

In [42]:
final_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [41]:
#final_data.to_csv('prediction_results.csv')