In [2]:
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [28]:
# df = pd.read_csv('train.csv',usecols=['Age','Pclass','SibSp','Parch','Survived'])

df = pd.read_csv('train.csv')[['Age','Pclass','SibSp','Parch','Survived']]

In [30]:
df.shape

(891, 5)

In [32]:
df.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [34]:
df.isnull().sum()

Age         177
Pclass        0
SibSp         0
Parch         0
Survived      0
dtype: int64

In [36]:
# NOTE => dropna() will remove all the rows in which we have missing values 


df.dropna(inplace = True)

In [22]:
df.isnull().sum()

Age         0
Pclass      0
SibSp       0
Parch       0
Survived    0
dtype: int64

In [38]:
df.shape

(714, 5)

In [42]:
X = df.iloc[:,0:4]
y = df.iloc[:,-1]

In [46]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [48]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Cross-validation score tells you how well your model performs on unseen data by testing it multiple times on different parts of the dataset.

In [50]:
np.mean(cross_val_score(LogisticRegression() , X , y ,scoring = 'accuracy', cv = 20))

0.6933333333333332

# Applying FEATURE CONSTRUCTION =>

In [53]:
X['Family_size'] = X['SibSp'] + X['Parch'] + 1

# 1 isliye add  kiya kyuki us passenger ko bhi mai family count me add karna chahta hu

In [55]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [63]:
def myfunc(num):
    if num == 1:
        # alone--->
        return 0
    elif num > 1 and num <= 4:
        # small family --->
        return 1
    else:
        # large family---->
        return 2

In [65]:
myfunc(4)

1

In [67]:
X['Family_type'] = X['Family_size'].apply(myfunc)

In [69]:
X.head()

Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [73]:
X.drop(columns = ['SibSp','Parch','Family_size'],inplace = True)

In [75]:
X.head()

Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [79]:
np.mean(cross_val_score(LogisticRegression() , X , y ,scoring = 'accuracy',cv = 20))

0.7003174603174602

# FEATURE SPLITTING =>

In [85]:
df = pd.read_csv('train.csv')

In [87]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
372,373,0,3,"Beavan, Mr. William Thomas",male,19.0,0,0,323951,8.05,,S
455,456,1,3,"Jalsevac, Mr. Ivan",male,29.0,0,0,349240,7.8958,,C
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
443,444,1,2,"Reynaldo, Ms. Encarnacion",female,28.0,0,0,230434,13.0,,S
94,95,0,3,"Coxon, Mr. Daniel",male,59.0,0,0,364500,7.25,,S


In [89]:
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [93]:
df['Name'].str.split(',',expand = True)

Unnamed: 0,0,1
0,Braund,Mr. Owen Harris
1,Cumings,Mrs. John Bradley (Florence Briggs Thayer)
2,Heikkinen,Miss. Laina
3,Futrelle,Mrs. Jacques Heath (Lily May Peel)
4,Allen,Mr. William Henry
...,...,...
886,Montvila,Rev. Juozas
887,Graham,Miss. Margaret Edith
888,Johnston,"Miss. Catherine Helen ""Carrie"""
889,Behr,Mr. Karl Howell


In [97]:
df['Name'].str.split(',',expand = True)[1]

0                                  Mr. Owen Harris
1       Mrs. John Bradley (Florence Briggs Thayer)
2                                      Miss. Laina
3               Mrs. Jacques Heath (Lily May Peel)
4                                Mr. William Henry
                          ...                     
886                                    Rev. Juozas
887                           Miss. Margaret Edith
888                 Miss. Catherine Helen "Carrie"
889                                Mr. Karl Howell
890                                    Mr. Patrick
Name: 1, Length: 891, dtype: object

In [209]:
df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True).head()

Unnamed: 0,0,1,2
0,Mr,Owen Harris,
1,Mrs,John Bradley (Florence Briggs Thayer),
2,Miss,Laina,
3,Mrs,Jacques Heath (Lily May Peel),
4,Mr,William Henry,


In [211]:
df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]

0         Mr
1        Mrs
2       Miss
3        Mrs
4         Mr
       ...  
886      Rev
887     Miss
888     Miss
889       Mr
890       Mr
Name: 0, Length: 891, dtype: object

In [219]:
df['Title'] = df['Name'].str.split(',',expand = True)[1].str.split('.',expand = True)[0]

In [221]:
df['Title']

0         Mr
1        Mrs
2       Miss
3        Mrs
4         Mr
       ...  
886      Rev
887     Miss
888     Miss
889       Mr
890       Mr
Name: Title, Length: 891, dtype: object

In [223]:
df[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [None]:
(df.groupby('Title').mean()['Survived']).sort_values(ascending = False)

# .loc[...] → This selects only those rows where the condition is True.

In [290]:
df['Is_Married'] = 0
# df['Is_Married'].loc[df['Title'] == 'Mrs'] = 1

df.loc[df['Title'] == 'Mrs', 'Is_Married'] = 1

In [294]:
df['Is_Married']

0      0
1      0
2      0
3      0
4      0
      ..
886    0
887    0
888    0
889    0
890    0
Name: Is_Married, Length: 891, dtype: int64