In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('titanic.csv',usecols = ['Survived','Pclass','Sex','Age','Fare','Embarked','Cabin'])
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin,Embarked
0,0,3,male,22.0,7.25,,S
1,1,1,female,38.0,71.2833,C85,C
2,1,3,female,26.0,7.925,,S
3,1,1,female,35.0,53.1,C123,S
4,0,3,male,35.0,8.05,,S


In [3]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [4]:
df.shape

(891, 7)

In [5]:
df.Age = df.Age.fillna(df.Age.mean())

In [6]:
df.Cabin = df.Cabin.fillna(df.Cabin.mode()[0])
df.Embarked = df.Embarked.fillna(df.Embarked.mode()[0])


# df.Cabin = df.Cabin.fillna(df.Cabin.mode()) seems to have a syntax error. 
# The fillna() method expects a scalar value or a dictionary to fill the missing values.
# df.Cabin.mode() returns a Series object containing the mode value(s),
# so it cannot be directly used as an argument for fillna(). 
# You would need to extract the mode value explicitly by using df.Cabin.mode()[0] 
# or another appropriate method before using it in fillna().



In [7]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [8]:
df.Embarked.mode()[0]

'S'

In [9]:
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [10]:
df.Embarked.unique() #array(['S', 'C', 'Q'], dtype=object)
df.Cabin.unique() #array(['B', 'C', 'G', 'E', 'D', 'A', 'F', 'T'], dtype=object)

array(['B96 B98', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'E10', 'E44', 'A34',
       'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37',
       'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39',
       'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41',
       'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58',
       'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64', 'E24

In [11]:
df.Cabin = df.Cabin.str[0]

In [12]:
for i in df.columns[df.dtypes == 'object']:
    obj = df.pop(i)
    dum = pd.get_dummies(obj,prefix=i , drop_first = True).astype('int')
    df = pd.concat([df,dum],axis =1)



# Identify columns with object data type
# object_columns = df.select_dtypes(include='object').columns

# for col in object_columns:
#     dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
#     df = pd.concat([df, dummies], axis=1)
#     df.drop(col, axis=1, inplace=True)


In [13]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,1,1,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,1,0,0,0,0,0,0,0
2,1,3,26.0,7.925,0,1,0,0,0,0,0,0,0,1
3,1,1,35.0,53.1,0,0,1,0,0,0,0,0,0,1
4,0,3,35.0,8.05,1,1,0,0,0,0,0,0,0,1


In [14]:
# from sklearn.model_selection import train_test_split
# x_train

In [15]:
x = df.drop('Survived',axis = 1)
y = df.Survived

In [16]:
from sklearn.ensemble import RandomForestClassifier
RD = RandomForestClassifier()

In [17]:
# help(RandomForestClassifier)
# dir(RandomForestClassifier)

In [18]:
RD.fit(x,y)

In [19]:
RD.score(x,y)

0.9831649831649831

In [20]:
y_pred = RD.predict([[1,38,71,0,0,1,0,0,0,0,0,0,1]])



In [21]:
df.head(2)

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_male,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,1,1,0,0,0,0,0,0,0,1
1,1,1,38.0,71.2833,0,0,1,0,0,0,0,0,0,0


In [22]:
# from sklearn.metrics import confusion_matrix
# cm = confusion_matrix(x,y)