In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
dir_actual = os.getcwd()
path_data = dir_actual + "\\data"

In [3]:
test_data = pd.read_csv(path_data+'\\train.csv')

# Análisis exploratorio

Total rows: 891
- Columns under total rows (Nan values): Age(714), Cabin(204), Embarked (889)

Improve performance:
- Drop: 'Name'
- Sex: str -> int64
- Age: float64 -> int64
- Ticket: str -> int64 (Have non-numerical expression i.e. A/ )
- Embarked: str -> int64

In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
test_data.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [6]:
# test_data[test_data['Age'].isnull()]

# Data Manipulation

Changed:
- Sex: str -> int64
- Embarked: str -> int64 + 2 Nan values, replaces for most_coomon_embarked
- Drop: 'Name'

Pending:
- Age: float64 -> int64
- Ticket: str -> int64 (Have non-numerical expression i.e. A/ )

In [7]:
mapping_sex = {'male':0,'female':1}
most_common_embarked = test_data['Embarked'].mode()[0]
mapping_embarked = {'Q':0,'S':1,'C':2}
dropped_columns = ['Name','Ticket','Cabin']

In [8]:
most_common_embarked

'S'

In [9]:
test_data['Sex'] = test_data['Sex'].map(mapping_sex)
test_data['Embarked'] = test_data['Embarked'].fillna(most_common_embarked)
test_data['Embarked'] = test_data['Embarked'].map(mapping_embarked)


In [10]:
try:
    test_data.drop(columns=dropped_columns,inplace=True)
except Exception as e:
    print("Exception happened: ", e)

In [11]:
print(test_data.info())
print(test_data['Sex'].unique())
print(test_data['Embarked'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB
None
[0 1]
[1 2 0]


# Random Forest

In [12]:
columns = test_data.columns
Y_column = 'Survived' # Variable dependiente 
X_columns = [col for col in columns if col != Y_column] # Variables independientes

In [13]:
X = test_data[X_columns]
y = test_data[Y_column]

Random Forest 1

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=17, test_size=0.2)
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf.score(X_test, y_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       106
           1       0.74      0.71      0.73        73

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



In [15]:
# Relación entre variables
features = pd.DataFrame(rf.feature_importances_,index=X.columns)
features.head(10)

Unnamed: 0,0
PassengerId,0.191822
Pclass,0.081678
Sex,0.24349
Age,0.17882
SibSp,0.041933
Parch,0.033269
Fare,0.190778
Embarked,0.038211


Random Forest 2

In [16]:
rf2 = RandomForestClassifier(n_estimators=1000, criterion='entropy',min_samples_split=10,max_depth=14,random_state=42)
rf2.fit(X_train,y_train)
rf2.score(X_test,y_test)
y_pred2 = rf2.predict(X_test)
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.79      0.84      0.82       106
           1       0.75      0.68      0.71        73

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.77       179
weighted avg       0.77      0.78      0.77       179

