# 5 Parameters used in Decision Tree Model

In [1]:
from sklearn import tree
# Instantiate a decision-tree classifier with all-default hyperparameters.
# As the last expression in the cell, the estimator's repr is displayed as output.
tree.DecisionTreeClassifier()

DecisionTreeClassifier()

# Data Cleaning Methods

### 1: Check for missing values

In [2]:
import pandas as pd
# Load the Pokémon dataset from a CSV file in the working directory.
df = pd.read_csv('pokemon_data.csv')
# Preview the first three rows to check the column layout.
df.head(3)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(800, 12)

In [4]:
# Count the missing values in each column.
# In the output below, 'Type 2' is the only column with nulls.
df.isna().sum()

#               0
Name            0
Type 1          0
Type 2        386
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

### 2: Handling null values

In [5]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose
# `method` argument is deprecated in pandas >= 2.1.
# A null in the very first row has nothing above it and would stay null.
# NOTE(review): for 'Type 2' this copies the secondary type of the previous
# row (a different Pokémon) — confirm this is intended rather than a
# placeholder such as 'None'.
df = df.ffill()
# Re-count nulls per column to verify the fill.
df.isnull().sum()

#             0
Name          0
Type 1        0
Type 2        0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

### 3: Dropping column

In [6]:
# Build a copy of the frame without the 'Type 2' and '#' columns.
# `drop` returns a new DataFrame, so `df` still holds every column.
a = df.drop(columns=['Type 2', '#'])
a.head(3)

Unnamed: 0,Name,Type 1,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,45,49,49,65,65,45,1,False
1,Ivysaur,Grass,60,62,63,80,80,60,1,False
2,Venusaur,Grass,80,82,83,100,100,80,1,False


# Aiming for 90% accuracy on the training dataset

In [7]:
import pandas as pd
# Load the training data from a CSV file in the working directory.
train_data = pd.read_csv('train.csv')
# Preview the first three rows to check the column layout.
train_data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


#### Data Cleaning

In [8]:
# Keep only the target and the columns later used as model inputs;
# `train_data` itself is left untouched because `drop` returns a new frame.
clean_data = train_data.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
)
clean_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925


In [9]:
from sklearn.preprocessing import LabelEncoder
# Convert the string 'Sex' column to integer codes so the classifier can use it.
le = LabelEncoder()
# Overwrites the column in place; per the output below, female -> 0 and male -> 1.
clean_data['Sex'] = le.fit_transform(clean_data['Sex'])
clean_data.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925


In [10]:
def myfun(cols):
    """Return a passenger's age, substituting a class-based default when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``clean_data[['Age', 'Pclass']]`` as supplied by
        ``DataFrame.apply(myfun, axis=1)``.

    Returns
    -------
    The original Age when it is not null; otherwise 37 for Pclass 1,
    29 for Pclass 2, and 24 for any other class.
    """
    # Read the two values by position. On a label-indexed Series, `cols[0]`
    # relies on a deprecated label-to-position fallback (FutureWarning in
    # pandas 2.x), so use `.iloc` when it is available and plain indexing
    # for lists, tuples, and arrays.
    if hasattr(cols, 'iloc'):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    # Known age: nothing to impute.
    if not pd.isnull(age):
        return age

    # NOTE(review): the defaults below are presumably typical ages per
    # passenger class; they are not derived anywhere in this notebook — confirm.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [11]:
# Apply `myfun` row by row (axis=1) to the (Age, Pclass) pairs and overwrite
# 'Age' with the result, so missing ages receive a class-based default.
clean_data['Age'] = clean_data[['Age', 'Pclass']].apply(myfun,axis=1)
# Summarise dtypes and non-null counts to confirm no nulls remain.
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(4)
memory usage: 45.4 KB


In [12]:
# Feature columns fed to the model.
inputs = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
# Target column; kept as a one-element list, so `y` below is a
# single-column DataFrame rather than a Series.
outputs = ['Survived']

x = clean_data[inputs]
y = clean_data[outputs]

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 33.3% of the rows for evaluation; random_state=4 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.333, random_state=4)

In [49]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited tree (max_depth=4) using the best split at each node.
# Even with splitter='best', scikit-learn permutes features at each split and
# breaks ties between equally good splits at random, so random_state is pinned
# to make the fitted tree and the reported accuracy reproducible across runs.
model = DecisionTreeClassifier(max_depth=4, splitter='best', random_state=4)

In [50]:
# Fit the tree on the training split only.
model.fit(X_train, y_train)
# Predict labels for the held-out split, for scoring in the next cell.
y_pred = model.predict(X_test)

In [51]:
from sklearn import metrics
# Fraction of held-out rows whose predicted label matches the true label.
# scikit-learn metrics take (y_true, y_pred) in that order; accuracy happens to
# be symmetric, but keeping the documented order avoids wrong results if this
# line is adapted for an asymmetric metric such as precision or recall.
a=metrics.accuracy_score(y_test, y_pred)
a

0.8720538720538721