In [239]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [152]:
df = pd.read_csv("penguins_size.csv")
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [153]:
df.isna().any()

species              False
island               False
culmen_length_mm      True
culmen_depth_mm       True
flipper_length_mm     True
body_mass_g           True
sex                   True
dtype: bool

In [154]:
df.isna().sum() #na değerlerimiz az olduğu için direkt olarak droplayabiliriz.

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [155]:
df = df.dropna()
df.isna().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [156]:
df.nunique() #sex için 3 farklı değer var 2 olması lazım normalde

species                3
island                 3
culmen_length_mm     163
culmen_depth_mm       79
flipper_length_mm     54
body_mass_g           93
sex                    3
dtype: int64

In [157]:
df.sex.value_counts() # . diye bir değer gözüküyor bunu da dropluyorum

MALE      168
FEMALE    165
.           1
Name: sex, dtype: int64

In [158]:
df.index[df['sex']=='.']

Int64Index([336], dtype='int64')

In [159]:
df=df.drop(df.index[df['sex'] == '.'])
df.sex.value_counts()
df=df.reset_index()

In [160]:
df['sex'] = df['sex'].replace(['MALE','FEMALE'],[1,0])

In [161]:
X = pd.get_dummies(df.drop('sex',axis=1),drop_first=True)
y = df['sex']
#categorical variableları encode ediyorum. drop_first=True yaparak correlation olmasını engelliyorum.
#örnek: sex_female=0 ise sex_male=1 olmak zorunda zaten. veya 3 categorical değer varsa 2 si değilse zaten 3. değer=1 olmak zorunda

In [162]:
X

Unnamed: 0,index,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,species_Chinstrap,species_Gentoo,island_Dream,island_Torgersen
0,0,39.1,18.7,181.0,3750.0,0,0,0,1
1,1,39.5,17.4,186.0,3800.0,0,0,0,1
2,2,40.3,18.0,195.0,3250.0,0,0,0,1
3,4,36.7,19.3,193.0,3450.0,0,0,0,1
4,5,39.3,20.6,190.0,3650.0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
328,338,47.2,13.7,214.0,4925.0,0,1,0,0
329,340,46.8,14.3,215.0,4850.0,0,1,0,0
330,341,50.4,15.7,222.0,5750.0,0,1,0,0
331,342,45.2,14.8,212.0,5200.0,0,1,0,0


#### Base Model

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [169]:
model_base = RandomForestClassifier(n_estimators=10, bootstrap=False,random_state=42)

In [178]:
scores_base =cross_val_score(model_base,X_train,y_train,scoring='accuracy',cv=5)
#kendi içinde cv yaparken x train ve y traini validation ve training olarak tekrar bölüyor zaten.
#train validate test splitlerimiz oluşmuş oluyor.

In [246]:
base_accuracy= scores_base.mean()
base_accuracy
#base modelimiz ortalama %88 r2 değeri verdi

0.8870020964360587

#### Tuned Model

In [208]:
model_tuned = RandomForestClassifier(random_state=42)

In [230]:
grid_params = {'criterion': ['entropy', 'gini'],
               'max_depth': [1,3,5] + [None],
               'max_features': ['auto', 'sqrt','log2', None],
               'min_samples_leaf': [2,8],
               'min_samples_split': [2,5],
               'n_estimators': [50,100]}


In [231]:
grid_model = GridSearchCV(estimator=model_tuned,
                          param_grid=grid_params,
                          scoring='accuracy',
                          cv=5,
                          verbose=1)

In [232]:
grid_model.fit(X_train,y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 3, 5, None],
                         'max_features': ['auto', 'sqrt', 'log2', None],
                         'min_samples_leaf': [2, 8],
                         'min_samples_split': [2, 5],
                         'n_estimators': [50, 100]},
             scoring='accuracy', verbose=1)

In [254]:
grid_model.best_estimator_.get_params()
#aradığımız parametrelerden en iyi sonucu veren kombinasyonu listeler

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [249]:
y_pred = grid_model.best_estimator_.predict(X_train)
accuracy_score(y_pred,y_train)
#önceden %88 iken şimdi %95 e yükselttik
#artık test setine uygulayabiliriz

0.9511278195488722

In [251]:
y_pred_final = grid_model.predict(X_test) #test setine uyguluyoruz

In [252]:
accuracy_score(y_test, y_pred_final)
#hiç görmediği test setinde %86 r2 değerine ulaştık

0.8656716417910447