In [107]:
import math
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from pandas.core.dtypes.common import is_string_dtype, is_numeric_dtype

# 1. Kullanacağınız veriyi indirip, okumak

In [108]:
df = pd.read_csv('Stars.csv')
df_copy = df.copy()
df_copy.head()

Unnamed: 0,Temperature,L,R,A_M,Color,Spectral_Class,Type
0,3068,0.0024,0.17,16.12,Red,M,0
1,3042,0.0005,0.1542,16.6,Red,M,0
2,2600,0.0003,0.102,18.7,Red,M,0
3,2800,0.0002,0.16,16.65,Red,M,0
4,1939,0.000138,0.103,20.06,Red,M,0


In [109]:
df_copy.describe()

Unnamed: 0,Temperature,L,R,A_M,Type
count,240.0,240.0,240.0,240.0,240.0
mean,10497.4625,107188.361635,237.157781,4.382396,2.5
std,9552.425037,179432.24494,517.155763,10.532512,1.711394
min,1939.0,8e-05,0.0084,-11.92,0.0
25%,3344.25,0.000865,0.10275,-6.2325,1.0
50%,5776.0,0.0705,0.7625,8.313,2.5
75%,15055.5,198050.0,42.75,13.6975,4.0
max,40000.0,849420.0,1948.5,20.06,5.0


In [110]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature     240 non-null    int64  
 1   L               240 non-null    float64
 2   R               240 non-null    float64
 3   A_M             240 non-null    float64
 4   Color           240 non-null    object 
 5   Spectral_Class  240 non-null    object 
 6   Type            240 non-null    int64  
dtypes: float64(3), int64(2), object(2)
memory usage: 13.2+ KB


# 2. Verinizin içindeki eksik ve kategorik değişkenler ile ilgilenip modele besleyeceğimiz hale getirmek 


In [111]:
df_copy.isnull().sum() #hiçbirinde boş yok fakat boş olsaydı

Temperature       0
L                 0
R                 0
A_M               0
Color             0
Spectral_Class    0
Type              0
dtype: int64

In [112]:
df_copy['Color'].unique()

array(['Red', 'Blue White', 'White', 'Yellowish White', 'Blue white',
       'Pale yellow orange', 'Blue', 'Blue-white', 'Whitish',
       'yellow-white', 'Orange', 'White-Yellow', 'white', 'yellowish',
       'Yellowish', 'Orange-Red', 'Blue-White'], dtype=object)

In [113]:
df_copy['Spectral_Class'].unique()

array(['M', 'B', 'A', 'F', 'O', 'K', 'G'], dtype=object)

In [114]:
df_copy['Type'].unique()

array([0, 1, 2, 3, 4, 5])

In [115]:
print(df.Color.value_counts())
print(f"{df.Color.value_counts().shape[0]} Çeşit Yıldız Rengi Vardır")

Red                   112
Blue                   56
Blue-white             26
Blue White             10
yellow-white            8
White                   7
Blue white              4
white                   3
Yellowish White         3
yellowish               2
Whitish                 2
Orange                  2
White-Yellow            1
Pale yellow orange      1
Yellowish               1
Orange-Red              1
Blue-White              1
Name: Color, dtype: int64
17 Çeşit Yıldız Rengi Vardır


In [116]:
print(df.Spectral_Class.value_counts())
print(f"{df.Spectral_Class.value_counts().shape[0]} Çeşit Yıldız Türü Vardır")

M    111
B     46
O     40
A     19
F     17
K      6
G      1
Name: Spectral_Class, dtype: int64
7 Çeşit Yıldız Türü Vardır


In [117]:
def train_cats(df):
  for n,c in df.items():
    if is_string_dtype(c):
      df[n] = c.astype("category").cat.as_ordered()

In [118]:
train_cats(df_copy)

In [119]:
def numericalize(df, col, name):
  if not is_numeric_dtype(col):
    df[name] = col.cat.codes + 1

In [120]:
numericalize(df_copy, df_copy['Spectral_Class'], 'Spectral_Class')
numericalize(df_copy, df_copy['Color'], 'Color')

In [121]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Temperature     240 non-null    int64  
 1   L               240 non-null    float64
 2   R               240 non-null    float64
 3   A_M             240 non-null    float64
 4   Color           240 non-null    int8   
 5   Spectral_Class  240 non-null    int8   
 6   Type            240 non-null    int64  
dtypes: float64(3), int64(2), int8(2)
memory usage: 10.0 KB


# 3.İlgilendiğiniz probleme göre error metriğine karar vermek

In [122]:
#incelediğimiz çıktı error'un miktarı olduğundan dolayı rmse kullancağız.

In [123]:
def rmse(y_hat, y):
  return math.sqrt(((y_hat-y)**2).mean())

# 4. Verinizi train-validation-test diye bölmek

In [124]:
def split_train_val(df, n):
  return df[:n].copy(), df[n:].copy()

In [125]:
n_valid = 50
n_train = len(df_copy) - n_valid

raw_train, raw_valid = split_train_val(df_copy, n_train)

In [126]:
X_train, y_train = raw_train.drop('Type', axis = 1), raw_train['Type']
X_valid, y_valid = raw_valid.drop('Type', axis = 1), raw_valid['Type']

In [127]:
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)

(190, 6) (190,)
(50, 6) (50,)


# 5. Olabildiğince fazla model denemek ve metriğimizde en iyi yapanı seçmek


In [128]:
model = RandomForestRegressor(n_estimators=10, n_jobs=-1)

In [129]:
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, n_jobs=-1)

In [137]:
print('Train')
print('---------------------')
print(f'f-score:{model.score(X_train, y_train).round(4)}  RMSE:{rmse(model.predict(X_train), y_train)}')
print('---------------------')
print('Valid')
print('---------------------')
print(f'f-score:{model.score(X_valid, y_valid).round(4)}  RMSE:{rmse(model.predict(X_valid), y_valid)}')

Train
---------------------
f-score:0.9988  RMSE:0.061558701125109236
---------------------
Valid
---------------------
f-score:0.9996  RMSE:0.028284271247461926
