## Import

In [539]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Load data
### Загрузка данных и создание датафрейма

In [540]:
data = pd.read_csv('diamonds (cleaned) (1).csv')
df = pd.DataFrame(data)

## Cleaning
###  Очистка данных 
#### Проверка на нулевые значения ---> Удалил коллонки Cut, Culet, Fluorescence, так как в них было более 80% нулевых значений ---> Удалил строки  с нулевыми значениями Height.

In [541]:
df.isna().sum()

Shape                    5
Cut                   4303
Color                    5
Clarity                  5
Carat Weight             5
Length/Width Ratio       5
Depth %                  7
Table %                 17
Polish                  20
Symmetry                20
Girdle                  22
Culet                 4594
Length                  20
Width                   20
Height                  20
Price                    0
Type                     0
Fluorescence          5264
dtype: int64

In [542]:
df.drop(['Cut','Culet','Fluorescence'],axis = 1,inplace = True)

In [543]:
df.isna().sum()

Shape                  5
Color                  5
Clarity                5
Carat Weight           5
Length/Width Ratio     5
Depth %                7
Table %               17
Polish                20
Symmetry              20
Girdle                22
Length                20
Width                 20
Height                20
Price                  0
Type                   0
dtype: int64

In [544]:
df.dtypes

Shape                  object
Color                  object
Clarity                object
Carat Weight          float64
Length/Width Ratio    float64
Depth %               float64
Table %               float64
Polish                 object
Symmetry               object
Girdle                 object
Length                float64
Width                 float64
Height                float64
Price                   int64
Type                   object
dtype: object

In [545]:
df = df.drop(df[df['Height'].isna()].index.tolist())

In [546]:
df = df.drop(df[df['Girdle'].isna()].index.tolist())

In [547]:
df.isnull().sum()

Shape                 0
Color                 0
Clarity               0
Carat Weight          0
Length/Width Ratio    0
Depth %               0
Table %               0
Polish                0
Symmetry              0
Girdle                0
Length                0
Width                 0
Height                0
Price                 0
Type                  0
dtype: int64

## Encoding
### Декодировал строки [Symmetry, Color, Clarity, Type, Grigle, Polish] в int значения, с помощью LabelEncoder

In [548]:
label_encoder = LabelEncoder()
df['Symmetry'] = label_encoder.fit_transform(df['Symmetry'])
df['Color'] = label_encoder.fit_transform(df['Color'])
df['Clarity'] = label_encoder.fit_transform(df['Clarity'])
df['Type'] = label_encoder.fit_transform(df['Type'])
df['Girdle'] = label_encoder.fit_transform(df['Girdle'])
df['Polish'] = label_encoder.fit_transform(df['Polish'])

df.dtypes

Shape                  object
Color                   int64
Clarity                 int64
Carat Weight          float64
Length/Width Ratio    float64
Depth %               float64
Table %               float64
Polish                  int64
Symmetry                int64
Girdle                  int64
Length                float64
Width                 float64
Height                float64
Price                   int64
Type                    int64
dtype: object

In [549]:
df

Unnamed: 0,Shape,Color,Clarity,Carat Weight,Length/Width Ratio,Depth %,Table %,Polish,Symmetry,Girdle,Length,Width,Height,Price,Type
0,Cushion Modified,2,5,1.84,1.02,65.8,59.0,0,2,7,7.09,6.95,4.57,2640,1
1,Pear,1,2,1.20,1.65,62.5,58.0,2,2,7,9.64,5.86,3.66,1070,1
2,Oval,1,2,1.19,1.41,63.1,63.0,2,2,7,8.44,6.00,3.79,1070,1
3,Heart,0,1,1.00,1.18,61.7,58.0,0,0,13,5.85,6.89,4.25,7110,0
4,Radiant,4,2,1.01,1.35,69.4,66.0,0,2,14,6.80,5.05,3.50,3050,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6480,Pear,2,3,1.02,1.54,62.9,63.0,0,2,16,8.60,5.59,3.51,3760,0
6481,Heart,1,5,1.00,1.16,58.3,59.0,0,0,13,6.05,6.99,4.07,4710,0
6482,Princess,3,4,1.03,1.01,72.6,71.0,0,0,6,5.59,5.51,4.00,3270,0
6483,Radiant,1,2,1.06,1.43,67.8,62.0,0,0,12,7.24,5.05,3.42,1050,1


## Model
## Построение модели
#### Использовал модель DecisionTreeClassifier
#### В первом случае разделял тестовую и обучающую выборки вручную, во втором с помощью train_test_split


In [550]:
#Создавал вручную тестовую и обучающую выборки
# predicted_df = df.drop(df.index[:6400])
# predicted_df = predicted_df.drop(columns = 'Shape')
# df.drop(df.index[6400:],inplace = True)
# X = df.drop(columns = 'Shape')
# X.head(5)
# y = df['Shape']
# y.head(5)
# model = DecisionTreeClassifier()
# model.fit(X,y)
# model.predict(predicted_df)

In [551]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
y_train

3153        Oval
4444       Round
3423        Oval
809     Princess
5119    Princess
          ...   
4489    Princess
5260     Radiant
557        Round
172        Round
1289    Marquise
Name: Shape, Length: 5120, dtype: object

In [552]:
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

## Prediction
### Предсказание на основе тестовой выборки X_test

In [553]:
predictions = model.predict(X_test)
predictions

array(['Round', 'Pear', 'Cushion Modified', ..., 'Heart', 'Pear', 'Oval'],
      shape=(1280,), dtype=object)

## Accuracy 
### Измерение точности модели
## Точность ≈ 95%

In [554]:
model_accuracy_score = accuracy_score(y_test,predictions)
model_accuracy_score

0.94453125