# Разбиение на тестовую и обучающую выборку в Python

Задача: разбить набор данных на обучающую и тестовую выборки.

In [None]:
# Load relevant libraries.

%pylab inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
import patsy
import seaborn as sns
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
import sklearn as skl

In [None]:
# Load the Spambase dataset (CSV hosted on datahub.io) into a DataFrame.
target_url = "https://datahub.io/machine-learning/spambase/r/spambase.csv"

spam = pd.read_csv(target_url)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
spam.info()
# Summary statistics of the 'class' column.
print(spam['class'].describe())

In [None]:
# Preview the first five rows of the loaded frame.
spam.head(n=5)

## Вариант решения

### Использовать **pandas**

In [None]:
# Randomly draw 67% of the rows for training (seeded so the draw is
# repeatable); every row whose index label was not drawn forms the test set.
spamtrain = spam.sample(random_state=1066, frac=0.67)
spamtest = spam.drop(index=spamtrain.index)

# Sanity check: non-null 'class' counts for train, test and the full frame,
# one value per line.
print(
    spamtrain['class'].count(),
    spamtest['class'].count(),
    spam['class'].count(),
    sep='\n',
)

## Вариант решения

### Использовать **train_test_split** из **sklearn.model_selection**

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# NOTE(review): this CSV is read from a relative local path, so the file must
# be present in the working directory for the cell to run -- confirm it ships
# with the notebook.
df = pd.read_csv("north_korea_missile_test_database.csv")

# "Missile Name" is used as y; all remaining columns form X.
y = df["Missile Name"]
X = df.drop(columns="Missile Name")

# Hold out 20% of the rows as the test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=31, test_size=0.2)

In [None]:
# Carve a validation set out of the training portion.
#
# The splits are rebuilt from X and y instead of re-splitting X_train in
# place: `X_train, ... = train_test_split(X_train, ...)` shrinks the training
# set again every time the cell is re-run. With the same random_state values,
# a single top-to-bottom run yields exactly the same subsets as before.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=31
)
# 25% of the remaining 80% is 20% of all rows: roughly a 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=31
)

In [None]:
# Number of rows in the training features.
X_train.shape[0]

In [None]:
# Number of rows in the validation features.
X_val.shape[0]

In [None]:
# Number of rows in the test features.
X_test.shape[0]

In [None]:
# Sizes of each feature frame and its matching label series, one per line:
# train, validation, then test.
print(
    len(X_train), len(y_train),
    len(X_val), len(y_val),
    len(X_test), len(y_test),
    sep="\n",
)

# Задания:
1. Подготовить pandas DataFrame на основе «сырых» данных из https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/ (файлы **spambase.data** и **spambase.names**).
2. Провести его анализ на предмет сбалансированности классов.
3. Произвести разбиение на обучающую и тестовую выборки с использованием [**sklearn.model_selection.StratifiedShuffleSplit**](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html) в соотношениях **80/20** и **70/30**.
