<a href="https://colab.research.google.com/github/VorobyvEgor/Seminar_Sber/blob/main/Seminars/%D0%97%D0%B0%D0%BD%D1%8F%D1%82%D0%B8%D0%B5_10_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

%matplotlib inline

Пройдем весь путь от загрузки данных до создания файла с ответами для датасета Титаник: https://www.kaggle.com/c/titanic/data?select=train.csv

1. Загрузка данных

In [None]:
# Fetch the two competition CSV files (test.csv and train.csv) with the wget
# shell command; the following cells read them by these file names.
! wget https://www.dropbox.com/s/5dedxy5zeydon9m/test.csv
! wget https://www.dropbox.com/s/tnjuesm1uiquxe2/train.csv

In [None]:
# Read both CSV files (train first, then test) and stack them into a single
# frame indexed by PassengerId, so that the feature engineering below is
# applied to all rows at once.
trd, tsd = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
td = pd.concat([trd, tsd], ignore_index=True, sort=False).set_index('PassengerId')
print(td.shape)
td.head()

2. Пропущенные значения

In [None]:
# Draw the boolean null-mask of the combined frame as a heatmap (no colour bar).
# For per-column counts instead of a picture, td.isnull().sum() can be used.
missing_mask = td.isnull()
sns.heatmap(missing_mask, cbar=False).set_title("Missing values heatmap")

3. Категориальные признаки

Далее, чтобы определить категориальные признаки, можно посмотреть на количество уникальных значений в каждом столбце. Признаки «Sex» и «Survived» имеют по два возможных значения, а «Embarked» и «Pclass» — по три.

In [None]:
# Count distinct non-null values per column; low counts hint at categorical features.
td.nunique(axis=0, dropna=True)

4. Новые признаки

In [None]:
# Engineered features:
#   Family        - sum of the Parch and SibSp columns
#   Is_Alone      - True when Family is 0
#   Fare_Category - Fare bucketed into four labelled bands
td['Family'] = td['Parch'] + td['SibSp']
td['Is_Alone'] = td['Family'] == 0
# pd.cut intervals are right-closed by default, so with the first edge at 0 a
# fare of exactly 0 is left uncategorised (NaN) unless include_lowest=True.
# Likewise, a finite top edge of 120 leaves every fare above 120 as NaN, so the
# last bin is made open-ended. Missing fares still map to NaN.
td['Fare_Category'] = pd.cut(
    td['Fare'],
    bins=[0, 7.90, 14.45, 31.28, np.inf],
    labels=['Low', 'Mid', 'High_Mid', 'High'],
    include_lowest=True,
)

5. Заполнение пропусков

In [None]:
# Fill missing ports of embarkation with the most frequent value.
# Assign the result back explicitly: calling fillna(..., inplace=True) on
# `td.Embarked` mutates an intermediate Series, which triggers a chained-assignment
# warning in pandas 2.x and leaves `td` unchanged when Copy-on-Write is enabled.
td['Embarked'] = td['Embarked'].fillna(td['Embarked'].mode()[0])
# Missing cabins get the explicit placeholder string 'NA'.
td['Cabin'] = td['Cabin'].fillna('NA')

In [None]:
# Impute missing ages with the median age of the combined frame.
age_median = td['Age'].median()
td['Age'] = td['Age'].fillna(age_median)

6. Кодирование категориальных переменных

In [None]:
# Replace the values of the 'Sex' column with the integer codes produced by LabelEncoder.
sex_encoder = LabelEncoder()
td['Sex'] = sex_encoder.fit_transform(td['Sex'])

In [None]:
# Preview the one-hot encoding of 'Embarked'. drop_first=True omits the column
# for the first category, so k categories yield k-1 indicator columns.
pd.get_dummies(td['Embarked'], prefix="Emb", drop_first=True)

In [None]:
# Append the 'Embarked' indicator columns to the right of the existing columns.
embarked_dummies = pd.get_dummies(td['Embarked'], prefix="Emb", drop_first=True)
td = pd.concat([td, embarked_dummies], axis=1)
print(td.shape)
td.head()

7. Удаляем лишние колонки

In [None]:
# Columns that are not passed to the model: raw inputs of the engineered
# features, free-text fields, and the already-encoded 'Embarked'.
unused_columns = [
    'Pclass', 'Fare', 'Cabin', 'Fare_Category', 'Name',
    'Ticket', 'Embarked', 'SibSp', 'Parch', 'Age',
]
td = td.drop(columns=unused_columns)
print(td.shape)
td.head()

8. Формируем датасеты для train \ test \ submit

In [None]:
# Dataset on which we make predictions: rows whose 'Survived' value is missing,
# with the label column itself removed.
unlabeled_rows = td['Survived'].isnull()
X_to_be_predicted = td[unlabeled_rows].drop(columns=['Survived'])

In [None]:
# Training data: every row of td that has no missing value in any column.
train_data = td.dropna()
# Split into the target column and the feature columns.
label_train = train_data['Survived']
feature_train = train_data.drop(columns=['Survived'])

In [None]:
# Hold out 20% of the labelled rows for evaluation. A fixed random_state makes
# the shuffle deterministic, so the split (and every score computed from it)
# is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    feature_train, label_train, test_size=0.2, random_state=42
)

9. Масштабирование признаков

In [None]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted transform to the training, hold-out and prediction feature sets.
normalizer = MinMaxScaler().fit(x_train)

x_train_transformed, x_test_transformed, X_to_be_predicted_transformed = (
    normalizer.transform(features)
    for features in (x_train, x_test, X_to_be_predicted)
)

10. Строим модель

In [None]:
# Logistic Regression: fit on the scaled training split and report accuracy
# on the scaled hold-out split.
clf = LogisticRegression()
clf.fit(x_train_transformed, y_train)
# Format with an f-string instead of repr(): repr() of a NumPy float scalar may
# render as "np.float64(...)" on NumPy >= 2, which would garble the message.
holdout_accuracy = clf.score(x_test_transformed, y_test)
print(f"Accuracy: {holdout_accuracy * 100:.2f}%")

# 10-fold cross-validated accuracy on the training split.
cvs = cross_val_score(clf, x_train_transformed, y_train, cv=10, scoring='accuracy')
# The label now names the estimator actually used (it previously said "Random forest").
print('The cross validated score for Logistic Regression is:', round(cvs.mean() * 100, 2))

# Out-of-fold predictions for the training split, shown as a confusion matrix.
y_pred = cross_val_predict(clf, x_train_transformed, y_train, cv=10)
sns.heatmap(confusion_matrix(y_train, y_pred), annot=True, fmt='3.0f', cmap="summer")
plt.title('Confusion_matrix', y=1.05, size=15)

11. Создание файла с прогнозами

In [None]:
# Predict for the unlabeled rows and assemble the submission table:
# one row per PassengerId with the predicted 'Survived' value cast to int.
result = clf.predict(X_to_be_predicted_transformed)
submission = pd.DataFrame(
    {'PassengerId': X_to_be_predicted.index, 'Survived': result}
).astype({'Survived': int})
print(submission.shape)
submission.head()

In [None]:
# Write the submission table to CSV without the DataFrame index column.
filename = 'Titanic Predictions.csv'
submission.to_csv(path_or_buf=filename, index=False)

#### SVM
Попробуем использовать SVM вместо логистической регрессии (logreg).

In [None]:
from sklearn.svm import SVC

In [None]:
# SVM: fit a support vector classifier on the scaled training split and report
# accuracy on the scaled hold-out split.
clf = SVC()
clf.fit(x_train_transformed, y_train)
# Format with an f-string instead of repr(): repr() of a NumPy float scalar may
# render as "np.float64(...)" on NumPy >= 2, which would garble the message.
holdout_accuracy = clf.score(x_test_transformed, y_test)
print(f"Accuracy: {holdout_accuracy * 100:.2f}%")

# 10-fold cross-validated accuracy on the training split.
cvs = cross_val_score(clf, x_train_transformed, y_train, cv=10, scoring='accuracy')
# The label now names the estimator actually used (it previously said "Random forest").
print('The cross validated score for SVM is:', round(cvs.mean() * 100, 2))

# Out-of-fold predictions for the training split, shown as a confusion matrix.
y_pred = cross_val_predict(clf, x_train_transformed, y_train, cv=10)
sns.heatmap(confusion_matrix(y_train, y_pred), annot=True, fmt='3.0f', cmap="summer")
plt.title('Confusion_matrix', y=1.05, size=15)

In [None]:
# Predict for the unlabeled rows with the current classifier and assemble the
# submission table, casting the predicted 'Survived' values to int.
result = clf.predict(X_to_be_predicted_transformed)
submission = pd.DataFrame(
    {'PassengerId': X_to_be_predicted.index, 'Survived': result}
).astype({'Survived': int})
print(submission.shape)
submission.head()

In [None]:
# Write the SVM submission table to CSV without the DataFrame index column.
filename = 'Titanic Predictions SVM.csv'
submission.to_csv(path_or_buf=filename, index=False)